All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/10] drm/i915: add interface to simulate gpu hangs
@ 2012-04-27 13:17 Daniel Vetter
  2012-04-27 13:17 ` [PATCH 02/10] drm/i915: rework dev->first_error locking Daniel Vetter
                   ` (10 more replies)
  0 siblings, 11 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

gpu reset is a very important piece of our infrastructure.
Unfortunately we only really it test by actually hanging the gpu,
which often has bad side-effects for the entire system. And the gpu
hang handling code is one of the rather complicated pieces of code we
have, consisting of
- hang detection
- error capture
- actual gpu reset
- reset of all the gem bookkeeping
- reinitialition of the entire gpu

This patch adds a debugfs to selectively stopping rings by ceasing to
update the hw tail pointer, which will result in the gpu no longer
updating it's head pointer and eventually to the hangcheck firing.
This way we can exercise the gpu hang code under controlled conditions
without a dying gpu taking down the entire systems.

Patch motivated by me forgetting to properly reinitialize ppgtt after
a gpu reset.

Usage:

echo $((1 << $ringnum)) > i915_ring_stop # stops one ring

echo 0xffffffff > i915_ring_stop # stops all, future-proof version

then run whatever testload is desired. i915_ring_stop automatically
resets after a gpu hang is detected to avoid hanging the gpu to fast
and declaring it wedged.

v2: Incorporate feedback from Chris Wilson.

v3: Add the missing cleanup.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   65 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h         |    2 +
 drivers/gpu/drm/i915/intel_ringbuffer.c |    4 ++
 3 files changed, 71 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 120db46..3f6e28e 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1585,6 +1585,64 @@ static const struct file_operations i915_wedged_fops = {
 };
 
 static ssize_t
+i915_ring_stop_read(struct file *filp,
+		    char __user *ubuf,
+		    size_t max,
+		    loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	char buf[80];
+	int len;
+
+	len = snprintf(buf, sizeof(buf),
+		       "0x%08x\n", dev_priv->stop_rings);
+
+	if (len > sizeof(buf))
+		len = sizeof(buf);
+
+	return simple_read_from_buffer(ubuf, max, ppos, buf, len);
+}
+
+static ssize_t
+i915_ring_stop_write(struct file *filp,
+		     const char __user *ubuf,
+		     size_t cnt,
+		     loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	char buf[20];
+	int val = 0;
+
+	if (cnt > 0) {
+		if (cnt > sizeof(buf) - 1)
+			return -EINVAL;
+
+		if (copy_from_user(buf, ubuf, cnt))
+			return -EFAULT;
+		buf[cnt] = 0;
+
+		val = simple_strtoul(buf, NULL, 0);
+	}
+
+	DRM_DEBUG_DRIVER("Stopping rings 0x%08x\n", val);
+
+	mutex_lock(&dev->struct_mutex);
+	dev_priv->stop_rings = val;
+	mutex_unlock(&dev->struct_mutex);
+
+	return cnt;
+}
+
+static const struct file_operations i915_ring_stop_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = i915_ring_stop_read,
+	.write = i915_ring_stop_write,
+	.llseek = default_llseek,
+};
+static ssize_t
 i915_max_freq_read(struct file *filp,
 		   char __user *ubuf,
 		   size_t max,
@@ -1884,6 +1942,11 @@ int i915_debugfs_init(struct drm_minor *minor)
 				  &i915_cache_sharing_fops);
 	if (ret)
 		return ret;
+	ret = i915_debugfs_create(minor->debugfs_root, minor,
+				  "i915_ring_stop",
+				  &i915_ring_stop_fops);
+	if (ret)
+		return ret;
 
 	return drm_debugfs_create_files(i915_debugfs_list,
 					I915_DEBUGFS_ENTRIES,
@@ -1902,6 +1965,8 @@ void i915_debugfs_cleanup(struct drm_minor *minor)
 				 1, minor);
 	drm_debugfs_remove_files((struct drm_info_list *) &i915_cache_sharing_fops,
 				 1, minor);
+	drm_debugfs_remove_files((struct drm_info_list *) &i915_ring_stop_fops,
+				 1, minor);
 }
 
 #endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0095c8d..5e62b35 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -413,6 +413,8 @@ typedef struct drm_i915_private {
 	uint32_t last_instdone;
 	uint32_t last_instdone1;
 
+	unsigned int stop_rings;
+
 	unsigned long cfb_size;
 	unsigned int cfb_fb;
 	enum plane cfb_plane;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 427b7c5..d2de16d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1221,7 +1221,11 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
 
 void intel_ring_advance(struct intel_ring_buffer *ring)
 {
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
 	ring->tail &= ring->size - 1;
+	if (dev_priv->stop_rings & intel_ring_flag(ring))
+		return;
 	ring->write_tail(ring, ring->tail);
 }
 
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 02/10] drm/i915: rework dev->first_error locking
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-05-04 17:04   ` Eugeni Dodonov
  2012-04-27 13:17 ` [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed Daniel Vetter
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

- reduce the irq disabled section, even for a debugfs file this was
  way too long.
- always disable irqs when taking the lock.

v2: Thou shalt not mistake locking for reference counting, so:
- reference count the error_state to protect from concurent freeeing.
  This will be only really used in the next patch.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   13 ++++++++-----
 drivers/gpu/drm/i915/i915_drv.h     |    4 ++++
 drivers/gpu/drm/i915/i915_irq.c     |   12 +++++++-----
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3f6e28e..b46c198 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -701,12 +701,16 @@ static int i915_error_state(struct seq_file *m, void *unused)
 	int i, j, page, offset, elt;
 
 	spin_lock_irqsave(&dev_priv->error_lock, flags);
-	if (!dev_priv->first_error) {
+	error = dev_priv->first_error;
+	if (error)
+		kref_get(&error->ref);
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+	if (!error) {
 		seq_printf(m, "no error state collected\n");
-		goto out;
+		return 0;
 	}
 
-	error = dev_priv->first_error;
 
 	seq_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
 		   error->time.tv_usec);
@@ -788,8 +792,7 @@ static int i915_error_state(struct seq_file *m, void *unused)
 	if (error->display)
 		intel_display_print_error_state(m, dev, error->display);
 
-out:
-	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+	kref_put(&error->ref, i915_error_state_free);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5e62b35..2aaa594 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -39,6 +39,7 @@
 #include <drm/intel-gtt.h>
 #include <linux/backlight.h>
 #include <linux/intel-iommu.h>
+#include <linux/kref.h>
 
 /* General customization:
  */
@@ -171,6 +172,7 @@ struct sdvo_device_mapping {
 struct intel_display_error_state;
 
 struct drm_i915_error_state {
+	struct kref ref;
 	u32 eir;
 	u32 pgtbl_er;
 	u32 pipestat[I915_MAX_PIPES];
@@ -466,6 +468,7 @@ typedef struct drm_i915_private {
 	unsigned int fsb_freq, mem_freq, is_ddr3;
 
 	spinlock_t error_lock;
+	/* Protected by dev->error_lock. */
 	struct drm_i915_error_state *first_error;
 	struct work_struct error_work;
 	struct completion error_completion;
@@ -1163,6 +1166,7 @@ extern int i915_vblank_pipe_get(struct drm_device *dev, void *data,
 				struct drm_file *file_priv);
 extern int i915_vblank_swap(struct drm_device *dev, void *data,
 			    struct drm_file *file_priv);
+void i915_error_state_free(struct kref *error_ref);
 
 void
 i915_enable_pipestat(drm_i915_private_t *dev_priv, int pipe, u32 mask);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 5c360ee..b6c02eb 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -869,10 +869,11 @@ i915_error_object_free(struct drm_i915_error_object *obj)
 	kfree(obj);
 }
 
-static void
-i915_error_state_free(struct drm_device *dev,
-		      struct drm_i915_error_state *error)
+void
+i915_error_state_free(struct kref *error_ref)
 {
+	struct drm_i915_error_state *error = container_of(error_ref,
+							  typeof(*error), ref);
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
@@ -1120,6 +1121,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 	DRM_INFO("capturing error event; look for more information in /debug/dri/%d/i915_error_state\n",
 		 dev->primary->index);
 
+	kref_init(&error->ref);
 	error->eir = I915_READ(EIR);
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 	for_each_pipe(pipe)
@@ -1181,7 +1183,7 @@ static void i915_capture_error_state(struct drm_device *dev)
 	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
 
 	if (error)
-		i915_error_state_free(dev, error);
+		i915_error_state_free(&error->ref);
 }
 
 void i915_destroy_error_state(struct drm_device *dev)
@@ -1196,7 +1198,7 @@ void i915_destroy_error_state(struct drm_device *dev)
 	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
 
 	if (error)
-		i915_error_state_free(dev, error);
+		kref_put(&error->ref, i915_error_state_free);
 }
 #else
 #define i915_capture_error_state(x)
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
  2012-04-27 13:17 ` [PATCH 02/10] drm/i915: rework dev->first_error locking Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-05-04 11:56   ` Daniel Vetter
  2012-04-27 13:17 ` [PATCH 04/10] drm/i915: simplify i915_reset a bit Daniel Vetter
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

... by writing (anything) to i915_error_state.

This way we can simulate a bunch of gpu hangs and run the error_state
capture code every time (without the need to reload the module).

To make that happen we need to abandon the simple seq_file wrappers
provided by the drm core. While at it, but the new error_state
refcounting to some good use and associated. This should help greatly
when we finally get around to split up the giant single seq_file block
that the error_state file currently is into smaller parts.

v2: Actually squash all the fixes into the patch ...

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   90 ++++++++++++++++++++++++++++++-----
 1 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index b46c198..4805df0 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -691,21 +691,19 @@ static void i915_ring_error_state(struct seq_file *m,
 	seq_printf(m, "  ring->tail: 0x%08x\n", error->cpu_ring_tail[ring]);
 }
 
+struct i915_error_state_file_priv {
+	struct drm_device *dev;
+	struct drm_i915_error_state *error;
+};
+
 static int i915_error_state(struct seq_file *m, void *unused)
 {
-	struct drm_info_node *node = (struct drm_info_node *) m->private;
-	struct drm_device *dev = node->minor->dev;
+	struct i915_error_state_file_priv *error_priv = m->private;
+	struct drm_device *dev = error_priv->dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	struct drm_i915_error_state *error;
-	unsigned long flags;
+	struct drm_i915_error_state *error = error_priv->error;
 	int i, j, page, offset, elt;
 
-	spin_lock_irqsave(&dev_priv->error_lock, flags);
-	error = dev_priv->first_error;
-	if (error)
-		kref_get(&error->ref);
-	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
-
 	if (!error) {
 		seq_printf(m, "no error state collected\n");
 		return 0;
@@ -792,11 +790,71 @@ static int i915_error_state(struct seq_file *m, void *unused)
 	if (error->display)
 		intel_display_print_error_state(m, dev, error->display);
 
-	kref_put(&error->ref, i915_error_state_free);
-
 	return 0;
 }
 
+static ssize_t
+i915_error_state_write(struct file *filp,
+		       const char __user *ubuf,
+		       size_t cnt,
+		       loff_t *ppos)
+{
+	struct seq_file *m = filp->private_data;
+	struct i915_error_state_file_priv *error_priv = m->private;
+	struct drm_device *dev = error_priv->dev;
+
+	DRM_DEBUG_DRIVER("Resetting error state\n");
+
+	mutex_lock(&dev->struct_mutex);
+	i915_destroy_error_state(dev);
+	mutex_unlock(&dev->struct_mutex);
+
+	return cnt;
+}
+
+static int i915_error_state_open(struct inode *inode, struct file *file)
+{
+	struct drm_device *dev = inode->i_private;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct i915_error_state_file_priv *error_priv;
+	unsigned long flags;
+
+	error_priv = kzalloc(sizeof(*error_priv), GFP_KERNEL);
+	if (!error_priv)
+		return -ENOMEM;
+
+	error_priv->dev = dev;
+
+	spin_lock_irqsave(&dev_priv->error_lock, flags);
+	error_priv->error = dev_priv->first_error;
+	if (error_priv->error)
+		kref_get(&error_priv->error->ref);
+	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
+
+	return single_open(file, i915_error_state, error_priv);
+}
+
+static int i915_error_state_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct i915_error_state_file_priv *error_priv = m->private;
+
+	if (error_priv->error)
+		kref_put(&error_priv->error->ref, i915_error_state_free);
+	kfree(error_priv);
+
+	return single_release(inode, file);
+}
+
+static const struct file_operations i915_error_state_fops = {
+	.owner = THIS_MODULE,
+	.open = i915_error_state_open,
+	.read = seq_read,
+	.write = i915_error_state_write,
+	.llseek = default_llseek,
+	.release = i915_error_state_release,
+};
+
 static int i915_rstdby_delays(struct seq_file *m, void *unused)
 {
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
@@ -1645,6 +1703,7 @@ static const struct file_operations i915_ring_stop_fops = {
 	.write = i915_ring_stop_write,
 	.llseek = default_llseek,
 };
+
 static ssize_t
 i915_max_freq_read(struct file *filp,
 		   char __user *ubuf,
@@ -1899,7 +1958,6 @@ static struct drm_info_list i915_debugfs_list[] = {
 	{"i915_gem_hws", i915_hws_info, 0, (void *)RCS},
 	{"i915_gem_hws_blt", i915_hws_info, 0, (void *)BCS},
 	{"i915_gem_hws_bsd", i915_hws_info, 0, (void *)VCS},
-	{"i915_error_state", i915_error_state, 0},
 	{"i915_rstdby_delays", i915_rstdby_delays, 0},
 	{"i915_cur_delayinfo", i915_cur_delayinfo, 0},
 	{"i915_delayfreq_table", i915_delayfreq_table, 0},
@@ -1951,6 +2009,12 @@ int i915_debugfs_init(struct drm_minor *minor)
 	if (ret)
 		return ret;
 
+	ret = i915_debugfs_create(minor->debugfs_root, minor,
+				  "i915_error_state",
+				  &i915_error_state_fops);
+	if (ret)
+		return ret;
+
 	return drm_debugfs_create_files(i915_debugfs_list,
 					I915_DEBUGFS_ENTRIES,
 					minor->debugfs_root, minor);
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 04/10] drm/i915: simplify i915_reset a bit
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
  2012-04-27 13:17 ` [PATCH 02/10] drm/i915: rework dev->first_error locking Daniel Vetter
  2012-04-27 13:17 ` [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-05-04 16:47   ` Eugeni Dodonov
  2012-04-27 13:17 ` [PATCH 05/10] drm/i915: extract intel_gpu_reset Daniel Vetter
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

- need_display is always true, scrap it.
- don't reacquire the mutex to do nothing after having restored the
  gem state.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   19 +++++--------------
 1 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 8a98f9a..f5450bb 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -787,11 +787,6 @@ static int gen6_do_reset(struct drm_device *dev, u8 flags)
 int i915_reset(struct drm_device *dev, u8 flags)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
-	/*
-	 * We really should only reset the display subsystem if we actually
-	 * need to
-	 */
-	bool need_display = true;
 	int ret;
 
 	if (!i915_try_reset)
@@ -863,22 +858,18 @@ int i915_reset(struct drm_device *dev, u8 flags)
 		drm_irq_uninstall(dev);
 		drm_mode_config_reset(dev);
 		drm_irq_install(dev);
-
-		mutex_lock(&dev->struct_mutex);
+	} else {
+		mutex_unlock(&dev->struct_mutex);
 	}
 
-	mutex_unlock(&dev->struct_mutex);
-
 	/*
 	 * Perform a full modeset as on later generations, e.g. Ironlake, we may
 	 * need to retrain the display link and cannot just restore the register
 	 * values.
 	 */
-	if (need_display) {
-		mutex_lock(&dev->mode_config.mutex);
-		drm_helper_resume_force_mode(dev);
-		mutex_unlock(&dev->mode_config.mutex);
-	}
+	mutex_lock(&dev->mode_config.mutex);
+	drm_helper_resume_force_mode(dev);
+	mutex_unlock(&dev->mode_config.mutex);
 
 	return 0;
 }
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 05/10] drm/i915: extract intel_gpu_reset
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (2 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 04/10] drm/i915: simplify i915_reset a bit Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-04-30  1:03   ` Ben Widawsky
  2012-05-04 16:51   ` Eugeni Dodonov
  2012-04-27 13:17 ` [PATCH 06/10] drm/i915: make gpu hangman more resilient Daniel Vetter
                   ` (6 subsequent siblings)
  10 siblings, 2 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter, Ben Widawsky

Slightly cleans up the code and could be useful for e.g. Ben
Widawsky's hw context patches.

v2: New colours!

Cc: Ben Widawsky <ben@bwidawsk.net>
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   43 ++++++++++++++++++++++++--------------
 1 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f5450bb..c4251a1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -768,6 +768,29 @@ static int gen6_do_reset(struct drm_device *dev, u8 flags)
 	return ret;
 }
 
+static int intel_gpu_reset(struct drm_device *dev, u8 flags)
+{
+	int ret = -ENODEV;
+
+	switch (INTEL_INFO(dev)->gen) {
+	case 7:
+	case 6:
+		ret = gen6_do_reset(dev, flags);
+		break;
+	case 5:
+		ret = ironlake_do_reset(dev, flags);
+		break;
+	case 4:
+		ret = i965_do_reset(dev, flags);
+		break;
+	case 2:
+		ret = i8xx_do_reset(dev, flags);
+		break;
+	}
+
+	return ret;
+}
+
 /**
  * i915_reset - reset chip after a hang
  * @dev: drm device to reset
@@ -798,23 +821,11 @@ int i915_reset(struct drm_device *dev, u8 flags)
 	i915_gem_reset(dev);
 
 	ret = -ENODEV;
-	if (get_seconds() - dev_priv->last_gpu_reset < 5) {
+	if (get_seconds() - dev_priv->last_gpu_reset < 5)
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
-	} else switch (INTEL_INFO(dev)->gen) {
-	case 7:
-	case 6:
-		ret = gen6_do_reset(dev, flags);
-		break;
-	case 5:
-		ret = ironlake_do_reset(dev, flags);
-		break;
-	case 4:
-		ret = i965_do_reset(dev, flags);
-		break;
-	case 2:
-		ret = i8xx_do_reset(dev, flags);
-		break;
-	}
+	else
+		ret = intel_gpu_reset(dev, flags);
+
 	dev_priv->last_gpu_reset = get_seconds();
 	if (ret) {
 		DRM_ERROR("Failed to reset chip.\n");
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 06/10] drm/i915: make gpu hangman more resilient
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (3 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 05/10] drm/i915: extract intel_gpu_reset Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-05-04 16:47   ` Eugeni Dodonov
  2012-04-27 13:17 ` [PATCH 07/10] drm/i915: kill flags parameter for reset functions Daniel Vetter
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

- reset the stop_rings infrastructure while resetting the hw to
  avoid angering the hangcheck right away (and potentially declaring
  the gpu permanently wedged).

- ignore reset failures when hanging due to the hangman - we don't
  have reset code for all generations.

v2: Ensure that we only ignore reset failures when the hw reset is not
implemented and not when it failed.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   12 ++++++++++++
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c4251a1..ab70a1a 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -770,6 +770,7 @@ static int gen6_do_reset(struct drm_device *dev, u8 flags)
 
 static int intel_gpu_reset(struct drm_device *dev, u8 flags)
 {
+	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret = -ENODEV;
 
 	switch (INTEL_INFO(dev)->gen) {
@@ -788,6 +789,17 @@ static int intel_gpu_reset(struct drm_device *dev, u8 flags)
 		break;
 	}
 
+	/* Also reset the gpu hangman. */
+	if (dev_priv->stop_rings) {
+		DRM_DEBUG("Simulated gpu hang, resetting stop_rings\n");
+		dev_priv->stop_rings = 0;
+		if (ret == -ENODEV) {
+			DRM_ERROR("Reset not implemented, but ignoring "
+				  "error for simulated gpu hangs\n");
+			ret = 0;
+		}
+	}
+
 	return ret;
 }
 
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 07/10] drm/i915: kill flags parameter for reset functions
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (4 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 06/10] drm/i915: make gpu hangman more resilient Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-05-04 16:47   ` Eugeni Dodonov
  2012-04-27 13:17 ` [PATCH 08/10] drm/i915: also reset the media engine on gen4/5 Daniel Vetter
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

Only half of them even cared, and it's always the same one.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   29 +++++++++++++++--------------
 drivers/gpu/drm/i915/i915_drv.h |    2 +-
 drivers/gpu/drm/i915/i915_irq.c |    2 +-
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ab70a1a..2ce4c0e 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -673,7 +673,7 @@ int i915_resume(struct drm_device *dev)
 	return 0;
 }
 
-static int i8xx_do_reset(struct drm_device *dev, u8 flags)
+static int i8xx_do_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
@@ -710,7 +710,7 @@ static int i965_reset_complete(struct drm_device *dev)
 	return gdrst & 0x1;
 }
 
-static int i965_do_reset(struct drm_device *dev, u8 flags)
+static int i965_do_reset(struct drm_device *dev)
 {
 	u8 gdrst;
 
@@ -720,20 +720,22 @@ static int i965_do_reset(struct drm_device *dev, u8 flags)
 	 * triggers the reset; when done, the hardware will clear it.
 	 */
 	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
-	pci_write_config_byte(dev->pdev, I965_GDRST, gdrst | flags | 0x1);
+	pci_write_config_byte(dev->pdev, I965_GDRST,
+			      gdrst | GRDOM_RENDER | 0x1);
 
 	return wait_for(i965_reset_complete(dev), 500);
 }
 
-static int ironlake_do_reset(struct drm_device *dev, u8 flags)
+static int ironlake_do_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	u32 gdrst = I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR);
-	I915_WRITE(MCHBAR_MIRROR_BASE + ILK_GDSR, gdrst | flags | 0x1);
+	I915_WRITE(MCHBAR_MIRROR_BASE + ILK_GDSR,
+		   gdrst | GRDOM_RENDER | 0x1);
 	return wait_for(I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR) & 0x1, 500);
 }
 
-static int gen6_do_reset(struct drm_device *dev, u8 flags)
+static int gen6_do_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int	ret;
@@ -768,7 +770,7 @@ static int gen6_do_reset(struct drm_device *dev, u8 flags)
 	return ret;
 }
 
-static int intel_gpu_reset(struct drm_device *dev, u8 flags)
+static int intel_gpu_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	int ret = -ENODEV;
@@ -776,16 +778,16 @@ static int intel_gpu_reset(struct drm_device *dev, u8 flags)
 	switch (INTEL_INFO(dev)->gen) {
 	case 7:
 	case 6:
-		ret = gen6_do_reset(dev, flags);
+		ret = gen6_do_reset(dev);
 		break;
 	case 5:
-		ret = ironlake_do_reset(dev, flags);
+		ret = ironlake_do_reset(dev);
 		break;
 	case 4:
-		ret = i965_do_reset(dev, flags);
+		ret = i965_do_reset(dev);
 		break;
 	case 2:
-		ret = i8xx_do_reset(dev, flags);
+		ret = i8xx_do_reset(dev);
 		break;
 	}
 
@@ -806,7 +808,6 @@ static int intel_gpu_reset(struct drm_device *dev, u8 flags)
 /**
  * i915_reset - reset chip after a hang
  * @dev: drm device to reset
- * @flags: reset domains
  *
  * Reset the chip.  Useful if a hang is detected. Returns zero on successful
  * reset or otherwise an error code.
@@ -819,7 +820,7 @@ static int intel_gpu_reset(struct drm_device *dev, u8 flags)
  *   - re-init interrupt state
  *   - re-init display
  */
-int i915_reset(struct drm_device *dev, u8 flags)
+int i915_reset(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	int ret;
@@ -836,7 +837,7 @@ int i915_reset(struct drm_device *dev, u8 flags)
 	if (get_seconds() - dev_priv->last_gpu_reset < 5)
 		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
 	else
-		ret = intel_gpu_reset(dev, flags);
+		ret = intel_gpu_reset(dev);
 
 	dev_priv->last_gpu_reset = get_seconds();
 	if (ret) {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2aaa594..fd61500 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1143,7 +1143,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 extern int i915_emit_box(struct drm_device *dev,
 			 struct drm_clip_rect *box,
 			 int DR1, int DR4);
-extern int i915_reset(struct drm_device *dev, u8 flags);
+extern int i915_reset(struct drm_device *dev);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b6c02eb..d7e68da 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -777,7 +777,7 @@ static void i915_error_work_func(struct work_struct *work)
 	if (atomic_read(&dev_priv->mm.wedged)) {
 		DRM_DEBUG_DRIVER("resetting chip\n");
 		kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
-		if (!i915_reset(dev, GRDOM_RENDER)) {
+		if (!i915_reset(dev)) {
 			atomic_set(&dev_priv->mm.wedged, 0);
 			kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
 		}
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 08/10] drm/i915: also reset the media engine on gen4/5
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (5 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 07/10] drm/i915: kill flags parameter for reset functions Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-04-27 13:17 ` [PATCH 09/10] drm/i915: remove modeset reset from i915_reset Daniel Vetter
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

... we actually use it.

Unfortunately we can't reset both at the same time without also
resetting the display unit, so do render and media separately.

Also replace magic constants with proper #defines.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   28 +++++++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h |    1 +
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 2ce4c0e..0c97b9b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -712,6 +712,7 @@ static int i965_reset_complete(struct drm_device *dev)
 
 static int i965_do_reset(struct drm_device *dev)
 {
+	int ret;
 	u8 gdrst;
 
 	/*
@@ -721,7 +722,17 @@ static int i965_do_reset(struct drm_device *dev)
 	 */
 	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
 	pci_write_config_byte(dev->pdev, I965_GDRST,
-			      gdrst | GRDOM_RENDER | 0x1);
+			      gdrst | GRDOM_RENDER |
+			      GRDOM_RESET_ENABLE);
+	ret =  wait_for(i965_reset_complete(dev), 500);
+	if (ret)
+		return ret;
+
+	/* We can't reset render&media without also resetting display ... */
+	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
+	pci_write_config_byte(dev->pdev, I965_GDRST,
+			      gdrst | GRDOM_MEDIA |
+			      GRDOM_RESET_ENABLE);
 
 	return wait_for(i965_reset_complete(dev), 500);
 }
@@ -729,9 +740,20 @@ static int i965_do_reset(struct drm_device *dev)
 static int ironlake_do_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 gdrst = I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR);
+	u32 gdrst;
+	int ret;
+
+	gdrst = I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR);
+	I915_WRITE(MCHBAR_MIRROR_BASE + ILK_GDSR,
+		   gdrst | GRDOM_RENDER | GRDOM_RESET_ENABLE);
+	ret = wait_for(I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR) & 0x1, 500);
+	if (ret)
+		return ret;
+
+	/* We can't reset render&media without also resetting display ... */
+	gdrst = I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR);
 	I915_WRITE(MCHBAR_MIRROR_BASE + ILK_GDSR,
-		   gdrst | GRDOM_RENDER | 0x1);
+		   gdrst | GRDOM_MEDIA | GRDOM_RESET_ENABLE);
 	return wait_for(I915_READ(MCHBAR_MIRROR_BASE + ILK_GDSR) & 0x1, 500);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index f1f4d8f..28aca90 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -82,6 +82,7 @@
 #define  GRDOM_FULL	(0<<2)
 #define  GRDOM_RENDER	(1<<2)
 #define  GRDOM_MEDIA	(3<<2)
+#define  GRDOM_RESET_ENABLE (1<<0)
 
 #define GEN6_MBCUNIT_SNPCR	0x900c /* for LLC config */
 #define   GEN6_MBC_SNPCR_SHIFT	21
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 09/10] drm/i915: remove modeset reset from i915_reset
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (6 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 08/10] drm/i915: also reset the media engine on gen4/5 Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-04-27 13:17 ` [PATCH 10/10] drm/i915: kill gen4 gpu reset code Daniel Vetter
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On gen4+ we don't reset the display unit, so resetting the complete
modeset state should not be necessary.

We can't do reset on gen3 anyway, which leaves us with gen2 reset:
According to Chris Wilson, that doesn't work so great, so he suggested
we just ignore that. If the need ever arrises, we can re-add it later
on.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   10 ----------
 1 files changed, 0 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 0c97b9b..7a09c28 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -902,21 +902,11 @@ int i915_reset(struct drm_device *dev)
 			intel_modeset_init_hw(dev);
 
 		drm_irq_uninstall(dev);
-		drm_mode_config_reset(dev);
 		drm_irq_install(dev);
 	} else {
 		mutex_unlock(&dev->struct_mutex);
 	}
 
-	/*
-	 * Perform a full modeset as on later generations, e.g. Ironlake, we may
-	 * need to retrain the display link and cannot just restore the register
-	 * values.
-	 */
-	mutex_lock(&dev->mode_config.mutex);
-	drm_helper_resume_force_mode(dev);
-	mutex_unlock(&dev->mode_config.mutex);
-
 	return 0;
 }
 
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 10/10] drm/i915: kill gen4 gpu reset code
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (7 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 09/10] drm/i915: remove modeset reset from i915_reset Daniel Vetter
@ 2012-04-27 13:17 ` Daniel Vetter
  2012-04-27 18:49   ` Eric Anholt
  2012-04-28  4:56 ` [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Ben Widawsky
  2012-05-03 12:48 ` [PATCH] " Daniel Vetter
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 13:17 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

It's busted. Compared to ilk, snb and ivb where I've run the i-g-t
hangman test in a loop for a few hours, my gm45 never managed to reset
the gpu. And Chris Wilson confirmed on irc that he's never seen a case
where the gen4 gpu reset code actually helped.

Given that it's not unlikely that this is causing harm, kill it.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |   37 -------------------------------------
 1 files changed, 0 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 7a09c28..0578c79 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -703,40 +703,6 @@ static int i8xx_do_reset(struct drm_device *dev)
 	return 0;
 }
 
-static int i965_reset_complete(struct drm_device *dev)
-{
-	u8 gdrst;
-	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
-	return gdrst & 0x1;
-}
-
-static int i965_do_reset(struct drm_device *dev)
-{
-	int ret;
-	u8 gdrst;
-
-	/*
-	 * Set the domains we want to reset (GRDOM/bits 2 and 3) as
-	 * well as the reset bit (GR/bit 0).  Setting the GR bit
-	 * triggers the reset; when done, the hardware will clear it.
-	 */
-	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
-	pci_write_config_byte(dev->pdev, I965_GDRST,
-			      gdrst | GRDOM_RENDER |
-			      GRDOM_RESET_ENABLE);
-	ret =  wait_for(i965_reset_complete(dev), 500);
-	if (ret)
-		return ret;
-
-	/* We can't reset render&media without also resetting display ... */
-	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
-	pci_write_config_byte(dev->pdev, I965_GDRST,
-			      gdrst | GRDOM_MEDIA |
-			      GRDOM_RESET_ENABLE);
-
-	return wait_for(i965_reset_complete(dev), 500);
-}
-
 static int ironlake_do_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -805,9 +771,6 @@ static int intel_gpu_reset(struct drm_device *dev)
 	case 5:
 		ret = ironlake_do_reset(dev);
 		break;
-	case 4:
-		ret = i965_do_reset(dev);
-		break;
 	case 2:
 		ret = i8xx_do_reset(dev);
 		break;
-- 
1.7.9

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH 10/10] drm/i915: kill gen4 gpu reset code
  2012-04-27 13:17 ` [PATCH 10/10] drm/i915: kill gen4 gpu reset code Daniel Vetter
@ 2012-04-27 18:49   ` Eric Anholt
  2012-04-27 19:17     ` Daniel Vetter
  0 siblings, 1 reply; 31+ messages in thread
From: Eric Anholt @ 2012-04-27 18:49 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter


[-- Attachment #1.1: Type: text/plain, Size: 630 bytes --]

On Fri, 27 Apr 2012 15:17:47 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> It's busted. Compared to ilk, snb and ivb where I've run the i-g-t
> hangman test in a loop for a few hours, my gm45 never managed to reset
> the gpu. And Chris Wilson confirmed on irc that he's never seen a case
> where the gen4 gpu reset code actually helped.

I remember originally when the hangcheck stuff landed, on a 965gm
running compiz, dragging a window with a bad shader around and seeing a
stutter as it reset the hardware successfully before crashing soon after
(then turning off the hanging plugin and continuing to use the system).

[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 10/10] drm/i915: kill gen4 gpu reset code
  2012-04-27 18:49   ` Eric Anholt
@ 2012-04-27 19:17     ` Daniel Vetter
  2012-04-27 23:26       ` Eric Anholt
  0 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-04-27 19:17 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Daniel Vetter, Intel Graphics Development

On Fri, Apr 27, 2012 at 11:49:16AM -0700, Eric Anholt wrote:
> On Fri, 27 Apr 2012 15:17:47 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > It's busted. Compared to ilk, snb and ivb where I've run the i-g-t
> > hangman test in a loop for a few hours, my gm45 never managed to reset
> > the gpu. And Chris Wilson confirmed on irc that he's never seen a case
> > where the gen4 gpu reset code actually helped.
> 
> I remember originally when the hangcheck stuff landed, on a 965gm
> running compiz, dragging a window with a bad shader around and seeing a
> stutter as it reset the hardware successfully before crashing soon after
> (then turning off the hanging plugin and continuing to use the system).

I'm confused - so hangcheck once indeed managed to reset a i965gm gpu or
never?
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 10/10] drm/i915: kill gen4 gpu reset code
  2012-04-27 19:17     ` Daniel Vetter
@ 2012-04-27 23:26       ` Eric Anholt
  2012-05-02 19:33         ` [PATCH] drm/i915: fix gen4 gpu reset Daniel Vetter
  0 siblings, 1 reply; 31+ messages in thread
From: Eric Anholt @ 2012-04-27 23:26 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, Intel Graphics Development


[-- Attachment #1.1: Type: text/plain, Size: 979 bytes --]

On Fri, 27 Apr 2012 21:17:02 +0200, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Fri, Apr 27, 2012 at 11:49:16AM -0700, Eric Anholt wrote:
> > On Fri, 27 Apr 2012 15:17:47 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > > It's busted. Compared to ilk, snb and ivb where I've run the i-g-t
> > > hangman test in a loop for a few hours, my gm45 never managed to reset
> > > the gpu. And Chris Wilson confirmed on irc that he's never seen a case
> > > where the gen4 gpu reset code actually helped.
> > 
> > I remember originally when the hangcheck stuff landed, on a 965gm
> > running compiz, dragging a window with a bad shader around and seeing a
> > stutter as it reset the hardware successfully before crashing soon after
> > (then turning off the hanging plugin and continuing to use the system).
> 
> I'm confused - so hangcheck once indeed managed to reset a i965gm gpu or
> never?

As far as I can remember, at one point it was reliably resetting.

[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 01/10] drm/i915: add interface to simulate gpu hangs
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (8 preceding siblings ...)
  2012-04-27 13:17 ` [PATCH 10/10] drm/i915: kill gen4 gpu reset code Daniel Vetter
@ 2012-04-28  4:56 ` Ben Widawsky
  2012-05-02 15:23   ` Daniel Vetter
  2012-05-03 12:48 ` [PATCH] " Daniel Vetter
  10 siblings, 1 reply; 31+ messages in thread
From: Ben Widawsky @ 2012-04-28  4:56 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On Fri, 27 Apr 2012 15:17:38 +0200
Daniel Vetter <daniel.vetter@ffwll.ch> wrote:

> gpu reset is a very important piece of our infrastructure.
> Unfortunately we only really it test by actually hanging the gpu,
> which often has bad side-effects for the entire system. And the gpu
> hang handling code is one of the rather complicated pieces of code we
> have, consisting of
> - hang detection
> - error capture
> - actual gpu reset
> - reset of all the gem bookkeeping
> - reinitialition of the entire gpu
> 
> This patch adds a debugfs to selectively stopping rings by ceasing to
> update the hw tail pointer, which will result in the gpu no longer
> updating it's head pointer and eventually to the hangcheck firing.
> This way we can exercise the gpu hang code under controlled conditions
> without a dying gpu taking down the entire systems.

You mean, tail pointer?

> 
> Patch motivated by me forgetting to properly reinitialize ppgtt after
> a gpu reset.
> 
> Usage:
> 
> echo $((1 << $ringnum)) > i915_ring_stop # stops one ring
> 
> echo 0xffffffff > i915_ring_stop # stops all, future-proof version
> 
> then run whatever testload is desired. i915_ring_stop automatically
> resets after a gpu hang is detected to avoid hanging the gpu to fast
> and declaring it wedged.
> 
> v2: Incorporate feedback from Chris Wilson.
> 
> v3: Add the missing cleanup.
> 
> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     |   65
> +++++++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_drv.h         |    2 +
> drivers/gpu/drm/i915/intel_ringbuffer.c |    4 ++ 3 files changed, 71
> insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
> b/drivers/gpu/drm/i915/i915_debugfs.c index 120db46..3f6e28e 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1585,6 +1585,64 @@ static const struct file_operations
> i915_wedged_fops = { };
>  
>  static ssize_t
> +i915_ring_stop_read(struct file *filp,
> +		    char __user *ubuf,
> +		    size_t max,
> +		    loff_t *ppos)
> +{
> +	struct drm_device *dev = filp->private_data;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +	char buf[80];
> +	int len;
> +
> +	len = snprintf(buf, sizeof(buf),
> +		       "0x%08x\n", dev_priv->stop_rings);
> +
> +	if (len > sizeof(buf))
> +		len = sizeof(buf);
> +
> +	return simple_read_from_buffer(ubuf, max, ppos, buf, len);
> +}
> +
> +static ssize_t
> +i915_ring_stop_write(struct file *filp,
> +		     const char __user *ubuf,
> +		     size_t cnt,
> +		     loff_t *ppos)
> +{
> +	struct drm_device *dev = filp->private_data;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	char buf[20];
> +	int val = 0;
> +
> +	if (cnt > 0) {
> +		if (cnt > sizeof(buf) - 1)
> +			return -EINVAL;
> +
> +		if (copy_from_user(buf, ubuf, cnt))
> +			return -EFAULT;
> +		buf[cnt] = 0;
> +
> +		val = simple_strtoul(buf, NULL, 0);
> +	}
> +
> +	DRM_DEBUG_DRIVER("Stopping rings 0x%08x\n", val);
> +
> +	mutex_lock(&dev->struct_mutex);
> +	dev_priv->stop_rings = val;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return cnt;
> +}

Not a huge deal, but I'd go with an interruptible mutex lock there.

Also, I think it would be cool if you did a tail update when going from
!0->0 stop_rings.

> +
> +static const struct file_operations i915_ring_stop_fops = {
> +	.owner = THIS_MODULE,
> +	.open = simple_open,
> +	.read = i915_ring_stop_read,
> +	.write = i915_ring_stop_write,
> +	.llseek = default_llseek,
> +};
> +static ssize_t
>  i915_max_freq_read(struct file *filp,
>  		   char __user *ubuf,
>  		   size_t max,
> @@ -1884,6 +1942,11 @@ int i915_debugfs_init(struct drm_minor *minor)
>  				  &i915_cache_sharing_fops);
>  	if (ret)
>  		return ret;
> +	ret = i915_debugfs_create(minor->debugfs_root, minor,
> +				  "i915_ring_stop",
> +				  &i915_ring_stop_fops);
> +	if (ret)
> +		return ret;
>  
>  	return drm_debugfs_create_files(i915_debugfs_list,
>  					I915_DEBUGFS_ENTRIES,
> @@ -1902,6 +1965,8 @@ void i915_debugfs_cleanup(struct drm_minor
> *minor) 1, minor);
>  	drm_debugfs_remove_files((struct drm_info_list *)
> &i915_cache_sharing_fops, 1, minor);
> +	drm_debugfs_remove_files((struct drm_info_list *)
> &i915_ring_stop_fops,
> +				 1, minor);
>  }
>  
>  #endif /* CONFIG_DEBUG_FS */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h
> b/drivers/gpu/drm/i915/i915_drv.h index 0095c8d..5e62b35 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -413,6 +413,8 @@ typedef struct drm_i915_private {
>  	uint32_t last_instdone;
>  	uint32_t last_instdone1;
>  
> +	unsigned int stop_rings;
> +
>  	unsigned long cfb_size;
>  	unsigned int cfb_fb;
>  	enum plane cfb_plane;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> b/drivers/gpu/drm/i915/intel_ringbuffer.c index 427b7c5..d2de16d
> 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1221,7 +1221,11 @@ int intel_ring_begin(struct intel_ring_buffer
> *ring, 
>  void intel_ring_advance(struct intel_ring_buffer *ring)
>  {
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
>  	ring->tail &= ring->size - 1;
> +	if (dev_priv->stop_rings & intel_ring_flag(ring))
> +		return;
>  	ring->write_tail(ring, ring->tail);
>  }
>  

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 05/10] drm/i915: extract intel_gpu_reset
  2012-04-27 13:17 ` [PATCH 05/10] drm/i915: extract intel_gpu_reset Daniel Vetter
@ 2012-04-30  1:03   ` Ben Widawsky
  2012-05-04 16:51   ` Eugeni Dodonov
  1 sibling, 0 replies; 31+ messages in thread
From: Ben Widawsky @ 2012-04-30  1:03 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On Fri, 27 Apr 2012 15:17:42 +0200
Daniel Vetter <daniel.vetter@ffwll.ch> wrote:

> Slightly cleans up the code and could be useful for e.g. Ben
> Widawsky's hw context patches.
> 
> v2: New colours!
> 
> Cc: Ben Widawsky <ben@bwidawsk.net>

Unfortunately, I have been unable to make use of this patch. In all my
attempts, doing intel_gpu_reset (with nothing else) will result in the
next operation causing a hangcheck anyway. I think we should dig a bit
more into this before we both.

> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_drv.c |   43 ++++++++++++++++++++++++--------------
>  1 files changed, 27 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index f5450bb..c4251a1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -768,6 +768,29 @@ static int gen6_do_reset(struct drm_device *dev, u8 flags)
>  	return ret;
>  }
>  
> +static int intel_gpu_reset(struct drm_device *dev, u8 flags)
> +{
> +	int ret = -ENODEV;
> +
> +	switch (INTEL_INFO(dev)->gen) {
> +	case 7:
> +	case 6:
> +		ret = gen6_do_reset(dev, flags);
> +		break;
> +	case 5:
> +		ret = ironlake_do_reset(dev, flags);
> +		break;
> +	case 4:
> +		ret = i965_do_reset(dev, flags);
> +		break;
> +	case 2:
> +		ret = i8xx_do_reset(dev, flags);
> +		break;
> +	}
> +
> +	return ret;
> +}
> +
>  /**
>   * i915_reset - reset chip after a hang
>   * @dev: drm device to reset
> @@ -798,23 +821,11 @@ int i915_reset(struct drm_device *dev, u8 flags)
>  	i915_gem_reset(dev);
>  
>  	ret = -ENODEV;
> -	if (get_seconds() - dev_priv->last_gpu_reset < 5) {
> +	if (get_seconds() - dev_priv->last_gpu_reset < 5)
>  		DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
> -	} else switch (INTEL_INFO(dev)->gen) {
> -	case 7:
> -	case 6:
> -		ret = gen6_do_reset(dev, flags);
> -		break;
> -	case 5:
> -		ret = ironlake_do_reset(dev, flags);
> -		break;
> -	case 4:
> -		ret = i965_do_reset(dev, flags);
> -		break;
> -	case 2:
> -		ret = i8xx_do_reset(dev, flags);
> -		break;
> -	}
> +	else
> +		ret = intel_gpu_reset(dev, flags);
> +
>  	dev_priv->last_gpu_reset = get_seconds();
>  	if (ret) {
>  		DRM_ERROR("Failed to reset chip.\n");

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 01/10] drm/i915: add interface to simulate gpu hangs
  2012-04-28  4:56 ` [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Ben Widawsky
@ 2012-05-02 15:23   ` Daniel Vetter
  0 siblings, 0 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-05-02 15:23 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Daniel Vetter, Intel Graphics Development

On Fri, Apr 27, 2012 at 09:56:55PM -0700, Ben Widawsky wrote:
> On Fri, 27 Apr 2012 15:17:38 +0200
> Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> 
> > gpu reset is a very important piece of our infrastructure.
> > Unfortunately we only really it test by actually hanging the gpu,
> > which often has bad side-effects for the entire system. And the gpu
> > hang handling code is one of the rather complicated pieces of code we
> > have, consisting of
> > - hang detection
> > - error capture
> > - actual gpu reset
> > - reset of all the gem bookkeeping
> > - reinitialition of the entire gpu
> > 
> > This patch adds a debugfs to selectively stopping rings by ceasing to
> > update the hw tail pointer, which will result in the gpu no longer
> > updating it's head pointer and eventually to the hangcheck firing.
> > This way we can exercise the gpu hang code under controlled conditions
> > without a dying gpu taking down the entire systems.
> 
> You mean, tail pointer?

Yes, I mean tail pointer ;-)

> > Patch motivated by me forgetting to properly reinitialize ppgtt after
> > a gpu reset.
> > 
> > Usage:
> > 
> > echo $((1 << $ringnum)) > i915_ring_stop # stops one ring
> > 
> > echo 0xffffffff > i915_ring_stop # stops all, future-proof version
> > 
> > then run whatever testload is desired. i915_ring_stop automatically
> > resets after a gpu hang is detected to avoid hanging the gpu to fast
> > and declaring it wedged.
> > 
> > v2: Incorporate feedback from Chris Wilson.
> > 
> > v3: Add the missing cleanup.
> > 
> > Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c     |   65
> > +++++++++++++++++++++++++++++++
> > drivers/gpu/drm/i915/i915_drv.h         |    2 +
> > drivers/gpu/drm/i915/intel_ringbuffer.c |    4 ++ 3 files changed, 71
> > insertions(+), 0 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c
> > b/drivers/gpu/drm/i915/i915_debugfs.c index 120db46..3f6e28e 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -1585,6 +1585,64 @@ static const struct file_operations
> > i915_wedged_fops = { };
> >  
> >  static ssize_t
> > +i915_ring_stop_read(struct file *filp,
> > +		    char __user *ubuf,
> > +		    size_t max,
> > +		    loff_t *ppos)
> > +{
> > +	struct drm_device *dev = filp->private_data;
> > +	drm_i915_private_t *dev_priv = dev->dev_private;
> > +	char buf[80];
> > +	int len;
> > +
> > +	len = snprintf(buf, sizeof(buf),
> > +		       "0x%08x\n", dev_priv->stop_rings);
> > +
> > +	if (len > sizeof(buf))
> > +		len = sizeof(buf);
> > +
> > +	return simple_read_from_buffer(ubuf, max, ppos, buf, len);
> > +}
> > +
> > +static ssize_t
> > +i915_ring_stop_write(struct file *filp,
> > +		     const char __user *ubuf,
> > +		     size_t cnt,
> > +		     loff_t *ppos)
> > +{
> > +	struct drm_device *dev = filp->private_data;
> > +	struct drm_i915_private *dev_priv = dev->dev_private;
> > +	char buf[20];
> > +	int val = 0;
> > +
> > +	if (cnt > 0) {
> > +		if (cnt > sizeof(buf) - 1)
> > +			return -EINVAL;
> > +
> > +		if (copy_from_user(buf, ubuf, cnt))
> > +			return -EFAULT;
> > +		buf[cnt] = 0;
> > +
> > +		val = simple_strtoul(buf, NULL, 0);
> > +	}
> > +
> > +	DRM_DEBUG_DRIVER("Stopping rings 0x%08x\n", val);
> > +
> > +	mutex_lock(&dev->struct_mutex);
> > +	dev_priv->stop_rings = val;
> > +	mutex_unlock(&dev->struct_mutex);
> > +
> > +	return cnt;
> > +}
> 
> Not a huge deal, but I'd go with an interruptible mutex lock there.
> 
> Also, I think it would be cool if you did a tail update when going from
> !0->0 stop_rings.

I've added code to the reset logic to clear stop_rings after the gpu
reset, so for the hangman usecase you'll never write 0 into this sysfs
file. Original stop_rings also cleared the error_state, but Chris rightly
complained that this is too ugly and clearing the error_state is useful in
itself. Hence I've reworked the code for that.

But for updating the tail pointer again here I don't see the use case, so
I'd prefer to keep it dead-simple.
-Daniel

> 
> > +
> > +static const struct file_operations i915_ring_stop_fops = {
> > +	.owner = THIS_MODULE,
> > +	.open = simple_open,
> > +	.read = i915_ring_stop_read,
> > +	.write = i915_ring_stop_write,
> > +	.llseek = default_llseek,
> > +};
> > +static ssize_t
> >  i915_max_freq_read(struct file *filp,
> >  		   char __user *ubuf,
> >  		   size_t max,
> > @@ -1884,6 +1942,11 @@ int i915_debugfs_init(struct drm_minor *minor)
> >  				  &i915_cache_sharing_fops);
> >  	if (ret)
> >  		return ret;
> > +	ret = i915_debugfs_create(minor->debugfs_root, minor,
> > +				  "i915_ring_stop",
> > +				  &i915_ring_stop_fops);
> > +	if (ret)
> > +		return ret;
> >  
> >  	return drm_debugfs_create_files(i915_debugfs_list,
> >  					I915_DEBUGFS_ENTRIES,
> > @@ -1902,6 +1965,8 @@ void i915_debugfs_cleanup(struct drm_minor
> > *minor) 1, minor);
> >  	drm_debugfs_remove_files((struct drm_info_list *)
> > &i915_cache_sharing_fops, 1, minor);
> > +	drm_debugfs_remove_files((struct drm_info_list *)
> > &i915_ring_stop_fops,
> > +				 1, minor);
> >  }
> >  
> >  #endif /* CONFIG_DEBUG_FS */
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h
> > b/drivers/gpu/drm/i915/i915_drv.h index 0095c8d..5e62b35 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -413,6 +413,8 @@ typedef struct drm_i915_private {
> >  	uint32_t last_instdone;
> >  	uint32_t last_instdone1;
> >  
> > +	unsigned int stop_rings;
> > +
> >  	unsigned long cfb_size;
> >  	unsigned int cfb_fb;
> >  	enum plane cfb_plane;
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > b/drivers/gpu/drm/i915/intel_ringbuffer.c index 427b7c5..d2de16d
> > 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1221,7 +1221,11 @@ int intel_ring_begin(struct intel_ring_buffer
> > *ring, 
> >  void intel_ring_advance(struct intel_ring_buffer *ring)
> >  {
> > +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> > +
> >  	ring->tail &= ring->size - 1;
> > +	if (dev_priv->stop_rings & intel_ring_flag(ring))
> > +		return;
> >  	ring->write_tail(ring, ring->tail);
> >  }
> >  
> 

-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] drm/i915: fix gen4 gpu reset
  2012-04-27 23:26       ` Eric Anholt
@ 2012-05-02 19:33         ` Daniel Vetter
  2012-05-02 19:54           ` Kenneth Graunke
  2012-05-04 17:06           ` Eugeni Dodonov
  0 siblings, 2 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-05-02 19:33 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

While trying to fix up gen4 gpu reset in

commit f49f0586191fe16140410db0a46d43bdc690d6af
Author: Kenneth Graunke <kenneth@whitecape.org>
Date:   Sat Sep 11 01:19:14 2010 -0700

    drm/i915: Actually set the reset bit in i965_reset

a little confusion about when wait_for times out has been introduced -
wait for loops _until_ the condition is true.

This fixes gpu reset on my gm45, testing with my hangman code shows
that it's now fairly reliable - it only died after well over 100 reset
cycles.

Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Eric Anholt <eric@anholt.net>
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 133f101..77b7a50 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -707,7 +707,7 @@ static int i965_reset_complete(struct drm_device *dev)
 {
 	u8 gdrst;
 	pci_read_config_byte(dev->pdev, I965_GDRST, &gdrst);
-	return gdrst & 0x1;
+	return (gdrst & GRDOM_RESET_ENABLE) == 0;
 }
 
 static int i965_do_reset(struct drm_device *dev)
-- 
1.7.9.1

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] drm/i915: fix gen4 gpu reset
  2012-05-02 19:33         ` [PATCH] drm/i915: fix gen4 gpu reset Daniel Vetter
@ 2012-05-02 19:54           ` Kenneth Graunke
  2012-05-04 12:07             ` Daniel Vetter
  2012-05-04 17:06           ` Eugeni Dodonov
  1 sibling, 1 reply; 31+ messages in thread
From: Kenneth Graunke @ 2012-05-02 19:54 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On 05/02/2012 12:33 PM, Daniel Vetter wrote:
> While trying to fix up gen4 gpu reset in
>
> commit f49f0586191fe16140410db0a46d43bdc690d6af
> Author: Kenneth Graunke<kenneth@whitecape.org>
> Date:   Sat Sep 11 01:19:14 2010 -0700
>
>      drm/i915: Actually set the reset bit in i965_reset
>
> a little confusion about when wait_for times out has been introduced -
> wait for loops _until_ the condition is true.
>
> This fixes gpu reset on my gm45, testing with my hangman code shows
> that it's now fairly reliable - it only died after well over 100 reset
> cycles.
>
> Cc: Kenneth Graunke<kenneth@whitecape.org>
> Cc: Eric Anholt<eric@anholt.net>
> Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>
> ---
>   drivers/gpu/drm/i915/i915_drv.c |    2 +-
>   1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 133f101..77b7a50 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -707,7 +707,7 @@ static int i965_reset_complete(struct drm_device *dev)
>   {
>   	u8 gdrst;
>   	pci_read_config_byte(dev->pdev, I965_GDRST,&gdrst);
> -	return gdrst&  0x1;
> +	return (gdrst&  GRDOM_RESET_ENABLE) == 0;
>   }
>
>   static int i965_do_reset(struct drm_device *dev)

How embarassing :)  Ironlake also has the same bug and IIRC hasn't been 
resetting properly for me.  I can test that if you'd like.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>

I might also Cc: stable@kernel.org an equivalent of this...but, up to you.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH] drm/i915: add interface to simulate gpu hangs
  2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
                   ` (9 preceding siblings ...)
  2012-04-28  4:56 ` [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Ben Widawsky
@ 2012-05-03 12:48 ` Daniel Vetter
  2012-05-03 19:00   ` Eugeni Dodonov
  10 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-05-03 12:48 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

gpu reset is a very important piece of our infrastructure.
Unfortunately we only really it test by actually hanging the gpu,
which often has bad side-effects for the entire system. And the gpu
hang handling code is one of the rather complicated pieces of code we
have, consisting of
- hang detection
- error capture
- actual gpu reset
- reset of all the gem bookkeeping
- reinitialition of the entire gpu

This patch adds a debugfs to selectively stopping rings by ceasing to
update the hw tail pointer, which will result in the gpu no longer
updating it's head pointer and eventually to the hangcheck firing.
This way we can exercise the gpu hang code under controlled conditions
without a dying gpu taking down the entire systems.

Patch motivated by me forgetting to properly reinitialize ppgtt after
a gpu reset.

Usage:

echo $((1 << $ringnum)) > i915_ring_stop # stops one ring

echo 0xffffffff > i915_ring_stop # stops all, future-proof version

then run whatever testload is desired. i915_ring_stop automatically
resets after a gpu hang is detected to avoid hanging the gpu to fast
and declaring it wedged.

v2: Incorporate feedback from Chris Wilson.

v3: Add the missing cleanup.

v4: Fix up inconsistent size of ring_stop_read vs _write, noticed by
Eugeni Dodonov.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   65 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.c         |    2 +
 drivers/gpu/drm/i915/i915_drv.h         |    2 +
 drivers/gpu/drm/i915/intel_ringbuffer.c |    4 ++
 4 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 120db46..602e100 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1585,6 +1585,64 @@ static const struct file_operations i915_wedged_fops = {
 };
 
 static ssize_t
+i915_ring_stop_read(struct file *filp,
+		    char __user *ubuf,
+		    size_t max,
+		    loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+	char buf[20];
+	int len;
+
+	len = snprintf(buf, sizeof(buf),
+		       "0x%08x\n", dev_priv->stop_rings);
+
+	if (len > sizeof(buf))
+		len = sizeof(buf);
+
+	return simple_read_from_buffer(ubuf, max, ppos, buf, len);
+}
+
+static ssize_t
+i915_ring_stop_write(struct file *filp,
+		     const char __user *ubuf,
+		     size_t cnt,
+		     loff_t *ppos)
+{
+	struct drm_device *dev = filp->private_data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	char buf[20];
+	int val = 0;
+
+	if (cnt > 0) {
+		if (cnt > sizeof(buf) - 1)
+			return -EINVAL;
+
+		if (copy_from_user(buf, ubuf, cnt))
+			return -EFAULT;
+		buf[cnt] = 0;
+
+		val = simple_strtoul(buf, NULL, 0);
+	}
+
+	DRM_DEBUG_DRIVER("Stopping rings 0x%08x\n", val);
+
+	mutex_lock(&dev->struct_mutex);
+	dev_priv->stop_rings = val;
+	mutex_unlock(&dev->struct_mutex);
+
+	return cnt;
+}
+
+static const struct file_operations i915_ring_stop_fops = {
+	.owner = THIS_MODULE,
+	.open = simple_open,
+	.read = i915_ring_stop_read,
+	.write = i915_ring_stop_write,
+	.llseek = default_llseek,
+};
+static ssize_t
 i915_max_freq_read(struct file *filp,
 		   char __user *ubuf,
 		   size_t max,
@@ -1884,6 +1942,11 @@ int i915_debugfs_init(struct drm_minor *minor)
 				  &i915_cache_sharing_fops);
 	if (ret)
 		return ret;
+	ret = i915_debugfs_create(minor->debugfs_root, minor,
+				  "i915_ring_stop",
+				  &i915_ring_stop_fops);
+	if (ret)
+		return ret;
 
 	return drm_debugfs_create_files(i915_debugfs_list,
 					I915_DEBUGFS_ENTRIES,
@@ -1902,6 +1965,8 @@ void i915_debugfs_cleanup(struct drm_minor *minor)
 				 1, minor);
 	drm_debugfs_remove_files((struct drm_info_list *) &i915_cache_sharing_fops,
 				 1, minor);
+	drm_debugfs_remove_files((struct drm_info_list *) &i915_ring_stop_fops,
+				 1, minor);
 }
 
 #endif /* CONFIG_DEBUG_FS */
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 8a98f9a..90a84f9 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -800,6 +800,8 @@ int i915_reset(struct drm_device *dev, u8 flags)
 	if (!mutex_trylock(&dev->struct_mutex))
 		return -EBUSY;
 
+	dev_priv->stop_rings = 0;
+
 	i915_gem_reset(dev);
 
 	ret = -ENODEV;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0095c8d..5e62b35 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -413,6 +413,8 @@ typedef struct drm_i915_private {
 	uint32_t last_instdone;
 	uint32_t last_instdone1;
 
+	unsigned int stop_rings;
+
 	unsigned long cfb_size;
 	unsigned int cfb_fb;
 	enum plane cfb_plane;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 427b7c5..d2de16d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1221,7 +1221,11 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
 
 void intel_ring_advance(struct intel_ring_buffer *ring)
 {
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
 	ring->tail &= ring->size - 1;
+	if (dev_priv->stop_rings & intel_ring_flag(ring))
+		return;
 	ring->write_tail(ring, ring->tail);
 }
 
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: [PATCH] drm/i915: add interface to simulate gpu hangs
  2012-05-03 12:48 ` [PATCH] " Daniel Vetter
@ 2012-05-03 19:00   ` Eugeni Dodonov
  2012-05-05 19:13     ` Daniel Vetter
  0 siblings, 1 reply; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-03 19:00 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development


[-- Attachment #1.1: Type: text/plain, Size: 1637 bytes --]

On Thu, May 3, 2012 at 9:48 AM, Daniel Vetter <daniel.vetter@ffwll.ch>wrote:

> gpu reset is a very important piece of our infrastructure.
> Unfortunately we only really it test by actually hanging the gpu,
> which often has bad side-effects for the entire system. And the gpu
> hang handling code is one of the rather complicated pieces of code we
> have, consisting of
> - hang detection
> - error capture
> - actual gpu reset
> - reset of all the gem bookkeeping
> - reinitialition of the entire gpu
>
> This patch adds a debugfs to selectively stopping rings by ceasing to
> update the hw tail pointer, which will result in the gpu no longer
> updating it's head pointer and eventually to the hangcheck firing.
> This way we can exercise the gpu hang code under controlled conditions
> without a dying gpu taking down the entire systems.
>
> Patch motivated by me forgetting to properly reinitialize ppgtt after
> a gpu reset.
>
> Usage:
>
> echo $((1 << $ringnum)) > i915_ring_stop # stops one ring
>
> echo 0xffffffff > i915_ring_stop # stops all, future-proof version
>
> then run whatever testload is desired. i915_ring_stop automatically
> resets after a gpu hang is detected to avoid hanging the gpu to fast
> and declaring it wedged.
>
> v2: Incorporate feedback from Chris Wilson.
>
> v3: Add the missing cleanup.
>
> v4: Fix up inconsistent size of ring_stop_read vs _write, noticed by
> Eugeni Dodonov.
>
> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
>

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

-- 
Eugeni Dodonov
<http://eugeni.dodonov.net/>

[-- Attachment #1.2: Type: text/html, Size: 2324 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed
  2012-04-27 13:17 ` [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed Daniel Vetter
@ 2012-05-04 11:56   ` Daniel Vetter
  2012-05-04 17:15     ` Eugeni Dodonov
  0 siblings, 1 reply; 31+ messages in thread
From: Daniel Vetter @ 2012-05-04 11:56 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On Fri, Apr 27, 2012 at 03:17:40PM +0200, Daniel Vetter wrote:
> ... by writing (anything) to i915_error_state.
> 
> This way we can simulate a bunch of gpu hangs and run the error_state
> capture code every time (without the need to reload the module).
> 
> To make that happen we need to abandon the simple seq_file wrappers
> provided by the drm core. While at it, but the new error_state
> refcounting to some good use and associated. This should help greatly
> when we finally get around to split up the giant single seq_file block
> that the error_state file currently is into smaller parts.

Meh, commit msg fail, that phrase should read "While at it put the new
error_state refcounting to some good use and associated the error_state to
the debugfs when opening the file. Otherwise the error_state could change
while someone is reading it. This should help greatly ..."

-Daniel
> 
> v2: Actually squash all the fixes into the patch ...
> 
> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |   90 ++++++++++++++++++++++++++++++-----
>  1 files changed, 77 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index b46c198..4805df0 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -691,21 +691,19 @@ static void i915_ring_error_state(struct seq_file *m,
>  	seq_printf(m, "  ring->tail: 0x%08x\n", error->cpu_ring_tail[ring]);
>  }
>  
> +struct i915_error_state_file_priv {
> +	struct drm_device *dev;
> +	struct drm_i915_error_state *error;
> +};
> +
>  static int i915_error_state(struct seq_file *m, void *unused)
>  {
> -	struct drm_info_node *node = (struct drm_info_node *) m->private;
> -	struct drm_device *dev = node->minor->dev;
> +	struct i915_error_state_file_priv *error_priv = m->private;
> +	struct drm_device *dev = error_priv->dev;
>  	drm_i915_private_t *dev_priv = dev->dev_private;
> -	struct drm_i915_error_state *error;
> -	unsigned long flags;
> +	struct drm_i915_error_state *error = error_priv->error;
>  	int i, j, page, offset, elt;
>  
> -	spin_lock_irqsave(&dev_priv->error_lock, flags);
> -	error = dev_priv->first_error;
> -	if (error)
> -		kref_get(&error->ref);
> -	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
> -
>  	if (!error) {
>  		seq_printf(m, "no error state collected\n");
>  		return 0;
> @@ -792,11 +790,71 @@ static int i915_error_state(struct seq_file *m, void *unused)
>  	if (error->display)
>  		intel_display_print_error_state(m, dev, error->display);
>  
> -	kref_put(&error->ref, i915_error_state_free);
> -
>  	return 0;
>  }
>  
> +static ssize_t
> +i915_error_state_write(struct file *filp,
> +		       const char __user *ubuf,
> +		       size_t cnt,
> +		       loff_t *ppos)
> +{
> +	struct seq_file *m = filp->private_data;
> +	struct i915_error_state_file_priv *error_priv = m->private;
> +	struct drm_device *dev = error_priv->dev;
> +
> +	DRM_DEBUG_DRIVER("Resetting error state\n");
> +
> +	mutex_lock(&dev->struct_mutex);
> +	i915_destroy_error_state(dev);
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return cnt;
> +}
> +
> +static int i915_error_state_open(struct inode *inode, struct file *file)
> +{
> +	struct drm_device *dev = inode->i_private;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +	struct i915_error_state_file_priv *error_priv;
> +	unsigned long flags;
> +
> +	error_priv = kzalloc(sizeof(*error_priv), GFP_KERNEL);
> +	if (!error_priv)
> +		return -ENOMEM;
> +
> +	error_priv->dev = dev;
> +
> +	spin_lock_irqsave(&dev_priv->error_lock, flags);
> +	error_priv->error = dev_priv->first_error;
> +	if (error_priv->error)
> +		kref_get(&error_priv->error->ref);
> +	spin_unlock_irqrestore(&dev_priv->error_lock, flags);
> +
> +	return single_open(file, i915_error_state, error_priv);
> +}
> +
> +static int i915_error_state_release(struct inode *inode, struct file *file)
> +{
> +	struct seq_file *m = file->private_data;
> +	struct i915_error_state_file_priv *error_priv = m->private;
> +
> +	if (error_priv->error)
> +		kref_put(&error_priv->error->ref, i915_error_state_free);
> +	kfree(error_priv);
> +
> +	return single_release(inode, file);
> +}
> +
> +static const struct file_operations i915_error_state_fops = {
> +	.owner = THIS_MODULE,
> +	.open = i915_error_state_open,
> +	.read = seq_read,
> +	.write = i915_error_state_write,
> +	.llseek = default_llseek,
> +	.release = i915_error_state_release,
> +};
> +
>  static int i915_rstdby_delays(struct seq_file *m, void *unused)
>  {
>  	struct drm_info_node *node = (struct drm_info_node *) m->private;
> @@ -1645,6 +1703,7 @@ static const struct file_operations i915_ring_stop_fops = {
>  	.write = i915_ring_stop_write,
>  	.llseek = default_llseek,
>  };
> +
>  static ssize_t
>  i915_max_freq_read(struct file *filp,
>  		   char __user *ubuf,
> @@ -1899,7 +1958,6 @@ static struct drm_info_list i915_debugfs_list[] = {
>  	{"i915_gem_hws", i915_hws_info, 0, (void *)RCS},
>  	{"i915_gem_hws_blt", i915_hws_info, 0, (void *)BCS},
>  	{"i915_gem_hws_bsd", i915_hws_info, 0, (void *)VCS},
> -	{"i915_error_state", i915_error_state, 0},
>  	{"i915_rstdby_delays", i915_rstdby_delays, 0},
>  	{"i915_cur_delayinfo", i915_cur_delayinfo, 0},
>  	{"i915_delayfreq_table", i915_delayfreq_table, 0},
> @@ -1951,6 +2009,12 @@ int i915_debugfs_init(struct drm_minor *minor)
>  	if (ret)
>  		return ret;
>  
> +	ret = i915_debugfs_create(minor->debugfs_root, minor,
> +				  "i915_error_state",
> +				  &i915_error_state_fops);
> +	if (ret)
> +		return ret;
> +
>  	return drm_debugfs_create_files(i915_debugfs_list,
>  					I915_DEBUGFS_ENTRIES,
>  					minor->debugfs_root, minor);
> -- 
> 1.7.9
> 

-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] drm/i915: fix gen4 gpu reset
  2012-05-02 19:54           ` Kenneth Graunke
@ 2012-05-04 12:07             ` Daniel Vetter
  0 siblings, 0 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-05-04 12:07 UTC (permalink / raw)
  To: Kenneth Graunke; +Cc: Daniel Vetter, Intel Graphics Development

On Wed, May 02, 2012 at 12:54:46PM -0700, Kenneth Graunke wrote:
> On 05/02/2012 12:33 PM, Daniel Vetter wrote:
> >While trying to fix up gen4 gpu reset in
> >
> >commit f49f0586191fe16140410db0a46d43bdc690d6af
> >Author: Kenneth Graunke<kenneth@whitecape.org>
> >Date:   Sat Sep 11 01:19:14 2010 -0700
> >
> >     drm/i915: Actually set the reset bit in i965_reset
> >
> >a little confusion about when wait_for times out has been introduced -
> >wait for loops _until_ the condition is true.
> >
> >This fixes gpu reset on my gm45, testing with my hangman code shows
> >that it's now fairly reliable - it only died after well over 100 reset
> >cycles.
> >
> >Cc: Kenneth Graunke<kenneth@whitecape.org>
> >Cc: Eric Anholt<eric@anholt.net>
> >Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>
> >---
> >  drivers/gpu/drm/i915/i915_drv.c |    2 +-
> >  1 files changed, 1 insertions(+), 1 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> >index 133f101..77b7a50 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.c
> >+++ b/drivers/gpu/drm/i915/i915_drv.c
> >@@ -707,7 +707,7 @@ static int i965_reset_complete(struct drm_device *dev)
> >  {
> >  	u8 gdrst;
> >  	pci_read_config_byte(dev->pdev, I965_GDRST,&gdrst);
> >-	return gdrst&  0x1;
> >+	return (gdrst&  GRDOM_RESET_ENABLE) == 0;
> >  }
> >
> >  static int i965_do_reset(struct drm_device *dev)
> 
> How embarassing :)  Ironlake also has the same bug and IIRC hasn't
> been resetting properly for me.  I can test that if you'd like.

Yeah, would be great if you could try to reproduce your ilk reset failure
- it seems to work for me here.

> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
> 
> I might also Cc: stable@kernel.org an equivalent of this...but, up to you.

Well, given that we do not even have a bug report about this regression,
I'll leave this out. Thanks for the review.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 07/10] drm/i915: kill flags parameter for reset functions
  2012-04-27 13:17 ` [PATCH 07/10] drm/i915: kill flags parameter for reset functions Daniel Vetter
@ 2012-05-04 16:47   ` Eugeni Dodonov
  0 siblings, 0 replies; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 16:47 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On 04/27/2012 10:17 AM, Daniel Vetter wrote:
> Only half of them even cared, and it's always the same one.
>
> Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 06/10] drm/i915: make gpu hangman more resilient
  2012-04-27 13:17 ` [PATCH 06/10] drm/i915: make gpu hangman more resilient Daniel Vetter
@ 2012-05-04 16:47   ` Eugeni Dodonov
  0 siblings, 0 replies; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 16:47 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On 04/27/2012 10:17 AM, Daniel Vetter wrote:
> - reset the stop_rings infrastructure while resetting the hw to
>    avoid angering the hangcheck right away (and potentially declaring
>    the gpu permanently wedged).
>
> - ignore reset failures when hanging due to the hangman - we don't
>    have reset code for all generations.
>
> v2: Ensure that we only ignore reset failures when the hw reset is not
> implemented and not when it failed.
>
> Signed-off-by: Daniel Vetter<daniel.vetter@ffwll.ch>

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 04/10] drm/i915: simplify i915_reset a bit
  2012-04-27 13:17 ` [PATCH 04/10] drm/i915: simplify i915_reset a bit Daniel Vetter
@ 2012-05-04 16:47   ` Eugeni Dodonov
  0 siblings, 0 replies; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 16:47 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On 04/27/2012 10:17 AM, Daniel Vetter wrote:
> - need_display is always true, scrap it.
> - don't reacquire the mutex to do nothing after having restored the
>    gem state.
>
> Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 05/10] drm/i915: extract intel_gpu_reset
  2012-04-27 13:17 ` [PATCH 05/10] drm/i915: extract intel_gpu_reset Daniel Vetter
  2012-04-30  1:03   ` Ben Widawsky
@ 2012-05-04 16:51   ` Eugeni Dodonov
  1 sibling, 0 replies; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 16:51 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development, Ben Widawsky

On 04/27/2012 10:17 AM, Daniel Vetter wrote:
> Slightly cleans up the code and could be useful for e.g. Ben
> Widawsky's hw context patches.
>
> v2: New colours!
>
> Cc: Ben Widawsky<ben@bwidawsk.net>
> Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 02/10] drm/i915: rework dev->first_error locking
  2012-04-27 13:17 ` [PATCH 02/10] drm/i915: rework dev->first_error locking Daniel Vetter
@ 2012-05-04 17:04   ` Eugeni Dodonov
  0 siblings, 0 replies; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 17:04 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On 04/27/2012 10:17 AM, Daniel Vetter wrote:
> - reduce the irq disabled section, even for a debugfs file this was
>    way too long.
> - always disable irqs when taking the lock.
>
> v2: Thou shalt not mistake locking for reference counting, so:
> - reference count the error_state to protect from concurent freeeing.
>    This will be only really used in the next patch.
>
> Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] drm/i915: fix gen4 gpu reset
  2012-05-02 19:33         ` [PATCH] drm/i915: fix gen4 gpu reset Daniel Vetter
  2012-05-02 19:54           ` Kenneth Graunke
@ 2012-05-04 17:06           ` Eugeni Dodonov
  1 sibling, 0 replies; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 17:06 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Intel Graphics Development

On 05/02/2012 04:33 PM, Daniel Vetter wrote:
> While trying to fix up gen4 gpu reset in
>
> commit f49f0586191fe16140410db0a46d43bdc690d6af
> Author: Kenneth Graunke<kenneth@whitecape.org>
> Date:   Sat Sep 11 01:19:14 2010 -0700
>
>      drm/i915: Actually set the reset bit in i965_reset
>
> a little confusion about when wait_for times out has been introduced -
> wait for loops _until_ the condition is true.
>
> This fixes gpu reset on my gm45, testing with my hangman code shows
> that it's now fairly reliable - it only died after well over 100 reset
> cycles.
>
> Cc: Kenneth Graunke<kenneth@whitecape.org>
> Cc: Eric Anholt<eric@anholt.net>
> Signed-Off-by: Daniel Vetter<daniel.vetter@ffwll.ch>

I haven't tested it on a real machine to verify if it actually does the 
reset, but this change does explains why it could not work before :).

Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed
  2012-05-04 11:56   ` Daniel Vetter
@ 2012-05-04 17:15     ` Eugeni Dodonov
  2012-05-04 19:58       ` Daniel Vetter
  0 siblings, 1 reply; 31+ messages in thread
From: Eugeni Dodonov @ 2012-05-04 17:15 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, Intel Graphics Development

On 05/04/2012 08:56 AM, Daniel Vetter wrote:
>> +static int i915_error_state_release(struct inode *inode, struct file *file)
>> +{
>> +	struct seq_file *m = file->private_data;
>> +	struct i915_error_state_file_priv *error_priv = m->private;
>> +
>> +	if (error_priv->error)
>> +		kref_put(&error_priv->error->ref, i915_error_state_free);
>> +	kfree(error_priv);

Maybe a stupid question, but shouldn't we hold the error_lock here as well?

But apart from that:
Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed
  2012-05-04 17:15     ` Eugeni Dodonov
@ 2012-05-04 19:58       ` Daniel Vetter
  0 siblings, 0 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-05-04 19:58 UTC (permalink / raw)
  To: eugeni.dodonov; +Cc: Daniel Vetter, Intel Graphics Development

On Fri, May 04, 2012 at 02:15:33PM -0300, Eugeni Dodonov wrote:
> On 05/04/2012 08:56 AM, Daniel Vetter wrote:
> >>+static int i915_error_state_release(struct inode *inode, struct file *file)
> >>+{
> >>+	struct seq_file *m = file->private_data;
> >>+	struct i915_error_state_file_priv *error_priv = m->private;
> >>+
> >>+	if (error_priv->error)
> >>+		kref_put(&error_priv->error->ref, i915_error_state_free);
> >>+	kfree(error_priv);
> 
> Maybe a stupid question, but shouldn't we hold the error_lock here as well?

That's the cool thing with ref-counting: As long as someone is still
holding a referencing, the object won't ever disappear. The tricky part is
to ensure that every time someone could still access a reference, it
actually does. Let's go through the list:
- When creating the error state in the hangcheck code we init the refcount
  to 1. We drop that reference if the error state capturing fails. On
  success we transfer this reference to dev_priv->error_state (hence no
  additional refcounting is necessary).
- As long as dev_priv->error_state is non-NULL, it holds a reference onto
  that error_state.
- When we open the error_state debugfs file we grab a reference and only
  drop it when we close the file.

The important bit in that dance is that the refcount held by
dev_priv->error_state and whether that pointer is non-NULL need to be
always consistent (to avoid somebody reading a non-NULL error_state
pointer while the refcount has already dropped to zero). This is the only
place we need locking - and what my comment about not mistaking
ref-counting for locking alluded to.

Cheers, Daniel

> But apart from that:
> Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>


-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH] drm/i915: add interface to simulate gpu hangs
  2012-05-03 19:00   ` Eugeni Dodonov
@ 2012-05-05 19:13     ` Daniel Vetter
  0 siblings, 0 replies; 31+ messages in thread
From: Daniel Vetter @ 2012-05-05 19:13 UTC (permalink / raw)
  To: Eugeni Dodonov; +Cc: Daniel Vetter, Intel Graphics Development

On Thu, May 03, 2012 at 04:00:00PM -0300, Eugeni Dodonov wrote:
> On Thu, May 3, 2012 at 9:48 AM, Daniel Vetter <daniel.vetter@ffwll.ch>wrote:
> 
> > gpu reset is a very important piece of our infrastructure.
> > Unfortunately we only really it test by actually hanging the gpu,
> > which often has bad side-effects for the entire system. And the gpu
> > hang handling code is one of the rather complicated pieces of code we
> > have, consisting of
> > - hang detection
> > - error capture
> > - actual gpu reset
> > - reset of all the gem bookkeeping
> > - reinitialition of the entire gpu
> >
> > This patch adds a debugfs to selectively stopping rings by ceasing to
> > update the hw tail pointer, which will result in the gpu no longer
> > updating it's head pointer and eventually to the hangcheck firing.
> > This way we can exercise the gpu hang code under controlled conditions
> > without a dying gpu taking down the entire systems.
> >
> > Patch motivated by me forgetting to properly reinitialize ppgtt after
> > a gpu reset.
> >
> > Usage:
> >
> > echo $((1 << $ringnum)) > i915_ring_stop # stops one ring
> >
> > echo 0xffffffff > i915_ring_stop # stops all, future-proof version
> >
> > then run whatever testload is desired. i915_ring_stop automatically
> > resets after a gpu hang is detected to avoid hanging the gpu to fast
> > and declaring it wedged.
> >
> > v2: Incorporate feedback from Chris Wilson.
> >
> > v3: Add the missing cleanup.
> >
> > v4: Fix up inconsistent size of ring_stop_read vs _write, noticed by
> > Eugeni Dodonov.
> >
> > Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> >
> 
> Reviewed-by: Eugeni Dodonov <eugeni.dodonov@intel.com>

I've slurped the hangman into -next, thanks for the review.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2012-05-05 19:12 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-04-27 13:17 [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Daniel Vetter
2012-04-27 13:17 ` [PATCH 02/10] drm/i915: rework dev->first_error locking Daniel Vetter
2012-05-04 17:04   ` Eugeni Dodonov
2012-04-27 13:17 ` [PATCH 03/10] drm/i915: allow the existing error_state to be destroyed Daniel Vetter
2012-05-04 11:56   ` Daniel Vetter
2012-05-04 17:15     ` Eugeni Dodonov
2012-05-04 19:58       ` Daniel Vetter
2012-04-27 13:17 ` [PATCH 04/10] drm/i915: simplify i915_reset a bit Daniel Vetter
2012-05-04 16:47   ` Eugeni Dodonov
2012-04-27 13:17 ` [PATCH 05/10] drm/i915: extract intel_gpu_reset Daniel Vetter
2012-04-30  1:03   ` Ben Widawsky
2012-05-04 16:51   ` Eugeni Dodonov
2012-04-27 13:17 ` [PATCH 06/10] drm/i915: make gpu hangman more resilient Daniel Vetter
2012-05-04 16:47   ` Eugeni Dodonov
2012-04-27 13:17 ` [PATCH 07/10] drm/i915: kill flags parameter for reset functions Daniel Vetter
2012-05-04 16:47   ` Eugeni Dodonov
2012-04-27 13:17 ` [PATCH 08/10] drm/i915: also reset the media engine on gen4/5 Daniel Vetter
2012-04-27 13:17 ` [PATCH 09/10] drm/i915: remove modeset reset from i915_reset Daniel Vetter
2012-04-27 13:17 ` [PATCH 10/10] drm/i915: kill gen4 gpu reset code Daniel Vetter
2012-04-27 18:49   ` Eric Anholt
2012-04-27 19:17     ` Daniel Vetter
2012-04-27 23:26       ` Eric Anholt
2012-05-02 19:33         ` [PATCH] drm/i915: fix gen4 gpu reset Daniel Vetter
2012-05-02 19:54           ` Kenneth Graunke
2012-05-04 12:07             ` Daniel Vetter
2012-05-04 17:06           ` Eugeni Dodonov
2012-04-28  4:56 ` [PATCH 01/10] drm/i915: add interface to simulate gpu hangs Ben Widawsky
2012-05-02 15:23   ` Daniel Vetter
2012-05-03 12:48 ` [PATCH] " Daniel Vetter
2012-05-03 19:00   ` Eugeni Dodonov
2012-05-05 19:13     ` Daniel Vetter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.