All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/i915: Track last register written (debug)
@ 2015-02-20  5:33 Ben Widawsky
  2015-02-20  5:33 ` [PATCH 2/2] drm/i915: Add the last written reg to error state Ben Widawsky
  2015-02-20 11:55 ` [PATCH 1/2] drm/i915: Track last register written (debug) Paulo Zanoni
  0 siblings, 2 replies; 6+ messages in thread
From: Ben Widawsky @ 2015-02-20  5:33 UTC (permalink / raw)
  To: Intel GFX; +Cc: Ben Widawsky, Ben Widawsky

Register writes are supposed to be posted, and therefore "fast." In order to try
to preserve this behavior, the patch uses percpu variables to make the overhead
minimal. The slow data collection is done at error time, and that only occurs if
the user specified mmio_debug=1 module parameter.

I wrote this patch to try to track down annoying and sporadic errors on my
Broadwell laptop. Sometimes it's obvious how you got to this point from the
backtrace, but other times it isn't.

Output looks like this:
[   63.122716] [drm:hsw_unclaimed_reg_detect] *ERROR* Unclaimed register detected. Please use the i915.mmio_debug=1 to debug this problem.
[  658.662961] ------------[ cut here ]------------
...
[  658.663113] ---[ end trace c7f0a18c4d4e027b ]---
[  658.663114] [drm:hsw_unclaimed_reg_debug] *ERROR* Last registers written:
[  658.663115] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00002030
[  658.663116] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00022030
[  658.663117] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00002030
[  658.663118] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00002030
[  658.663119] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00000000
[  658.663120] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00000000
[  658.663121] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00000000
[  658.663122] [drm:hsw_unclaimed_reg_debug] *ERROR* 	0x00000000

The same data is obtainable with using the various perf/ftrace hooks, but that
is a bit larger of a hammer. It's hard to tell how useful this is/would be.

Cc: Paulo Zanoni <przanoni@gmail.com>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h     | 11 +++++++++--
 drivers/gpu/drm/i915/intel_uncore.c | 11 ++++++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c33327d..6fa22db 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2882,10 +2882,17 @@ int vlv_freq_opcode(struct drm_i915_private *dev_priv, int val);
 #define I915_READ16_NOTRACE(reg)	dev_priv->uncore.funcs.mmio_readw(dev_priv, (reg), false)
 #define I915_WRITE16_NOTRACE(reg, val)	dev_priv->uncore.funcs.mmio_writew(dev_priv, (reg), (val), false)
 
+DECLARE_PER_CPU(uint32_t, i915_last_written);
 #define I915_READ(reg)		dev_priv->uncore.funcs.mmio_readl(dev_priv, (reg), true)
-#define I915_WRITE(reg, val)	dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), true)
+#define I915_WRITE(reg, val)	do {						\
+	dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), true);	\
+	raw_cpu_write(i915_last_written, reg);					\
+} while (0)
 #define I915_READ_NOTRACE(reg)		dev_priv->uncore.funcs.mmio_readl(dev_priv, (reg), false)
-#define I915_WRITE_NOTRACE(reg, val)	dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), false)
+#define I915_WRITE_NOTRACE(reg, val)	do {					\
+	dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), false);	\
+	raw_cpu_write(i915_last_written, reg);					\
+} while (0)
 
 /* Be very careful with read/write 64-bit values. On 32-bit machines, they
  * will be implemented using 2 32-bit writes in an arbitrary order with
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index b29091b..001ce55 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -33,7 +33,10 @@
 #define __raw_i915_write16(dev_priv__, reg__, val__) writew(val__, (dev_priv__)->regs + (reg__))
 
 #define __raw_i915_read32(dev_priv__, reg__) readl((dev_priv__)->regs + (reg__))
-#define __raw_i915_write32(dev_priv__, reg__, val__) writel(val__, (dev_priv__)->regs + (reg__))
+#define __raw_i915_write32(dev_priv__, reg__, val__) do { \
+	writel(val__, (dev_priv__)->regs + (reg__));	  \
+	raw_cpu_write(i915_last_written, reg__);	  \
+} while (0)
 
 #define __raw_i915_read64(dev_priv__, reg__) readq((dev_priv__)->regs + (reg__))
 #define __raw_i915_write64(dev_priv__, reg__, val__) writeq(val__, (dev_priv__)->regs + (reg__))
@@ -513,6 +516,7 @@ ilk_dummy_write(struct drm_i915_private *dev_priv)
 	__raw_i915_write32(dev_priv, MI_MODE, 0);
 }
 
+DEFINE_PER_CPU(uint32_t, i915_last_written);
 static void
 hsw_unclaimed_reg_debug(struct drm_i915_private *dev_priv, u32 reg, bool read,
 			bool before)
@@ -524,8 +528,13 @@ hsw_unclaimed_reg_debug(struct drm_i915_private *dev_priv, u32 reg, bool read,
 		return;
 
 	if (__raw_i915_read32(dev_priv, FPGA_DBG) & FPGA_DBG_RM_NOCLAIM) {
+		int cpu;
 		WARN(1, "Unclaimed register detected %s %s register 0x%x\n",
 		     when, op, reg);
+		DRM_ERROR("Last registers written:\n");
+		for_each_possible_cpu(cpu)
+			DRM_ERROR("\t0x%08x\n", per_cpu(i915_last_written, cpu));
+
 		__raw_i915_write32(dev_priv, FPGA_DBG, FPGA_DBG_RM_NOCLAIM);
 	}
 }
-- 
2.3.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/2] drm/i915: Add the last written reg to error state
  2015-02-20  5:33 [PATCH 1/2] drm/i915: Track last register written (debug) Ben Widawsky
@ 2015-02-20  5:33 ` Ben Widawsky
  2015-02-20 12:04   ` Chris Wilson
  2015-02-20 13:29   ` shuang.he
  2015-02-20 11:55 ` [PATCH 1/2] drm/i915: Track last register written (debug) Paulo Zanoni
  1 sibling, 2 replies; 6+ messages in thread
From: Ben Widawsky @ 2015-02-20  5:33 UTC (permalink / raw)
  To: Intel GFX; +Cc: Ben Widawsky, Ben Widawsky

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6fa22db..49bc296 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -323,6 +323,7 @@ struct drm_i915_error_state {
 	struct timeval time;
 
 	char error_msg[128];
+	u32 last_written[NR_CPUS];
 	u32 reset_count;
 	u32 suspend_count;
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 2c87a79..f7737c5 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -356,6 +356,9 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 				   error->ring[i].pid);
 		}
 	}
+	for_each_possible_cpu(i)
+		err_printf(m, "Last written register (cpu=%d): 0x%08x\n",
+			   i, error->last_written[i]);
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
 	err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
@@ -1253,6 +1256,13 @@ static void i915_error_capture_msg(struct drm_device *dev,
 		  wedged ? "reset" : "continue");
 }
 
+static void i915_capture_cpu_state(struct drm_i915_error_state *error)
+{
+	int cpu;
+	for_each_possible_cpu(cpu)
+		error->last_written[cpu] = per_cpu(i915_last_written, cpu);
+}
+
 static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
 				   struct drm_i915_error_state *error)
 {
@@ -1286,6 +1296,7 @@ void i915_capture_error_state(struct drm_device *dev, bool wedged,
 
 	kref_init(&error->ref);
 
+	i915_capture_cpu_state(error);
 	i915_capture_gen_state(dev_priv, error);
 	i915_capture_reg_state(dev_priv, error);
 	i915_gem_capture_buffers(dev_priv, error);
-- 
2.3.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] drm/i915: Track last register written (debug)
  2015-02-20  5:33 [PATCH 1/2] drm/i915: Track last register written (debug) Ben Widawsky
  2015-02-20  5:33 ` [PATCH 2/2] drm/i915: Add the last written reg to error state Ben Widawsky
@ 2015-02-20 11:55 ` Paulo Zanoni
  2015-02-20 11:58   ` Chris Wilson
  1 sibling, 1 reply; 6+ messages in thread
From: Paulo Zanoni @ 2015-02-20 11:55 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

2015-02-20 3:33 GMT-02:00 Ben Widawsky <benjamin.widawsky@intel.com>:
> Register writes are supposed to be posted, and therefore "fast." In order to try
> to preserve this behavior, the patch uses percpu variables to make the overhead
> minimal. The slow data collection is done at error time, and that only occurs if
> the user specified mmio_debug=1 module parameter.
>
> I wrote this patch to try to track down annoying and sporadic errors on my
> Broadwell laptop. Sometimes it's obvious how you got to this point from the
> backtrace, but other times it isn't.
>

Register reads can also trigger the unclaimed register error. Don't
you want to also record the last one we read on each CPU too?

Also, just to remind you since you seem to be debugging something:
user space programs (such as intel_reg_dumper or anything else) and
other Kernel modules (like vgacon and others) can also trigger these
errors, and we can't easily catch these cases.

Pinging Chris since he always has some ideas about this code.


> Output looks like this:
> [   63.122716] [drm:hsw_unclaimed_reg_detect] *ERROR* Unclaimed register detected. Please use the i915.mmio_debug=1 to debug this problem.
> [  658.662961] ------------[ cut here ]------------
> ...
> [  658.663113] ---[ end trace c7f0a18c4d4e027b ]---
> [  658.663114] [drm:hsw_unclaimed_reg_debug] *ERROR* Last registers written:
> [  658.663115] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00002030
> [  658.663116] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00022030
> [  658.663117] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00002030
> [  658.663118] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00002030
> [  658.663119] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00000000
> [  658.663120] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00000000
> [  658.663121] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00000000
> [  658.663122] [drm:hsw_unclaimed_reg_debug] *ERROR*    0x00000000
>
> The same data is obtainable with using the various perf/ftrace hooks, but that
> is a bit larger of a hammer. It's hard to tell how useful this is/would be.
>
> Cc: Paulo Zanoni <przanoni@gmail.com>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h     | 11 +++++++++--
>  drivers/gpu/drm/i915/intel_uncore.c | 11 ++++++++++-
>  2 files changed, 19 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c33327d..6fa22db 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2882,10 +2882,17 @@ int vlv_freq_opcode(struct drm_i915_private *dev_priv, int val);
>  #define I915_READ16_NOTRACE(reg)       dev_priv->uncore.funcs.mmio_readw(dev_priv, (reg), false)
>  #define I915_WRITE16_NOTRACE(reg, val) dev_priv->uncore.funcs.mmio_writew(dev_priv, (reg), (val), false)
>
> +DECLARE_PER_CPU(uint32_t, i915_last_written);
>  #define I915_READ(reg)         dev_priv->uncore.funcs.mmio_readl(dev_priv, (reg), true)
> -#define I915_WRITE(reg, val)   dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), true)
> +#define I915_WRITE(reg, val)   do {                                            \
> +       dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), true);       \
> +       raw_cpu_write(i915_last_written, reg);                                  \
> +} while (0)
>  #define I915_READ_NOTRACE(reg)         dev_priv->uncore.funcs.mmio_readl(dev_priv, (reg), false)
> -#define I915_WRITE_NOTRACE(reg, val)   dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), false)
> +#define I915_WRITE_NOTRACE(reg, val)   do {                                    \
> +       dev_priv->uncore.funcs.mmio_writel(dev_priv, (reg), (val), false);      \
> +       raw_cpu_write(i915_last_written, reg);                                  \
> +} while (0)
>
>  /* Be very careful with read/write 64-bit values. On 32-bit machines, they
>   * will be implemented using 2 32-bit writes in an arbitrary order with
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index b29091b..001ce55 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -33,7 +33,10 @@
>  #define __raw_i915_write16(dev_priv__, reg__, val__) writew(val__, (dev_priv__)->regs + (reg__))
>
>  #define __raw_i915_read32(dev_priv__, reg__) readl((dev_priv__)->regs + (reg__))
> -#define __raw_i915_write32(dev_priv__, reg__, val__) writel(val__, (dev_priv__)->regs + (reg__))
> +#define __raw_i915_write32(dev_priv__, reg__, val__) do { \
> +       writel(val__, (dev_priv__)->regs + (reg__));      \
> +       raw_cpu_write(i915_last_written, reg__);          \
> +} while (0)
>
>  #define __raw_i915_read64(dev_priv__, reg__) readq((dev_priv__)->regs + (reg__))
>  #define __raw_i915_write64(dev_priv__, reg__, val__) writeq(val__, (dev_priv__)->regs + (reg__))
> @@ -513,6 +516,7 @@ ilk_dummy_write(struct drm_i915_private *dev_priv)
>         __raw_i915_write32(dev_priv, MI_MODE, 0);
>  }
>
> +DEFINE_PER_CPU(uint32_t, i915_last_written);
>  static void
>  hsw_unclaimed_reg_debug(struct drm_i915_private *dev_priv, u32 reg, bool read,
>                         bool before)
> @@ -524,8 +528,13 @@ hsw_unclaimed_reg_debug(struct drm_i915_private *dev_priv, u32 reg, bool read,
>                 return;
>
>         if (__raw_i915_read32(dev_priv, FPGA_DBG) & FPGA_DBG_RM_NOCLAIM) {
> +               int cpu;
>                 WARN(1, "Unclaimed register detected %s %s register 0x%x\n",
>                      when, op, reg);
> +               DRM_ERROR("Last registers written:\n");
> +               for_each_possible_cpu(cpu)
> +                       DRM_ERROR("\t0x%08x\n", per_cpu(i915_last_written, cpu));
> +
>                 __raw_i915_write32(dev_priv, FPGA_DBG, FPGA_DBG_RM_NOCLAIM);
>         }
>  }
> --
> 2.3.0
>



-- 
Paulo Zanoni
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] drm/i915: Track last register written (debug)
  2015-02-20 11:55 ` [PATCH 1/2] drm/i915: Track last register written (debug) Paulo Zanoni
@ 2015-02-20 11:58   ` Chris Wilson
  0 siblings, 0 replies; 6+ messages in thread
From: Chris Wilson @ 2015-02-20 11:58 UTC (permalink / raw)
  To: Paulo Zanoni; +Cc: Intel GFX, Ben Widawsky, Ben Widawsky

On Fri, Feb 20, 2015 at 09:55:09AM -0200, Paulo Zanoni wrote:
> 2015-02-20 3:33 GMT-02:00 Ben Widawsky <benjamin.widawsky@intel.com>:
> > Register writes are supposed to be posted, and therefore "fast." In order to try
> > to preserve this behavior, the patch uses percpu variables to make the overhead
> > minimal. The slow data collection is done at error time, and that only occurs if
> > the user specified mmio_debug=1 module parameter.
> >
> > I wrote this patch to try to track down annoying and sporadic errors on my
> > Broadwell laptop. Sometimes it's obvious how you got to this point from the
> > backtrace, but other times it isn't.
> >
> 
> Register reads can also trigger the unclaimed register error. Don't
> you want to also record the last one we read on each CPU too?
> 
> Also, just to remind you since you seem to be debugging something:
> user space programs (such as intel_reg_dumper or anything else) and
> other Kernel modules (like vgacon and others) can also trigger these
> errors, and we can't easily catch these cases.
> 
> Pinging Chris since he always has some ideas about this code.

In this case it is probably better to look at the automatic oneshot
mmio_debug patch.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] drm/i915: Add the last written reg to error state
  2015-02-20  5:33 ` [PATCH 2/2] drm/i915: Add the last written reg to error state Ben Widawsky
@ 2015-02-20 12:04   ` Chris Wilson
  2015-02-20 13:29   ` shuang.he
  1 sibling, 0 replies; 6+ messages in thread
From: Chris Wilson @ 2015-02-20 12:04 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Intel GFX, Ben Widawsky

At first glance, this is interesting. On reflection I think I still only
care about the register values at the time of the error. Otherwise, I
would rather have the last few registers written - and only those of the
interesting set as defined by the error state - their values and when.
We would also then require the timestamp of batch execution to marry with
the register updates if we suspect them of causing GPU hangs.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 2/2] drm/i915: Add the last written reg to error state
  2015-02-20  5:33 ` [PATCH 2/2] drm/i915: Add the last written reg to error state Ben Widawsky
  2015-02-20 12:04   ` Chris Wilson
@ 2015-02-20 13:29   ` shuang.he
  1 sibling, 0 replies; 6+ messages in thread
From: shuang.he @ 2015-02-20 13:29 UTC (permalink / raw)
  To: shuang.he, ethan.gao, intel-gfx, benjamin.widawsky

Tested-By: PRC QA PRTS (Patch Regression Test System Contact: shuang.he@intel.com)
Task id: 5799
-------------------------------------Summary-------------------------------------
Platform          Delta          drm-intel-nightly          Series Applied
PNV                 -2              277/277              275/277
ILK                                  313/313              313/313
SNB                 -1              309/309              308/309
IVB                                  382/382              382/382
BYT                                  296/296              296/296
HSW                 -1              425/425              424/425
BDW                 -1              318/318              317/318
-------------------------------------Detailed-------------------------------------
Platform  Test                                drm-intel-nightly          Series Applied
 PNV  igt_gem_userptr_blits_coherency-sync      NO_RESULT(1)CRASH(5)NRUN(1)PASS(6)      CRASH(2)
 PNV  igt_gem_userptr_blits_coherency-unsync      CRASH(3)NRUN(1)PASS(4)      CRASH(2)
*SNB  igt_kms_plane_plane-panning-top-left-pipe-B-plane-2      PASS(2)      TIMEOUT(1)PASS(1)
*HSW  igt_gem_storedw_batches_loop_secure-dispatch      PASS(2)      DMESG_WARN(1)PASS(1)
*BDW  igt_gem_gtt_hog      PASS(17)      DMESG_WARN(1)PASS(1)
Note: You need to pay more attention to line start with '*'
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2015-02-20 13:29 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-20  5:33 [PATCH 1/2] drm/i915: Track last register written (debug) Ben Widawsky
2015-02-20  5:33 ` [PATCH 2/2] drm/i915: Add the last written reg to error state Ben Widawsky
2015-02-20 12:04   ` Chris Wilson
2015-02-20 13:29   ` shuang.he
2015-02-20 11:55 ` [PATCH 1/2] drm/i915: Track last register written (debug) Paulo Zanoni
2015-02-20 11:58   ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.