All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Subject: [PATCH 2/2] drm/i915: Allow disabling error capture
Date: Tue, 11 Oct 2016 14:32:42 +0100	[thread overview]
Message-ID: <20161011133242.26487-2-chris@chris-wilson.co.uk> (raw)
In-Reply-To: <20161011133242.26487-1-chris@chris-wilson.co.uk>

We currently capture the GPU state after we detect a hang. This is vital
for us to both triage and debug hangs in the wild (post-mortem
debugging). However, it comes at the cost of running some potentially
dangerous code (since it has to make very few assumption about the state
of the driver) that is quite resource intensive.

This patch introduces both a method to disable error capture at runtime
(for users who hit bugs at runtime and need a workaround) and to disable
error capture at compiletime (for realtime users who want to minimise
any possible latency, and never require error capture, saving ~30k of
code). The cost is that we know have to be wary of (and test!) a kconfig
flag and a module parameter. The effect of the module parameter is easy
to verify through code inspection and runtime testing, but a kconfig flag
needs regular compile checking.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@linux.intel.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch
---
 drivers/gpu/drm/i915/Kconfig          | 13 +++++++++++++
 drivers/gpu/drm/i915/Makefile         |  4 +++-
 drivers/gpu/drm/i915/i915_debugfs.c   |  6 ++++++
 drivers/gpu/drm/i915/i915_drv.h       | 16 ++++++++++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c |  3 +++
 drivers/gpu/drm/i915/i915_params.c    |  9 +++++++++
 drivers/gpu/drm/i915/i915_params.h    |  1 +
 drivers/gpu/drm/i915/i915_sysfs.c     | 25 ++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_display.c  |  4 ++++
 drivers/gpu/drm/i915/intel_overlay.c  |  4 ++++
 10 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e469118f..8844b99bd760 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -46,6 +46,19 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
 
 	  If in doubt, say "N".
 
+config DRM_I915_CAPTURE_ERROR
+	bool "Enable capturing GPU state following a hang"
+	depends on DRM_I915
+	default y
+	help
+	  This option enables capturing the GPU state when a hang is detected.
+	  This information is vital for triaging hangs and assists in debugging.
+	  Please report any hang to
+            https://bugs.freedesktop.org/enter_bug.cgi?product=DRI
+	  for triaging.
+
+	  If in doubt, say "Y".
+
 config DRM_I915_USERPTR
 	bool "Always enable userptr support"
 	depends on DRM_I915
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a998c2bce70a..8790ae4fb171 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -42,7 +42,6 @@ i915-y += i915_cmd_parser.o \
 	  i915_gem_stolen.o \
 	  i915_gem_tiling.o \
 	  i915_gem_userptr.o \
-	  i915_gpu_error.o \
 	  i915_trace_points.o \
 	  intel_breadcrumbs.o \
 	  intel_engine_cs.o \
@@ -107,6 +106,9 @@ i915-y += dvo_ch7017.o \
 	  intel_sdvo.o \
 	  intel_tv.o
 
+# Post-mortem debug and GPU hang state capture
+i915-$(CONFIG_DRM_I915_CAPTURE_ERROR) += i915_gpu_error.o
+
 # virtual gpu code
 i915-y += i915_vgpu.o
 
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index f6762e00f872..358663e833d6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -960,6 +960,8 @@ static int i915_hws_info(struct seq_file *m, void *data)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
 static ssize_t
 i915_error_state_write(struct file *filp,
 		       const char __user *ubuf,
@@ -1042,6 +1044,8 @@ static const struct file_operations i915_error_state_fops = {
 	.release = i915_error_state_release,
 };
 
+#endif
+
 static int
 i915_next_seqno_get(void *data, u64 *val)
 {
@@ -5398,7 +5402,9 @@ static const struct i915_debugfs_files {
 	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
 	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 	{"i915_error_state", &i915_error_state_fops},
+#endif
 	{"i915_next_seqno", &i915_next_seqno_fops},
 	{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
 	{"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4553a5372008..380590b30bbf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3544,6 +3544,8 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {}
 #endif
 
 /* i915_gpu_error.c */
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
@@ -3564,6 +3566,20 @@ void i915_error_state_get(struct drm_device *dev,
 void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
 void i915_destroy_error_state(struct drm_device *dev);
 
+#else
+
+static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
+					    u32 engine_mask,
+					    const char *error_msg)
+{
+}
+
+static inline void i915_destroy_error_state(struct drm_device *dev)
+{
+}
+
+#endif
+
 const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
 
 /* i915_cmd_parser.c */
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 04205c82f0c9..c88c0d192a60 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1464,6 +1464,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	struct drm_i915_error_state *error;
 	unsigned long flags;
 
+	if (!i915.error_capture)
+		return;
+
 	if (READ_ONCE(dev_priv->gpu_error.first_error))
 		return;
 
diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
index 768ad89d9cd4..629e4334719c 100644
--- a/drivers/gpu/drm/i915/i915_params.c
+++ b/drivers/gpu/drm/i915/i915_params.c
@@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = {
 	.load_detect_test = 0,
 	.force_reset_modeset_test = 0,
 	.reset = true,
+	.error_capture = true,
 	.invert_brightness = 0,
 	.disable_display = 0,
 	.enable_cmd_parser = 1,
@@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
 module_param_named_unsafe(reset, i915.reset, bool, 0600);
 MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
 
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+module_param_named(error_capture, i915.error_capture, bool, 0600);
+MODULE_PARM_DESC(error_capture,
+	"Record the GPU state following a hang. "
+	"This information in /sys/class/drm/card<N>/error is vital for "
+	"triaging and debugging hangs.");
+#endif
+
 module_param_named_unsafe(enable_hangcheck, i915.enable_hangcheck, bool, 0644);
 MODULE_PARM_DESC(enable_hangcheck,
 	"Periodically check GPU activity for detecting hangs. "
diff --git a/drivers/gpu/drm/i915/i915_params.h b/drivers/gpu/drm/i915/i915_params.h
index 3a0dd78ddb38..94efc899c1ef 100644
--- a/drivers/gpu/drm/i915/i915_params.h
+++ b/drivers/gpu/drm/i915/i915_params.h
@@ -59,6 +59,7 @@ struct i915_params {
 	bool load_detect_test;
 	bool force_reset_modeset_test;
 	bool reset;
+	bool error_capture;
 	bool disable_display;
 	bool verbose_state_checks;
 	bool nuclear_pageflip;
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 1012eeea1324..b626c58db113 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -514,6 +514,8 @@ static const struct attribute *vlv_attrs[] = {
 	NULL,
 };
 
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
 static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
 				struct bin_attribute *attr, char *buf,
 				loff_t off, size_t count)
@@ -571,6 +573,21 @@ static struct bin_attribute error_state_attr = {
 	.write = error_state_write,
 };
 
+static void i915_setup_error_capture(struct device *kdev)
+{
+	if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
+		DRM_ERROR("error_state sysfs setup failed\n");
+}
+
+static void i915_remove_error_capture(struct device *kdev)
+{
+	sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+}
+#else
+static void i915_setup_error_capture(struct device *kdev) {}
+static void i915_teardown_error_capture(struct device *kdev) {}
+#endif
+
 void i915_setup_sysfs(struct drm_i915_private *dev_priv)
 {
 	struct device *kdev = dev_priv->drm.primary->kdev;
@@ -617,17 +634,15 @@ void i915_setup_sysfs(struct drm_i915_private *dev_priv)
 	if (ret)
 		DRM_ERROR("RPS sysfs setup failed\n");
 
-	ret = sysfs_create_bin_file(&kdev->kobj,
-				    &error_state_attr);
-	if (ret)
-		DRM_ERROR("error_state sysfs setup failed\n");
+	i915_setup_error_capture(kdev);
 }
 
 void i915_teardown_sysfs(struct drm_i915_private *dev_priv)
 {
 	struct device *kdev = dev_priv->drm.primary->kdev;
 
-	sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+	i915_teardown_error_capture(kdev);
+
 	if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv))
 		sysfs_remove_files(&kdev->kobj, vlv_attrs);
 	else
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 23a6c7213eca..70234caacf39 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -17099,6 +17099,8 @@ int intel_modeset_vga_set_state(struct drm_device *dev, bool state)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
 struct intel_display_error_state {
 
 	u32 power_well_driver;
@@ -17281,3 +17283,5 @@ intel_display_print_error_state(struct drm_i915_error_state_buf *m,
 		err_printf(m, "  VSYNC: %08x\n", error->transcoder[i].vsync);
 	}
 }
+
+#endif
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a24bc8c7889f..8c411bfc3b3f 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -1470,6 +1470,8 @@ void intel_cleanup_overlay(struct drm_i915_private *dev_priv)
 	kfree(dev_priv->overlay);
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
 struct intel_overlay_error_state {
 	struct overlay_registers regs;
 	unsigned long base;
@@ -1587,3 +1589,5 @@ intel_overlay_print_error_state(struct drm_i915_error_state_buf *m,
 	P(UVSCALEV);
 #undef P
 }
+
+#endif
-- 
2.9.3

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2016-10-11 13:32 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-11  9:27 [CI 1/5] drm/i915: Allow disabling error capture Chris Wilson
2016-10-11  9:27 ` [CI 2/5] drm/i915: Stop the machine whilst capturing the GPU crash dump Chris Wilson
2016-10-11  9:27 ` [CI 3/5] drm/i915: Always use the GTT for error capture Chris Wilson
2016-10-11  9:27 ` [CI 4/5] drm/i915: Consolidate error object printing Chris Wilson
2016-10-11  9:27 ` [CI 5/5] drm/i915: Compress GPU objects in error state Chris Wilson
2016-10-11  9:52 ` [CI 1/5] drm/i915: Allow disabling error capture Jani Nikula
2016-10-11  9:57   ` Chris Wilson
2016-10-11 10:16     ` Jani Nikula
2016-10-11 10:28       ` Chris Wilson
2016-10-11 10:56         ` Jani Nikula
2016-10-11 12:13           ` Daniel Vetter
2016-10-11 13:32             ` [PATCH 1/2] drm/i915: Move common code out of i915_gpu_error.c Chris Wilson
2016-10-11 13:32               ` Chris Wilson [this message]
2016-10-13  9:16                 ` [PATCH 2/2] drm/i915: Allow disabling error capture Jani Nikula
2016-10-13 10:01                   ` Chris Wilson
2016-10-12  8:51               ` [PATCH 1/2] drm/i915: Move common code out of i915_gpu_error.c Joonas Lahtinen
2016-10-11 11:20 ` ✗ Fi.CI.BAT: warning for series starting with [CI,1/5] drm/i915: Allow disabling error capture Patchwork
2018-10-26 13:02 ` ✗ Fi.CI.BAT: failure for series starting with [2/2] drm/i915: Allow disabling error capture (rev3) Patchwork
2018-10-26 13:03 ` Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20161011133242.26487-2-chris@chris-wilson.co.uk \
    --to=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.