All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jani Nikula <jani.nikula@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>, intel-gfx@lists.freedesktop.org
Subject: Re: [CI 1/5] drm/i915: Allow disabling error capture
Date: Tue, 11 Oct 2016 13:56:48 +0300	[thread overview]
Message-ID: <877f9fqhlb.fsf@intel.com> (raw)
In-Reply-To: <20161011102857.GH16122@nuc-i3427.alporthouse.com>

On Tue, 11 Oct 2016, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Tue, Oct 11, 2016 at 01:16:42PM +0300, Jani Nikula wrote:
>> On Tue, 11 Oct 2016, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> > On Tue, Oct 11, 2016 at 12:52:00PM +0300, Jani Nikula wrote:
>> >> On Tue, 11 Oct 2016, Chris Wilson <chris@chris-wilson.co.uk> wrote:
>> >> > We currently capture the GPU state after we detect a hang. This is vital
>> >> > for us to both triage and debug hangs in the wild (post-mortem
>> >> > debugging). However, it comes at the cost of running some potentially
>> >> > dangerous code (since it has to make very few assumption about the state
>> >> > of the driver) that is quite resource intensive.
>> >> >
>> >> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> >> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
>> >> > ---
>> >> >  drivers/gpu/drm/i915/Kconfig          | 10 ++++++++++
>> >> >  drivers/gpu/drm/i915/i915_debugfs.c   |  6 ++++++
>> >> >  drivers/gpu/drm/i915/i915_drv.h       | 16 ++++++++++++++++
>> >> >  drivers/gpu/drm/i915/i915_gpu_error.c |  7 +++++++
>> >> >  drivers/gpu/drm/i915/i915_params.c    |  9 +++++++++
>> >> >  drivers/gpu/drm/i915/i915_params.h    |  1 +
>> >> >  drivers/gpu/drm/i915/i915_sysfs.c     |  8 ++++++++
>> >> >  drivers/gpu/drm/i915/intel_display.c  |  4 ++++
>> >> >  drivers/gpu/drm/i915/intel_overlay.c  |  4 ++++
>> >> >  9 files changed, 65 insertions(+)
>> >> >
>> >> > diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
>> >> > index 7769e469118f..10a6ac11b6a9 100644
>> >> > --- a/drivers/gpu/drm/i915/Kconfig
>> >> > +++ b/drivers/gpu/drm/i915/Kconfig
>> >> > @@ -46,6 +46,16 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
>> >> >  
>> >> >  	  If in doubt, say "N".
>> >> >  
>> >> > +config DRM_I915_CAPTURE_ERROR
>> >> > +	bool "Enable capturing GPU state following a hang"
>> >> > +	depends on DRM_I915
>> >> > +	default y
>> >> > +	help
>> >> > +	  This option enables capturing the GPU state when a hang is detected.
>> >> > +	  This information is vital for triaging hangs and assists in debugging.
>> >> > +
>> >> > +	  If in doubt, say "Y".
>> >> > +
>> >> >  config DRM_I915_USERPTR
>> >> >  	bool "Always enable userptr support"
>> >> >  	depends on DRM_I915
>> >> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> >> > index 20689f1cd719..e4b5ba771bea 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> >> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> >> > @@ -960,6 +960,8 @@ static int i915_hws_info(struct seq_file *m, void *data)
>> >> >  	return 0;
>> >> >  }
>> >> >  
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +
>> >> >  static ssize_t
>> >> >  i915_error_state_write(struct file *filp,
>> >> >  		       const char __user *ubuf,
>> >> > @@ -1042,6 +1044,8 @@ static const struct file_operations i915_error_state_fops = {
>> >> >  	.release = i915_error_state_release,
>> >> >  };
>> >> >  
>> >> > +#endif
>> >> > +
>> >> >  static int
>> >> >  i915_next_seqno_get(void *data, u64 *val)
>> >> >  {
>> >> > @@ -5398,7 +5402,9 @@ static const struct i915_debugfs_files {
>> >> >  	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
>> >> >  	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
>> >> >  	{"i915_gem_drop_caches", &i915_drop_caches_fops},
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> >  	{"i915_error_state", &i915_error_state_fops},
>> >> > +#endif
>> >> >  	{"i915_next_seqno", &i915_next_seqno_fops},
>> >> >  	{"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
>> >> >  	{"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
>> >> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> >> > index 54d860e1c0fc..4570c4fa0287 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> >> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> >> > @@ -3544,6 +3544,8 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {}
>> >> >  #endif
>> >> >  
>> >> >  /* i915_gpu_error.c */
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +
>> >> >  __printf(2, 3)
>> >> >  void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
>> >> >  int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
>> >> > @@ -3564,6 +3566,20 @@ void i915_error_state_get(struct drm_device *dev,
>> >> >  void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
>> >> >  void i915_destroy_error_state(struct drm_device *dev);
>> >> >  
>> >> > +#else
>> >> > +
>> >> > +static inline void i915_capture_error_state(struct drm_i915_private *dev_priv,
>> >> > +					    u32 engine_mask,
>> >> > +					    const char *error_msg)
>> >> > +{
>> >> > +}
>> >> > +
>> >> > +static inline void i915_destroy_error_state(struct drm_device *dev)
>> >> > +{
>> >> > +}
>> >> > +
>> >> > +#endif
>> >> > +
>> >> >  void i915_get_engine_instdone(struct drm_i915_private *dev_priv,
>> >> >  			      enum intel_engine_id engine_id,
>> >> >  			      struct intel_instdone *instdone);
>> >> > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
>> >> > index b5b58692ac5a..9b395ffa3b6a 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> >> > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> >> > @@ -30,6 +30,8 @@
>> >> >  #include <generated/utsrelease.h>
>> >> >  #include "i915_drv.h"
>> >> >  
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +
>> >> >  static const char *engine_str(int engine)
>> >> >  {
>> >> >  	switch (engine) {
>> >> > @@ -1464,6 +1466,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
>> >> >  	struct drm_i915_error_state *error;
>> >> >  	unsigned long flags;
>> >> >  
>> >> > +	if (!i915.error_capture)
>> >> > +		return;
>> >> > +
>> >> >  	if (READ_ONCE(dev_priv->gpu_error.first_error))
>> >> >  		return;
>> >> >  
>> >> > @@ -1549,6 +1554,8 @@ void i915_destroy_error_state(struct drm_device *dev)
>> >> >  		kref_put(&error->ref, i915_error_state_free);
>> >> >  }
>> >> >  
>> >> > +#endif
>> >> > +
>> >> >  const char *i915_cache_level_str(struct drm_i915_private *i915, int type)
>> >> >  {
>> >> >  	switch (type) {
>> >> > diff --git a/drivers/gpu/drm/i915/i915_params.c b/drivers/gpu/drm/i915/i915_params.c
>> >> > index 768ad89d9cd4..e72a41223535 100644
>> >> > --- a/drivers/gpu/drm/i915/i915_params.c
>> >> > +++ b/drivers/gpu/drm/i915/i915_params.c
>> >> > @@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = {
>> >> >  	.load_detect_test = 0,
>> >> >  	.force_reset_modeset_test = 0,
>> >> >  	.reset = true,
>> >> > +	.error_capture = true,
>> >> >  	.invert_brightness = 0,
>> >> >  	.disable_display = 0,
>> >> >  	.enable_cmd_parser = 1,
>> >> > @@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
>> >> >  module_param_named_unsafe(reset, i915.reset, bool, 0600);
>> >> >  MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
>> >> >  
>> >> > +#ifdef CONFIG_DRM_I915_CAPTURE_ERROR
>> >> > +module_param_named(error_capture, i915.error_capture, bool, 0600);
>> >> > +MODULE_PARM_DESC(error_capture,
>> >> > +	"Record the GPU state following a hang. "
>> >> > +	"This information in /sys/class/drm/card<N>/error is vital for "
>> >> > +	"triaging and debugging hangs.");
>> >> > +#endif
>> >> 
>> >> The addition of the module parameter is really orthogonal to the config
>> >> option, and should be added separately.
>> >> 
>> >> The question is, what is the target audience of
>> >> CONFIG_DRM_I915_CAPTURE_ERROR=n? Is the added config #ifdeffery worth
>> >> the trouble? It *is* a maintainability burden, because we will only ever
>> >> be building with CONFIG_DRM_I915_CAPTURE_ERROR=y ourselves, and we'll
>> >> screw it up anyway. Does i915.error_capture=0 accomplish the same thing
>> >> (apart from not reducing code size)? Do we really need to be able to
>> >> prevent the root from enabling error capture on the fly?
>> >
>> > What I had in mind was that RT kernels will probably default to turning
>> > off this facility (since it will cause latencies in the hundreds of ms
>> > range).
>> >  
>> >> What would be wrong with just adding the module parameter, and making
>> >> CONFIG_DRM_I915_CAPTURE_ERROR the default value for it? I.e.
>> >> 
>> >> 	.error_capture = IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR),
>> >> 
>> >> in the initialization. No #ifdefs anywhere.
>> >
>> > I wanted to turn it off by default at compilation time.
>> 
>> What I suggest above does that. ;)
>
> It's 30KiB of code that they don't want. I know that's only about 3% of
> the size of the driver, but it's a help.

Fair enough. Please copy-paste some of the elaboration to the commit
message. Ack from me, but it wouldn't hurt to get an ack from Daniel as
well.

BR,
Jani.

-- 
Jani Nikula, Intel Open Source Technology Center
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2016-10-11 10:56 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-11  9:27 [CI 1/5] drm/i915: Allow disabling error capture Chris Wilson
2016-10-11  9:27 ` [CI 2/5] drm/i915: Stop the machine whilst capturing the GPU crash dump Chris Wilson
2016-10-11  9:27 ` [CI 3/5] drm/i915: Always use the GTT for error capture Chris Wilson
2016-10-11  9:27 ` [CI 4/5] drm/i915: Consolidate error object printing Chris Wilson
2016-10-11  9:27 ` [CI 5/5] drm/i915: Compress GPU objects in error state Chris Wilson
2016-10-11  9:52 ` [CI 1/5] drm/i915: Allow disabling error capture Jani Nikula
2016-10-11  9:57   ` Chris Wilson
2016-10-11 10:16     ` Jani Nikula
2016-10-11 10:28       ` Chris Wilson
2016-10-11 10:56         ` Jani Nikula [this message]
2016-10-11 12:13           ` Daniel Vetter
2016-10-11 13:32             ` [PATCH 1/2] drm/i915: Move common code out of i915_gpu_error.c Chris Wilson
2016-10-11 13:32               ` [PATCH 2/2] drm/i915: Allow disabling error capture Chris Wilson
2016-10-13  9:16                 ` Jani Nikula
2016-10-13 10:01                   ` Chris Wilson
2016-10-12  8:51               ` [PATCH 1/2] drm/i915: Move common code out of i915_gpu_error.c Joonas Lahtinen
2016-10-11 11:20 ` ✗ Fi.CI.BAT: warning for series starting with [CI,1/5] drm/i915: Allow disabling error capture Patchwork
2018-10-26 13:02 ` ✗ Fi.CI.BAT: failure for series starting with [2/2] drm/i915: Allow disabling error capture (rev3) Patchwork
2018-10-26 13:03 ` Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=877f9fqhlb.fsf@intel.com \
    --to=jani.nikula@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=daniel.vetter@ffwll.ch \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.