All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Subject: [PATCH 06/33] drm/i915: Stop the machine whilst capturing the GPU crash dump
Date: Sun,  7 Aug 2016 15:45:14 +0100	[thread overview]
Message-ID: <1470581141-14432-7-git-send-email-chris@chris-wilson.co.uk> (raw)
In-Reply-To: <1470581141-14432-1-git-send-email-chris@chris-wilson.co.uk>

The error state is purposefully racy as we expect it to be called at any
time and so have avoided any locking whilst capturing the crash dump.
However, with multi-engine GPUs and multiple CPUs, those races can
manifest into OOPSes as we attempt to chase dangling pointers freed on
other CPUs. Under discussion are lots of ways to slow down normal
operation in order to protect the post-mortem error capture, but what if
we take the opposite approach and freeze the machine whilst the error
capture runs (note the GPU may still be running, but as long as we don't
process any of the results the driver's bookkeeping will be static)?

Note that by itself, this is not a complete fix. It also depends on
the compiler barriers in list_add/list_del to prevent traversing the
lists into the void. We also depend on only requiring state from
carefully controlled sources - i.e. all the state we require for
post-mortem debugging should be reachable from the request itself so
that we only have to worry about retrieving the request carefully. Once
we have the request, we know that all pointers from it are intact.

v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
stop_machine() itself for its wbinvd fallback.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Kconfig          |  1 +
 drivers/gpu/drm/i915/i915_drv.h       |  2 ++
 drivers/gpu/drm/i915/i915_gpu_error.c | 48 +++++++++++++++++++++--------------
 3 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index 7769e469118f..7badcee88ebf 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -4,6 +4,7 @@ config DRM_I915
 	depends on X86 && PCI
 	select INTEL_GTT
 	select INTERVAL_TREE
+	select STOP_MACHINE
 	# we need shmfs for the swappable backing store, and in particular
 	# the shmem_readpage() which depends upon tmpfs
 	select SHMEM
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 15c41158b4cf..826486d03e8e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -481,6 +481,8 @@ struct drm_i915_error_state {
 	struct kref ref;
 	struct timeval time;
 
+	struct drm_i915_private *i915;
+
 	char error_msg[128];
 	bool simulated;
 	int iommu;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index ced296983caa..b94a59733cf8 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -28,6 +28,7 @@
  */
 
 #include <generated/utsrelease.h>
+#include <linux/stop_machine.h>
 #include "i915_drv.h"
 
 static const char *engine_str(int engine)
@@ -684,14 +685,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
 
 	dst->page_count = num_pages;
 	while (num_pages--) {
-		unsigned long flags;
 		void *d;
 
 		d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
 		if (d == NULL)
 			goto unwind;
 
-		local_irq_save(flags);
 		if (use_ggtt) {
 			void __iomem *s;
 
@@ -710,15 +709,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
 
 			page = i915_gem_object_get_page(src, i);
 
-			drm_clflush_pages(&page, 1);
-
 			s = kmap_atomic(page);
 			memcpy(d, s, PAGE_SIZE);
 			kunmap_atomic(s);
-
-			drm_clflush_pages(&page, 1);
 		}
-		local_irq_restore(flags);
 
 		dst->pages[i++] = d;
 		reloc_offset += PAGE_SIZE;
@@ -1371,6 +1365,32 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
 	error->suspend_count = dev_priv->suspend_count;
 }
 
+static int capture(void *data)
+{
+	struct drm_i915_error_state *error = data;
+
+	/* Ensure that what we readback from memory matches what the GPU sees */
+	wbinvd();
+
+	i915_capture_gen_state(error->i915, error);
+	i915_capture_reg_state(error->i915, error);
+	i915_gem_record_fences(error->i915, error);
+	i915_gem_record_rings(error->i915, error);
+
+	i915_capture_active_buffers(error->i915, error);
+	i915_capture_pinned_buffers(error->i915, error);
+
+	do_gettimeofday(&error->time);
+
+	error->overlay = intel_overlay_capture_error_state(error->i915);
+	error->display = intel_display_capture_error_state(error->i915);
+
+	/* And make sure we don't leave trash in the CPU cache */
+	wbinvd();
+
+	return 0;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1399,19 +1419,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
 	}
 
 	kref_init(&error->ref);
+	error->i915 = dev_priv;
 
-	i915_capture_gen_state(dev_priv, error);
-	i915_capture_reg_state(dev_priv, error);
-	i915_gem_record_fences(dev_priv, error);
-	i915_gem_record_rings(dev_priv, error);
-
-	i915_capture_active_buffers(dev_priv, error);
-	i915_capture_pinned_buffers(dev_priv, error);
-
-	do_gettimeofday(&error->time);
-
-	error->overlay = intel_overlay_capture_error_state(dev_priv);
-	error->display = intel_display_capture_error_state(dev_priv);
+	stop_machine(capture, error, NULL);
 
 	i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
 	DRM_INFO("%s\n", error->error_msg);
-- 
2.8.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  parent reply	other threads:[~2016-08-07 14:45 UTC|newest]

Thread overview: 125+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-08-07 14:45 First class VMA, take 2 Chris Wilson
2016-08-07 14:45 ` [PATCH 01/33] drm/i915: Add smp_rmb() to busy ioctl's RCU dance Chris Wilson
2016-08-08  9:12   ` Daniel Vetter
2016-08-08  9:30     ` Chris Wilson
2016-08-08  9:45       ` Chris Wilson
2016-08-09  6:36         ` Joonas Lahtinen
2016-08-09  7:14           ` Chris Wilson
2016-08-09  8:48             ` Joonas Lahtinen
2016-08-09  9:05               ` Chris Wilson
2016-08-10 10:12                 ` Daniel Vetter
2016-08-10 10:13                   ` Daniel Vetter
2016-08-10 11:00                     ` Joonas Lahtinen
2016-08-12  9:50                       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 02/33] drm/i915: Do not overwrite the request with zero on reallocation Chris Wilson
2016-08-08  9:25   ` Daniel Vetter
2016-08-08  9:56     ` Chris Wilson
2016-08-09  6:32       ` Daniel Vetter
2016-08-07 14:45 ` [PATCH 03/33] drm/i915: Move missed interrupt detection from hangcheck to breadcrumbs Chris Wilson
2016-08-09 14:08   ` [PATCH v2] " Chris Wilson
2016-08-09 14:10   ` [PATCH v3] " Chris Wilson
2016-08-09 15:24     ` Mika Kuoppala
2016-08-07 14:45 ` [PATCH 04/33] drm/i915: Use RCU to annotate and enforce protection for breadcrumb's bh Chris Wilson
2016-08-08  9:33   ` Daniel Vetter
2016-08-12  9:56   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 05/33] drm/i915: Reduce amount of duplicate buffer information captured on error Chris Wilson
2016-08-10  7:04   ` Joonas Lahtinen
2016-08-10  7:15     ` Chris Wilson
2016-08-10  8:07       ` Joonas Lahtinen
2016-08-10  8:36         ` Chris Wilson
2016-08-10 10:51           ` Joonas Lahtinen
2016-08-07 14:45 ` Chris Wilson [this message]
2016-08-07 14:45 ` [PATCH 07/33] drm/i915: Store the active context object on all engines upon error Chris Wilson
2016-08-09  9:02   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 08/33] drm/i915: Move setting of request->batch into its single callsite Chris Wilson
2016-08-09 15:53   ` Mika Kuoppala
2016-08-09 16:04     ` Chris Wilson
2016-08-10  7:19   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 09/33] drm/i915: Mark unmappable GGTT entries as PIN_HIGH Chris Wilson
2016-08-08  9:09   ` Joonas Lahtinen
2016-08-09 11:05   ` Tvrtko Ursulin
2016-08-09 11:13     ` Chris Wilson
2016-08-09 11:20       ` Chris Wilson
2016-08-07 14:45 ` [PATCH 10/33] drm/i915: Remove inactive/active list from debugfs Chris Wilson
2016-08-09 10:29   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 11/33] drm/i915: Focus debugfs/i915_gem_pinned to show only display pins Chris Wilson
2016-08-09 10:39   ` Joonas Lahtinen
2016-08-09 10:46     ` Chris Wilson
2016-08-09 11:32       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 12/33] drm/i915: Reduce i915_gem_objects to only show object information Chris Wilson
2016-08-10  7:29   ` Joonas Lahtinen
2016-08-10  7:38     ` Chris Wilson
2016-08-10  8:10       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 13/33] drm/i915: Remove redundant WARN_ON from __i915_add_request() Chris Wilson
2016-08-08  9:03   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 14/33] drm/i915: Create a VMA for an object Chris Wilson
2016-08-08  9:01   ` Joonas Lahtinen
2016-08-08  9:09     ` Chris Wilson
2016-08-10 10:58       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 15/33] drm/i915: Track pinned vma inside guc Chris Wilson
2016-08-11 16:19   ` Dave Gordon
2016-08-11 16:41     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 16/33] drm/i915: Convert fence computations to use vma directly Chris Wilson
2016-08-09 10:27   ` Joonas Lahtinen
2016-08-09 10:33     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 17/33] drm/i915: Use VMA directly for checking tiling parameters Chris Wilson
2016-08-09  6:18   ` Joonas Lahtinen
2016-08-09  8:03     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 18/33] drm/i915: Use VMA as the primary object for context state Chris Wilson
2016-08-10  8:03   ` Joonas Lahtinen
2016-08-10  8:25     ` Chris Wilson
2016-08-10 10:54       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 19/33] drm/i915: Only clflush the context object when binding Chris Wilson
2016-08-10  8:41   ` Joonas Lahtinen
2016-08-10  9:02     ` Chris Wilson
2016-08-10 10:50       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 20/33] drm/i915: Use VMA for ringbuffer tracking Chris Wilson
2016-08-11  9:32   ` Joonas Lahtinen
2016-08-11  9:58     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 21/33] drm/i915: Use VMA for scratch page tracking Chris Wilson
2016-08-08  8:00   ` [PATCH 1/3] " Chris Wilson
2016-08-08  8:00     ` [PATCH 2/3] drm/i915: Move common scratch allocation/destroy to intel_engine_cs.c Chris Wilson
2016-08-08  9:24       ` Matthew Auld
2016-08-08  8:00     ` [PATCH 3/3] drm/i915: Move common seqno reset " Chris Wilson
2016-08-08  9:40       ` Matthew Auld
2016-08-08 10:15         ` Chris Wilson
2016-08-08 15:34           ` Matthew Auld
2016-08-11 10:06   ` [PATCH 21/33] drm/i915: Use VMA for scratch page tracking Joonas Lahtinen
2016-08-11 10:22     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 22/33] drm/i915/overlay: Use VMA as the primary tracker for images Chris Wilson
2016-08-11 10:17   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 23/33] drm/i915: Use VMA as the primary tracker for semaphore page Chris Wilson
2016-08-11 10:42   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 24/33] drm/i915: Use VMA for render state page tracking Chris Wilson
2016-08-11 10:46   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 25/33] drm/i915: Use VMA for wa_ctx tracking Chris Wilson
2016-08-11 10:53   ` Joonas Lahtinen
2016-08-11 11:02     ` Chris Wilson
2016-08-11 12:41       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 26/33] drm/i915: Track pinned VMA Chris Wilson
2016-08-11 12:18   ` Joonas Lahtinen
2016-08-11 12:37     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 27/33] drm/i915: Print the batchbuffer offset next to BBADDR in error state Chris Wilson
2016-08-11 12:24   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 28/33] drm/i915: Move per-request pid from request to ctx Chris Wilson
2016-08-11 12:32   ` Joonas Lahtinen
2016-08-11 12:41     ` Chris Wilson
2016-08-07 14:45 ` [PATCH 29/33] drm/i915: Only record active and pending requests upon a GPU hang Chris Wilson
2016-08-11 12:36   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 30/33] drm/i915: Record the RING_MODE register for post-mortem debugging Chris Wilson
2016-08-08 11:35   ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 31/33] drm/i915: Always use the GTT for error capture Chris Wilson
2016-08-07 14:45 ` [PATCH 32/33] drm/i915: Consolidate error object printing Chris Wilson
2016-08-09 11:44   ` Joonas Lahtinen
2016-08-09 11:53     ` Chris Wilson
2016-08-10 10:55       ` Joonas Lahtinen
2016-08-07 14:45 ` [PATCH 33/33] drm/i915: Compress GPU objects in error state Chris Wilson
2016-08-10 10:32   ` Joonas Lahtinen
2016-08-10 10:52     ` Chris Wilson
2016-08-10 11:26       ` Joonas Lahtinen
2016-08-07 15:16 ` ✗ Ro.CI.BAT: failure for series starting with [01/33] drm/i915: Add smp_rmb() to busy ioctl's RCU dance Patchwork
2016-08-08  9:46 ` ✗ Ro.CI.BAT: failure for series starting with [01/33] drm/i915: Add smp_rmb() to busy ioctl's RCU dance (rev4) Patchwork
2016-08-08 10:34 ` ✗ Fi.CI.BAT: " Patchwork
2016-08-09 14:10 ` ✗ Ro.CI.BAT: failure for series starting with [01/33] drm/i915: Add smp_rmb() to busy ioctl's RCU dance (rev5) Patchwork
2016-08-09 14:20 ` ✗ Ro.CI.BAT: failure for series starting with [01/33] drm/i915: Add smp_rmb() to busy ioctl's RCU dance (rev6) Patchwork
2016-08-10  6:43 ` Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1470581141-14432-7-git-send-email-chris@chris-wilson.co.uk \
    --to=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.