All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse
@ 2018-05-14 20:52 Chris Wilson
  2018-05-14 21:08 ` Chris Wilson
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Chris Wilson @ 2018-05-14 20:52 UTC (permalink / raw)
  To: intel-gfx

We call i915_gem_reset_prepare_engine() during reset and then upon
wedging if the reset fails. Unfortunately, kthread_park and similar do
not support being called recursively and so we must count the number of
times we prepare for reset and only actually prepare on the outermost
layer. (Similarly for finish on unwinding the onion.)

[   87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505 kthread_park+0x55/0x60
[   87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers mei lpc_ich
[   87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G     U            4.17.0-rc5-CI-CI_DRM_4177+ #1
[   87.705620] Hardware name: Dell Inc. XPS 8300  /0Y2MRG, BIOS A06 10/17/2011
[   87.705622] RIP: 0010:kthread_park+0x55/0x60
[   87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202
[   87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX: 0000000000000001
[   87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI: ffff88020e8f0040
[   87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09: 0000000000000000
[   87.705632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8802159376b0
[   87.705634] R13: ffff880215937670 R14: ffff880215930000 R15: ffffffffa01c8d60
[   87.705636] FS:  00007f0c32061980(0000) GS:ffff88022fa80000(0000) knlGS:0000000000000000
[   87.705637] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4: 00000000000606e0
[   87.705641] Call Trace:
[   87.705668]  i915_gem_reset_prepare_engine+0x1d/0xa0 [i915]
[   87.705694]  i915_gem_set_wedged+0x7b/0x1e0 [i915]
[   87.705699]  ? __drm_printfn_info+0x20/0x20
[   87.705722]  i915_reset+0x14a/0x290 [i915]
[   87.705743]  i915_reset_device+0x1fb/0x290 [i915]
[   87.705767]  ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915]
[   87.705772]  ? work_on_cpu_safe+0x50/0x50
[   87.705798]  i915_handle_error+0x207/0x4a0 [i915]
[   87.705810]  ? __might_fault+0x39/0x90
[   87.705835]  i915_wedged_set+0x7f/0xc0 [i915]
[   87.705841]  simple_attr_write+0xb0/0xd0
[   87.705847]  full_proxy_write+0x51/0x80
[   87.705852]  __vfs_write+0x31/0x160
[   87.705857]  ? rcu_read_lock_sched_held+0x6f/0x80
[   87.705860]  ? rcu_sync_lockdep_assert+0x29/0x50
[   87.705862]  ? __sb_start_write+0x152/0x1f0
[   87.705864]  ? __sb_start_write+0x168/0x1f0
[   87.705868]  vfs_write+0xbd/0x1a0
[   87.705872]  ksys_write+0x50/0xc0
[   87.705877]  do_syscall_64+0x55/0x190
[   87.705880]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[   87.705882] RIP: 0033:0x7f0c315df281
[   87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0c315df281
[   87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI: 0000000000000047
[   87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09: 0000000000000034
[   87.705892] R10: 0000000000000000 R11: 0000000000000246 R12: 000055a5e23ebc50
[   87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15: 0000000000000000
[   87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff

References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         | 52 ++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_engine_cs.c  |  1 +
 drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
 3 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0a2070112b66..b169b630bf78 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
 	return active;
 }
 
-/*
- * Ensure irq handler finishes, and not run again.
- * Also return the active request so that we only search for it once.
- */
-struct i915_request *
-i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct i915_request *request = NULL;
+	mutex_lock(&engine->reset_lock);
+	if (engine->reset_depth++)
+		goto unlock;
 
 	/*
 	 * During the reset sequence, we must prevent the engine from
@@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 	if (engine->i915->guc.preempt_wq)
 		flush_workqueue(engine->i915->guc.preempt_wq);
 
+unlock:
+	mutex_unlock(&engine->reset_lock);
+}
+
+static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
+{
+	mutex_lock(&engine->reset_lock);
+
+	GEM_BUG_ON(engine->reset_depth);
+	if (--engine->reset_depth)
+		goto unlock;
+
+	tasklet_enable(&engine->execlists.tasklet);
+	kthread_unpark(engine->breadcrumbs.signaler);
+
+	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+
+unlock:
+	mutex_unlock(&engine->reset_lock);
+}
+
+/*
+ * Ensure irq handler finishes, and not run again.
+ * Also return the active request so that we only search for it once.
+ */
+struct i915_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+{
+	struct i915_request *request = NULL;
+
+	__i915_gem_reset_prepare_engine(engine);
+
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
 
@@ -3265,10 +3294,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv,
 
 void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
 {
-	tasklet_enable(&engine->execlists.tasklet);
-	kthread_unpark(engine->breadcrumbs.signaler);
-
-	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+	__i915_gem_reset_finish_engine(engine);
 }
 
 void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
@@ -3332,7 +3358,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	 * for which we haven't set the fence error to EIO yet).
 	 */
 	for_each_engine(engine, i915, id) {
-		i915_gem_reset_prepare_engine(engine);
+		__i915_gem_reset_prepare_engine(engine);
 
 		engine->submit_request = nop_submit_request;
 		engine->schedule = NULL;
@@ -3380,7 +3406,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 					       intel_engine_last_submit(engine));
 		spin_unlock_irqrestore(&engine->timeline.lock, flags);
 
-		i915_gem_reset_finish_engine(engine);
+		__i915_gem_reset_finish_engine(engine);
 	}
 
 	GEM_TRACE("end\n");
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 6bfd7e3ed152..0de489da514e 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -306,6 +306,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 	/* Nothing to do here, execute in order of dependencies */
 	engine->schedule = NULL;
 
+	mutex_init(&engine->reset_lock);
 	seqlock_init(&engine->stats.lock);
 
 	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 010750e8ee44..ec2b359f3e8b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -566,6 +566,9 @@ struct intel_engine_cs {
 
 	struct intel_engine_hangcheck hangcheck;
 
+	struct mutex reset_lock;
+	unsigned int reset_depth;
+
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
-- 
2.17.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse
  2018-05-14 20:52 [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse Chris Wilson
@ 2018-05-14 21:08 ` Chris Wilson
  2018-05-14 21:58 ` ✗ Fi.CI.CHECKPATCH: warning for " Patchwork
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Chris Wilson @ 2018-05-14 21:08 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-05-14 21:52:13)
> We call i915_gem_reset_prepare_engine() during reset and then upon
> wedging if the reset fails. Unfortunately, kthread_park and similar do
> not support being called recursively and so we must count the number of
> times we prepare for reset and only actually prepare on the outermost
> layer. (Similarly for finish on unwinding the onion.)

On an alternate timeline, I posted patches to remove the call to
kthread_park...
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* ✗ Fi.CI.CHECKPATCH: warning for drm/i915: Allow i915_gem_reset_prepare_engine to recurse
  2018-05-14 20:52 [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse Chris Wilson
  2018-05-14 21:08 ` Chris Wilson
@ 2018-05-14 21:58 ` Patchwork
  2018-05-14 22:08 ` ✗ Fi.CI.BAT: failure " Patchwork
  2018-05-15  8:08 ` [PATCH] " Mika Kuoppala
  3 siblings, 0 replies; 6+ messages in thread
From: Patchwork @ 2018-05-14 21:58 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Allow i915_gem_reset_prepare_engine to recurse
URL   : https://patchwork.freedesktop.org/series/43156/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
7fae27664969 drm/i915: Allow i915_gem_reset_prepare_engine to recurse
-:57: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#57: 
References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")

-:57: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")'
#57: 
References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")

-:173: CHECK:UNCOMMENTED_DEFINITION: struct mutex definition without comment
#173: FILE: drivers/gpu/drm/i915/intel_ringbuffer.h:569:
+	struct mutex reset_lock;

total: 1 errors, 1 warnings, 1 checks, 99 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* ✗ Fi.CI.BAT: failure for drm/i915: Allow i915_gem_reset_prepare_engine to recurse
  2018-05-14 20:52 [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse Chris Wilson
  2018-05-14 21:08 ` Chris Wilson
  2018-05-14 21:58 ` ✗ Fi.CI.CHECKPATCH: warning for " Patchwork
@ 2018-05-14 22:08 ` Patchwork
  2018-05-15  8:08 ` [PATCH] " Mika Kuoppala
  3 siblings, 0 replies; 6+ messages in thread
From: Patchwork @ 2018-05-14 22:08 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Allow i915_gem_reset_prepare_engine to recurse
URL   : https://patchwork.freedesktop.org/series/43156/
State : failure

== Summary ==

= CI Bug Log - changes from CI_DRM_4179 -> Patchwork_8999 =

== Summary - FAILURE ==

  Serious unknown changes coming with Patchwork_8999 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_8999, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/43156/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_8999:

  === IGT changes ===

    ==== Possible regressions ====

    igt@drv_hangman@error-state-basic:
      fi-skl-6700k2:      PASS -> DMESG-FAIL
      fi-skl-6770hq:      PASS -> DMESG-FAIL
      fi-cnl-y3:          PASS -> DMESG-FAIL
      fi-hsw-4770:        PASS -> INCOMPLETE
      fi-ivb-3520m:       PASS -> INCOMPLETE
      fi-bsw-n3050:       PASS -> DMESG-FAIL
      fi-ilk-650:         PASS -> INCOMPLETE
      fi-kbl-7560u:       PASS -> DMESG-FAIL
      fi-pnv-d510:        PASS -> INCOMPLETE
      fi-skl-6600u:       PASS -> DMESG-FAIL
      fi-cfl-8700k:       PASS -> DMESG-FAIL
      fi-gdg-551:         PASS -> INCOMPLETE
      fi-bxt-dsi:         PASS -> DMESG-FAIL
      fi-glk-j4005:       PASS -> DMESG-FAIL
      fi-kbl-7567u:       PASS -> DMESG-FAIL
      fi-bdw-5557u:       PASS -> DMESG-FAIL
      fi-blb-e6850:       PASS -> INCOMPLETE
      fi-skl-guc:         PASS -> DMESG-FAIL
      fi-kbl-r:           PASS -> DMESG-FAIL
      fi-bwr-2160:        PASS -> INCOMPLETE
      fi-bxt-j4205:       PASS -> DMESG-FAIL
      fi-bdw-gvtdvm:      PASS -> DMESG-FAIL
      fi-hsw-4200u:       PASS -> INCOMPLETE
      fi-hsw-4770r:       PASS -> INCOMPLETE
      fi-kbl-7500u:       PASS -> DMESG-FAIL
      fi-hsw-peppy:       PASS -> INCOMPLETE
      fi-skl-gvtdvm:      PASS -> DMESG-FAIL
      fi-snb-2600:        PASS -> INCOMPLETE
      fi-skl-6260u:       PASS -> DMESG-FAIL
      fi-cfl-s3:          PASS -> DMESG-FAIL
      fi-cfl-u:           PASS -> DMESG-FAIL
      fi-ivb-3770:        PASS -> INCOMPLETE
      fi-cnl-psr:         PASS -> DMESG-FAIL

    igt@gem_busy@basic-busy-default:
      fi-skl-6600u:       PASS -> INCOMPLETE
      fi-cfl-8700k:       PASS -> INCOMPLETE
      fi-kbl-7500u:       PASS -> INCOMPLETE
      fi-bdw-5557u:       PASS -> INCOMPLETE
      fi-kbl-r:           PASS -> INCOMPLETE
      fi-skl-6770hq:      PASS -> INCOMPLETE
      fi-cfl-u:           PASS -> INCOMPLETE
      fi-cfl-s3:          PASS -> INCOMPLETE
      fi-kbl-7560u:       PASS -> INCOMPLETE
      fi-skl-6260u:       PASS -> INCOMPLETE
      fi-bsw-n3050:       PASS -> INCOMPLETE
      fi-skl-6700k2:      PASS -> INCOMPLETE
      fi-skl-guc:         PASS -> INCOMPLETE
      fi-kbl-7567u:       PASS -> INCOMPLETE

    
== Known issues ==

  Here are the changes found in Patchwork_8999 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@drv_hangman@error-state-basic:
      fi-elk-e7500:       PASS -> INCOMPLETE (fdo#103989)
      fi-byt-n2820:       PASS -> INCOMPLETE (fdo#102657)
      fi-byt-j1900:       NOTRUN -> INCOMPLETE (fdo#102657)
      fi-snb-2520m:       NOTRUN -> INCOMPLETE (fdo#103713)

    igt@gem_busy@basic-busy-default:
      fi-glk-j4005:       PASS -> INCOMPLETE (fdo#103359, k.org#198133)
      fi-cnl-y3:          PASS -> INCOMPLETE (fdo#105086)
      fi-cnl-psr:         PASS -> INCOMPLETE (fdo#105086)
      fi-bxt-dsi:         PASS -> INCOMPLETE (fdo#103927)
      fi-skl-gvtdvm:      PASS -> INCOMPLETE (fdo#105600)
      fi-bdw-gvtdvm:      PASS -> INCOMPLETE (fdo#105600)
      fi-bxt-j4205:       PASS -> INCOMPLETE (fdo#103927)

    
    ==== Possible fixes ====

    igt@debugfs_test@read_all_entries:
      fi-snb-2520m:       INCOMPLETE (fdo#103713) -> PASS

    
  fdo#102657 https://bugs.freedesktop.org/show_bug.cgi?id=102657
  fdo#103359 https://bugs.freedesktop.org/show_bug.cgi?id=103359
  fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
  fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
  fdo#103989 https://bugs.freedesktop.org/show_bug.cgi?id=103989
  fdo#105086 https://bugs.freedesktop.org/show_bug.cgi?id=105086
  fdo#105600 https://bugs.freedesktop.org/show_bug.cgi?id=105600
  k.org#198133 https://bugzilla.kernel.org/show_bug.cgi?id=198133


== Participating hosts (41 -> 37) ==

  Additional (1): fi-byt-j1900 
  Missing    (5): fi-ctg-p8600 fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-skl-6700hq 


== Build changes ==

    * Linux: CI_DRM_4179 -> Patchwork_8999

  CI_DRM_4179: be6d36ea8d6130f54ab5ec816555f1a46bd95f7b @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4479: 89ae332745e31a075747a63ac5acc5baccf75769 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_8999: 7fae27664969e82f0f176f17feaad4d329f7438e @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4479: 3ba0657bff4216d1ec7179935590261855f1651e @ git://anongit.freedesktop.org/piglit


== Linux commits ==

7fae27664969 drm/i915: Allow i915_gem_reset_prepare_engine to recurse

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8999/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse
  2018-05-14 20:52 [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse Chris Wilson
                   ` (2 preceding siblings ...)
  2018-05-14 22:08 ` ✗ Fi.CI.BAT: failure " Patchwork
@ 2018-05-15  8:08 ` Mika Kuoppala
  2018-05-15  8:13   ` Chris Wilson
  3 siblings, 1 reply; 6+ messages in thread
From: Mika Kuoppala @ 2018-05-15  8:08 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> We call i915_gem_reset_prepare_engine() during reset and then upon
> wedging if the reset fails. Unfortunately, kthread_park and similar do
> not support being called recursively and so we must count the number of
> times we prepare for reset and only actually prepare on the outermost
> layer. (Similarly for finish on unwinding the onion.)
>
> [   87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505 kthread_park+0x55/0x60
> [   87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers mei lpc_ich
> [   87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G     U            4.17.0-rc5-CI-CI_DRM_4177+ #1
> [   87.705620] Hardware name: Dell Inc. XPS 8300  /0Y2MRG, BIOS A06 10/17/2011
> [   87.705622] RIP: 0010:kthread_park+0x55/0x60
> [   87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202
> [   87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX: 0000000000000001
> [   87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI: ffff88020e8f0040
> [   87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09: 0000000000000000
> [   87.705632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8802159376b0
> [   87.705634] R13: ffff880215937670 R14: ffff880215930000 R15: ffffffffa01c8d60
> [   87.705636] FS:  00007f0c32061980(0000) GS:ffff88022fa80000(0000) knlGS:0000000000000000
> [   87.705637] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4: 00000000000606e0
> [   87.705641] Call Trace:
> [   87.705668]  i915_gem_reset_prepare_engine+0x1d/0xa0 [i915]
> [   87.705694]  i915_gem_set_wedged+0x7b/0x1e0 [i915]
> [   87.705699]  ? __drm_printfn_info+0x20/0x20
> [   87.705722]  i915_reset+0x14a/0x290 [i915]
> [   87.705743]  i915_reset_device+0x1fb/0x290 [i915]
> [   87.705767]  ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915]
> [   87.705772]  ? work_on_cpu_safe+0x50/0x50
> [   87.705798]  i915_handle_error+0x207/0x4a0 [i915]
> [   87.705810]  ? __might_fault+0x39/0x90
> [   87.705835]  i915_wedged_set+0x7f/0xc0 [i915]
> [   87.705841]  simple_attr_write+0xb0/0xd0
> [   87.705847]  full_proxy_write+0x51/0x80
> [   87.705852]  __vfs_write+0x31/0x160
> [   87.705857]  ? rcu_read_lock_sched_held+0x6f/0x80
> [   87.705860]  ? rcu_sync_lockdep_assert+0x29/0x50
> [   87.705862]  ? __sb_start_write+0x152/0x1f0
> [   87.705864]  ? __sb_start_write+0x168/0x1f0
> [   87.705868]  vfs_write+0xbd/0x1a0
> [   87.705872]  ksys_write+0x50/0xc0
> [   87.705877]  do_syscall_64+0x55/0x190
> [   87.705880]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> [   87.705882] RIP: 0033:0x7f0c315df281
> [   87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> [   87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0c315df281
> [   87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI: 0000000000000047
> [   87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09: 0000000000000034
> [   87.705892] R10: 0000000000000000 R11: 0000000000000246 R12: 000055a5e23ebc50
> [   87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15: 0000000000000000
> [   87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff
>
> References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c         | 52 ++++++++++++++++++-------
>  drivers/gpu/drm/i915/intel_engine_cs.c  |  1 +
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
>  3 files changed, 43 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 0a2070112b66..b169b630bf78 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
>  	return active;
>  }
>  
> -/*
> - * Ensure irq handler finishes, and not run again.
> - * Also return the active request so that we only search for it once.
> - */
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  {
> -	struct i915_request *request = NULL;
> +	mutex_lock(&engine->reset_lock);
> +	if (engine->reset_depth++)
> +		goto unlock;
>  
>  	/*
>  	 * During the reset sequence, we must prevent the engine from
> @@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  	if (engine->i915->guc.preempt_wq)
>  		flush_workqueue(engine->i915->guc.preempt_wq);
>  
> +unlock:
> +	mutex_unlock(&engine->reset_lock);
> +}
> +
> +static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> +{
> +	mutex_lock(&engine->reset_lock);
> +
> +	GEM_BUG_ON(engine->reset_depth);

!engine->reset_depth

-Mika

> +	if (--engine->reset_depth)
> +		goto unlock;
> +
> +	tasklet_enable(&engine->execlists.tasklet);
> +	kthread_unpark(engine->breadcrumbs.signaler);
> +
> +	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +
> +unlock:
> +	mutex_unlock(&engine->reset_lock);
> +}
> +
> +/*
> + * Ensure irq handler finishes, and not run again.
> + * Also return the active request so that we only search for it once.
> + */
> +struct i915_request *
> +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +{
> +	struct i915_request *request = NULL;
> +
> +	__i915_gem_reset_prepare_engine(engine);
> +
>  	if (engine->irq_seqno_barrier)
>  		engine->irq_seqno_barrier(engine);
>  
> @@ -3265,10 +3294,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv,
>  
>  void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
>  {
> -	tasklet_enable(&engine->execlists.tasklet);
> -	kthread_unpark(engine->breadcrumbs.signaler);
> -
> -	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +	__i915_gem_reset_finish_engine(engine);
>  }
>  
>  void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> @@ -3332,7 +3358,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	 * for which we haven't set the fence error to EIO yet).
>  	 */
>  	for_each_engine(engine, i915, id) {
> -		i915_gem_reset_prepare_engine(engine);
> +		__i915_gem_reset_prepare_engine(engine);
>  
>  		engine->submit_request = nop_submit_request;
>  		engine->schedule = NULL;
> @@ -3380,7 +3406,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  					       intel_engine_last_submit(engine));
>  		spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  
> -		i915_gem_reset_finish_engine(engine);
> +		__i915_gem_reset_finish_engine(engine);
>  	}
>  
>  	GEM_TRACE("end\n");
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 6bfd7e3ed152..0de489da514e 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -306,6 +306,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
>  	/* Nothing to do here, execute in order of dependencies */
>  	engine->schedule = NULL;
>  
> +	mutex_init(&engine->reset_lock);
>  	seqlock_init(&engine->stats.lock);
>  
>  	ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 010750e8ee44..ec2b359f3e8b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -566,6 +566,9 @@ struct intel_engine_cs {
>  
>  	struct intel_engine_hangcheck hangcheck;
>  
> +	struct mutex reset_lock;
> +	unsigned int reset_depth;
> +
>  #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
>  #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
>  #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> -- 
> 2.17.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse
  2018-05-15  8:08 ` [PATCH] " Mika Kuoppala
@ 2018-05-15  8:13   ` Chris Wilson
  0 siblings, 0 replies; 6+ messages in thread
From: Chris Wilson @ 2018-05-15  8:13 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-05-15 09:08:45)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > We call i915_gem_reset_prepare_engine() during reset and then upon
> > wedging if the reset fails. Unfortunately, kthread_park and similar do
> > not support being called recursively and so we must count the number of
> > times we prepare for reset and only actually prepare on the outermost
> > layer. (Similarly for finish on unwinding the onion.)
> >
> > [   87.705581] WARNING: CPU: 2 PID: 1377 at kernel/kthread.c:505 kthread_park+0x55/0x60
> > [   87.705583] Modules linked in: snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm broadcom bcm_phy_lib tg3 mei_me prime_numbers mei lpc_ich
> > [   87.705618] CPU: 2 PID: 1377 Comm: gem_eio Tainted: G     U            4.17.0-rc5-CI-CI_DRM_4177+ #1
> > [   87.705620] Hardware name: Dell Inc. XPS 8300  /0Y2MRG, BIOS A06 10/17/2011
> > [   87.705622] RIP: 0010:kthread_park+0x55/0x60
> > [   87.705624] RSP: 0018:ffffc9000051bac0 EFLAGS: 00010202
> > [   87.705627] RAX: 0000000000000004 RBX: ffff88021ca13de8 RCX: 0000000000000001
> > [   87.705629] RDX: 0000000080000001 RSI: ffffffff821228a9 RDI: ffff88020e8f0040
> > [   87.705630] RBP: ffff880215937670 R08: 00000000bae32d65 R09: 0000000000000000
> > [   87.705632] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8802159376b0
> > [   87.705634] R13: ffff880215937670 R14: ffff880215930000 R15: ffffffffa01c8d60
> > [   87.705636] FS:  00007f0c32061980(0000) GS:ffff88022fa80000(0000) knlGS:0000000000000000
> > [   87.705637] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > [   87.705639] CR2: 00007f0c32094000 CR3: 000000021a0d4004 CR4: 00000000000606e0
> > [   87.705641] Call Trace:
> > [   87.705668]  i915_gem_reset_prepare_engine+0x1d/0xa0 [i915]
> > [   87.705694]  i915_gem_set_wedged+0x7b/0x1e0 [i915]
> > [   87.705699]  ? __drm_printfn_info+0x20/0x20
> > [   87.705722]  i915_reset+0x14a/0x290 [i915]
> > [   87.705743]  i915_reset_device+0x1fb/0x290 [i915]
> > [   87.705767]  ? __intel_get_crtc_scanline+0x1c0/0x1c0 [i915]
> > [   87.705772]  ? work_on_cpu_safe+0x50/0x50
> > [   87.705798]  i915_handle_error+0x207/0x4a0 [i915]
> > [   87.705810]  ? __might_fault+0x39/0x90
> > [   87.705835]  i915_wedged_set+0x7f/0xc0 [i915]
> > [   87.705841]  simple_attr_write+0xb0/0xd0
> > [   87.705847]  full_proxy_write+0x51/0x80
> > [   87.705852]  __vfs_write+0x31/0x160
> > [   87.705857]  ? rcu_read_lock_sched_held+0x6f/0x80
> > [   87.705860]  ? rcu_sync_lockdep_assert+0x29/0x50
> > [   87.705862]  ? __sb_start_write+0x152/0x1f0
> > [   87.705864]  ? __sb_start_write+0x168/0x1f0
> > [   87.705868]  vfs_write+0xbd/0x1a0
> > [   87.705872]  ksys_write+0x50/0xc0
> > [   87.705877]  do_syscall_64+0x55/0x190
> > [   87.705880]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> > [   87.705882] RIP: 0033:0x7f0c315df281
> > [   87.705884] RSP: 002b:00007ffc9c990328 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
> > [   87.705887] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0c315df281
> > [   87.705889] RDX: 0000000000000002 RSI: 000055a5e23ef276 RDI: 0000000000000047
> > [   87.705890] RBP: 00007ffc9c990350 R08: 0000000000000000 R09: 0000000000000034
> > [   87.705892] R10: 0000000000000000 R11: 0000000000000246 R12: 000055a5e23ebc50
> > [   87.705894] R13: 00007ffc9c990dc0 R14: 0000000000000000 R15: 0000000000000000
> > [   87.705902] Code: 00 31 ed 48 39 c7 74 0e e8 79 db 00 00 48 8d 7b 18 e8 a0 05 88 00 89 e8 5b 5d c3 0f 0b bd da ff ff ff 89 e8 5b 5d c3 0f 0b eb b7 <0f> 0b bd f0 ff ff ff eb e2 66 90 41 57 41 56 49 c7 c6 f4 ff ff
> >
> > References: 85f1abe0019f ("kthread, sched/wait: Fix kthread_parkme() completion issue")
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c         | 52 ++++++++++++++++++-------
> >  drivers/gpu/drm/i915/intel_engine_cs.c  |  1 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
> >  3 files changed, 43 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 0a2070112b66..b169b630bf78 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -2997,14 +2997,11 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
> >       return active;
> >  }
> >  
> > -/*
> > - * Ensure irq handler finishes, and not run again.
> > - * Also return the active request so that we only search for it once.
> > - */
> > -struct i915_request *
> > -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> > +static void __i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> >  {
> > -     struct i915_request *request = NULL;
> > +     mutex_lock(&engine->reset_lock);
> > +     if (engine->reset_depth++)
> > +             goto unlock;
> >  
> >       /*
> >        * During the reset sequence, we must prevent the engine from
> > @@ -3057,6 +3054,38 @@ i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> >       if (engine->i915->guc.preempt_wq)
> >               flush_workqueue(engine->i915->guc.preempt_wq);
> >  
> > +unlock:
> > +     mutex_unlock(&engine->reset_lock);
> > +}
> > +
> > +static void __i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> > +{
> > +     mutex_lock(&engine->reset_lock);
> > +
> > +     GEM_BUG_ON(engine->reset_depth);
> 
> !engine->reset_depth

Hah. I think I prefer the patchset to kill kthread_park entirely. Hmm,
should also then include the removal of parking from the signaler for a
dramatic saving of one if().
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-05-15  8:13 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-14 20:52 [PATCH] drm/i915: Allow i915_gem_reset_prepare_engine to recurse Chris Wilson
2018-05-14 21:08 ` Chris Wilson
2018-05-14 21:58 ` ✗ Fi.CI.CHECKPATCH: warning for " Patchwork
2018-05-14 22:08 ` ✗ Fi.CI.BAT: failure " Patchwork
2018-05-15  8:08 ` [PATCH] " Mika Kuoppala
2018-05-15  8:13   ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.