* [PATCH] drm/i915: Separate out reset flags from the reset counter
@ 2016-07-05 15:22 Chris Wilson
2016-07-05 16:02 ` ✗ Ro.CI.BAT: failure for " Patchwork
0 siblings, 1 reply; 2+ messages in thread
From: Chris Wilson @ 2016-07-05 15:22 UTC (permalink / raw)
To: intel-gfx; +Cc: Mika Kuoppala
In preparation for introducing a per-engine reset, we can first separate
the mixing of the reset state from the global reset counter.
The loss of atomicity in updating the reset state poses a small problem
for handling the waiters. For requests, this is solved by advancing the
seqno so that a waiter waking up after the reset knows the request is
complete. For pending flips, we still rely on the increment of the
global reset epoch (as well as the reset-in-progress flag) to signify
when the hardware was reset.
The advantage, now that we do not inspect the reset state during reset
itself i.e. we no longer emit requests during reset, is that we can use
the atomic updates of the state flags to ensure that only one reset
worker is active.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Arun Siluvery <arun.siluvery@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
drivers/gpu/drm/i915/i915_drv.c | 16 ++----
drivers/gpu/drm/i915/i915_drv.h | 46 +++++-----------
drivers/gpu/drm/i915/i915_gem.c | 15 ++---
drivers/gpu/drm/i915/i915_irq.c | 104 ++++++++++++++++-------------------
drivers/gpu/drm/i915/intel_display.c | 25 ++++++---
drivers/gpu/drm/i915/intel_drv.h | 4 +-
6 files changed, 93 insertions(+), 117 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 694edac2c703..98c6f4abb85b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1611,7 +1611,7 @@ static int i915_drm_resume(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
if (i915_gem_init_hw(dev)) {
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
- atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
+ set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
}
mutex_unlock(&dev->struct_mutex);
@@ -1771,7 +1771,6 @@ int i915_reset(struct drm_i915_private *dev_priv)
{
struct drm_device *dev = &dev_priv->drm;
struct i915_gpu_error *error = &dev_priv->gpu_error;
- unsigned reset_counter;
int ret;
intel_reset_gt_powersave(dev_priv);
@@ -1779,14 +1778,8 @@ int i915_reset(struct drm_i915_private *dev_priv)
mutex_lock(&dev->struct_mutex);
/* Clear any previous failed attempts at recovery. Time to try again. */
- atomic_andnot(I915_WEDGED, &error->reset_counter);
-
- /* Clear the reset-in-progress flag and increment the reset epoch. */
- reset_counter = atomic_inc_return(&error->reset_counter);
- if (WARN_ON(__i915_reset_in_progress(reset_counter))) {
- ret = -EIO;
- goto error;
- }
+ __clear_bit(I915_WEDGED, &error->flags);
+ error->reset_count++;
pr_notice("drm/i915: Resetting chip after gpu hang\n");
@@ -1823,6 +1816,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
goto error;
}
+ clear_bit(I915_RESET_IN_PROGRESS, &error->flags);
mutex_unlock(&dev->struct_mutex);
/*
@@ -1837,7 +1831,7 @@ int i915_reset(struct drm_i915_private *dev_priv)
return 0;
error:
- atomic_or(I915_WEDGED, &error->reset_counter);
+ set_bit(I915_WEDGED, &error->flags);
mutex_unlock(&dev->struct_mutex);
return ret;
}
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 33df29680a54..13f8ff02cf6e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1375,9 +1375,10 @@ struct i915_gpu_error {
* State variable controlling the reset flow and count
*
* This is a counter which gets incremented when reset is triggered,
- * and again when reset has been handled. So odd values (lowest bit set)
- * means that reset is in progress and even values that
- * (reset_counter >> 1):th reset was successfully completed.
+ *
+ * Before the reset commences, the I915_RESET_IN_PROGRESS bit is set
+ * meaning that any waiters holding onto the struct_mutex should
+ * relinquish the lock immediately in order for the reset to start.
*
* If reset is not completed succesfully, the I915_WEDGE bit is
* set meaning that hardware is terminally sour and there is no
@@ -1392,10 +1393,11 @@ struct i915_gpu_error {
* naturally enforces the correct ordering between the bail-out of the
* waiter and the gpu reset work code.
*/
- atomic_t reset_counter;
+ unsigned long reset_count;
-#define I915_RESET_IN_PROGRESS_FLAG 1
-#define I915_WEDGED (1 << 31)
+ unsigned long flags;
+#define I915_RESET_IN_PROGRESS 0
+#define I915_WEDGED (BITS_PER_LONG-1)
/**
* Waitqueue to signal when a hang is detected. Used to for waiters
@@ -3319,44 +3321,24 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
void i915_gem_retire_requests_ring(struct intel_engine_cs *engine);
-static inline u32 i915_reset_counter(struct i915_gpu_error *error)
-{
- return atomic_read(&error->reset_counter);
-}
-
-static inline bool __i915_reset_in_progress(u32 reset)
-{
- return unlikely(reset & I915_RESET_IN_PROGRESS_FLAG);
-}
-
-static inline bool __i915_reset_in_progress_or_wedged(u32 reset)
-{
- return unlikely(reset & (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
-}
-
-static inline bool __i915_terminally_wedged(u32 reset)
-{
- return unlikely(reset & I915_WEDGED);
-}
-
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
{
- return __i915_reset_in_progress(i915_reset_counter(error));
+ return test_bit(I915_RESET_IN_PROGRESS, &error->flags);
}
-static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
+static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
{
- return __i915_reset_in_progress_or_wedged(i915_reset_counter(error));
+ return test_bit(I915_WEDGED, &error->flags);
}
-static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
{
- return __i915_terminally_wedged(i915_reset_counter(error));
+ return i915_reset_in_progress(error) | i915_terminally_wedged(error);
}
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
- return ((i915_reset_counter(error) & ~I915_WEDGED) + 1) / 2;
+ return READ_ONCE(error->reset_count);
}
void i915_gem_reset(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8f50919ba9b4..11af5f8eff73 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1326,15 +1326,17 @@ put_rpm:
}
static int
-i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
+i915_gem_check_wedge(struct drm_i915_private *dev_priv)
{
- if (__i915_terminally_wedged(reset_counter))
+ struct i915_gpu_error *error = &dev_priv->gpu_error;
+
+ if (i915_terminally_wedged(error))
return -EIO;
- if (__i915_reset_in_progress(reset_counter)) {
+ if (i915_reset_in_progress(error)) {
/* Non-interruptible callers can't handle -EAGAIN, hence return
* -EIO unconditionally for these. */
- if (!interruptible)
+ if (!dev_priv->mm.interruptible)
return -EIO;
return -EAGAIN;
@@ -3000,7 +3002,6 @@ __i915_gem_request_alloc(struct intel_engine_cs *engine,
struct drm_i915_gem_request **req_out)
{
struct drm_i915_private *dev_priv = engine->i915;
- unsigned reset_counter = i915_reset_counter(&dev_priv->gpu_error);
struct drm_i915_gem_request *req;
int ret;
@@ -3013,7 +3014,7 @@ __i915_gem_request_alloc(struct intel_engine_cs *engine,
* EIO if the GPU is already wedged, or EAGAIN to drop the struct_mutex
* and restart.
*/
- ret = i915_gem_check_wedge(reset_counter, dev_priv->mm.interruptible);
+ ret = i915_gem_check_wedge(dev_priv);
if (ret)
return ret;
@@ -5221,7 +5222,7 @@ int i915_gem_init(struct drm_device *dev)
* for all other failure, such as an allocation failure, bail.
*/
DRM_ERROR("Failed to initialize GPU, declaring it wedged\n");
- atomic_or(I915_WEDGED, &dev_priv->gpu_error.reset_counter);
+ __set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
ret = 0;
}
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b77d808b71cd..1f12930f5515 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2508,53 +2508,41 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
+ DRM_DEBUG_DRIVER("resetting chip\n");
+ kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
+
/*
- * Note that there's only one work item which does gpu resets, so we
- * need not worry about concurrent gpu resets potentially incrementing
- * error->reset_counter twice. We only need to take care of another
- * racing irq/hangcheck declaring the gpu dead for a second time. A
- * quick check for that is good enough: schedule_work ensures the
- * correct ordering between hang detection and this work item, and since
- * the reset in-progress bit is only ever set by code outside of this
- * work we don't need to worry about any other races.
+ * In most cases it's guaranteed that we get here with an RPM
+ * reference held, for example because there is a pending GPU
+ * request that won't finish until the reset is done. This
+ * isn't the case at least when we get here by doing a
+ * simulated reset via debugs, so get an RPM reference.
*/
- if (i915_reset_in_progress(&dev_priv->gpu_error)) {
- DRM_DEBUG_DRIVER("resetting chip\n");
- kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
-
- /*
- * In most cases it's guaranteed that we get here with an RPM
- * reference held, for example because there is a pending GPU
- * request that won't finish until the reset is done. This
- * isn't the case at least when we get here by doing a
- * simulated reset via debugs, so get an RPM reference.
- */
- intel_runtime_pm_get(dev_priv);
+ intel_runtime_pm_get(dev_priv);
- intel_prepare_reset(dev_priv);
+ intel_prepare_reset(dev_priv);
- /*
- * All state reset _must_ be completed before we update the
- * reset counter, for otherwise waiters might miss the reset
- * pending state and not properly drop locks, resulting in
- * deadlocks with the reset work.
- */
- ret = i915_reset(dev_priv);
+ /*
+ * All state reset _must_ be completed before we update the
+ * reset counter, for otherwise waiters might miss the reset
+ * pending state and not properly drop locks, resulting in
+ * deadlocks with the reset work.
+ */
+ ret = i915_reset(dev_priv);
- intel_finish_reset(dev_priv);
+ intel_finish_reset(dev_priv);
- intel_runtime_pm_put(dev_priv);
+ intel_runtime_pm_put(dev_priv);
- if (ret == 0)
- kobject_uevent_env(kobj,
- KOBJ_CHANGE, reset_done_event);
+ if (ret == 0)
+ kobject_uevent_env(kobj,
+ KOBJ_CHANGE, reset_done_event);
- /*
- * Note: The wake_up also serves as a memory barrier so that
- * waiters see the update value of the reset counter atomic_t.
- */
- wake_up_all(&dev_priv->gpu_error.reset_queue);
- }
+ /*
+ * Note: The wake_up also serves as a memory barrier so that
+ * waiters see the update value of the reset counter atomic_t.
+ */
+ i915_error_wake_up(dev_priv);
}
static void i915_report_and_clear_eir(struct drm_i915_private *dev_priv)
@@ -2673,25 +2661,27 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
i915_capture_error_state(dev_priv, engine_mask, error_msg);
i915_report_and_clear_eir(dev_priv);
- if (engine_mask) {
- atomic_or(I915_RESET_IN_PROGRESS_FLAG,
- &dev_priv->gpu_error.reset_counter);
+ if (!engine_mask)
+ return;
- /*
- * Wakeup waiting processes so that the reset function
- * i915_reset_and_wakeup doesn't deadlock trying to grab
- * various locks. By bumping the reset counter first, the woken
- * processes will see a reset in progress and back off,
- * releasing their locks and then wait for the reset completion.
- * We must do this for _all_ gpu waiters that might hold locks
- * that the reset work needs to acquire.
- *
- * Note: The wake_up serves as the required memory barrier to
- * ensure that the waiters see the updated value of the reset
- * counter atomic_t.
- */
- i915_error_wake_up(dev_priv);
- }
+ if (test_and_set_bit(I915_RESET_IN_PROGRESS,
+ &dev_priv->gpu_error.flags))
+ return;
+
+ /*
+ * Wakeup waiting processes so that the reset function
+ * i915_reset_and_wakeup doesn't deadlock trying to grab
+ * various locks. By bumping the reset counter first, the woken
+ * processes will see a reset in progress and back off,
+ * releasing their locks and then wait for the reset completion.
+ * We must do this for _all_ gpu waiters that might hold locks
+ * that the reset work needs to acquire.
+ *
+ * Note: The wake_up serves as the required memory barrier to
+ * ensure that the waiters see the updated value of the reset
+ * counter atomic_t.
+ */
+ i915_error_wake_up(dev_priv);
i915_reset_and_wakeup(dev_priv);
}
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 111b350d1d7e..903f4509926f 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3228,15 +3228,26 @@ void intel_finish_reset(struct drm_i915_private *dev_priv)
drm_modeset_unlock_all(&dev_priv->drm);
}
+static bool abort_flip_on_reset(struct intel_crtc *crtc)
+{
+ struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
+
+ if (i915_reset_in_progress(error))
+ return true;
+
+ if (crtc->reset_count != i915_reset_count(error))
+ return true;
+
+ return false;
+}
+
static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
{
struct drm_device *dev = crtc->dev;
struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
- unsigned reset_counter;
bool pending;
- reset_counter = i915_reset_counter(&to_i915(dev)->gpu_error);
- if (intel_crtc->reset_counter != reset_counter)
+ if (abort_flip_on_reset(intel_crtc))
return false;
spin_lock_irq(&dev->event_lock);
@@ -11067,10 +11078,8 @@ static bool __pageflip_finished_cs(struct intel_crtc *crtc,
{
struct drm_device *dev = crtc->base.dev;
struct drm_i915_private *dev_priv = to_i915(dev);
- unsigned reset_counter;
- reset_counter = i915_reset_counter(&dev_priv->gpu_error);
- if (crtc->reset_counter != reset_counter)
+ if (abort_flip_on_reset(crtc))
return true;
/*
@@ -11751,8 +11760,8 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
if (ret)
goto cleanup;
- intel_crtc->reset_counter = i915_reset_counter(&dev_priv->gpu_error);
- if (__i915_reset_in_progress_or_wedged(intel_crtc->reset_counter)) {
+ intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
+ if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
ret = -EIO;
goto cleanup;
}
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index e6a24d2154d1..faae0d844904 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -685,8 +685,8 @@ struct intel_crtc {
struct intel_crtc_state *config;
- /* reset counter value when the last flip was submitted */
- unsigned int reset_counter;
+ /* global reset count when the last flip was submitted */
+ unsigned int reset_count;
/* Access to these should be protected by dev_priv->irq_lock. */
bool cpu_fifo_underrun_disabled;
--
2.8.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 2+ messages in thread
* ✗ Ro.CI.BAT: failure for drm/i915: Separate out reset flags from the reset counter
2016-07-05 15:22 [PATCH] drm/i915: Separate out reset flags from the reset counter Chris Wilson
@ 2016-07-05 16:02 ` Patchwork
0 siblings, 0 replies; 2+ messages in thread
From: Patchwork @ 2016-07-05 16:02 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: drm/i915: Separate out reset flags from the reset counter
URL : https://patchwork.freedesktop.org/series/9522/
State : failure
== Summary ==
Series 9522v1 drm/i915: Separate out reset flags from the reset counter
http://patchwork.freedesktop.org/api/1.0/series/9522/revisions/1/mbox
Test gem_exec_flush:
Subgroup basic-batch-kernel-default-cmd:
pass -> FAIL (ro-byt-n2820)
Subgroup basic-batch-kernel-default-uc:
dmesg-fail -> PASS (fi-skl-i5-6260u)
dmesg-fail -> PASS (fi-skl-i7-6700k)
Test kms_pipe_crc_basic:
Subgroup suspend-read-crc-pipe-b:
pass -> DMESG-WARN (fi-skl-i5-6260u)
fi-kbl-qkkr total:234 pass:162 dwarn:26 dfail:0 fail:5 skip:41
fi-skl-i5-6260u total:234 pass:205 dwarn:1 dfail:0 fail:2 skip:26
fi-skl-i7-6700k total:234 pass:192 dwarn:0 dfail:0 fail:2 skip:40
fi-snb-i7-2600 total:234 pass:178 dwarn:0 dfail:0 fail:2 skip:54
ro-bdw-i5-5250u total:229 pass:204 dwarn:1 dfail:1 fail:0 skip:23
ro-bdw-i7-5600u total:229 pass:190 dwarn:0 dfail:1 fail:0 skip:38
ro-bsw-n3050 total:229 pass:176 dwarn:1 dfail:1 fail:2 skip:49
ro-byt-n2820 total:229 pass:180 dwarn:0 dfail:1 fail:3 skip:45
ro-hsw-i3-4010u total:229 pass:197 dwarn:0 dfail:1 fail:0 skip:31
ro-hsw-i7-4770r total:229 pass:197 dwarn:0 dfail:1 fail:0 skip:31
ro-ilk-i7-620lm total:229 pass:157 dwarn:0 dfail:1 fail:1 skip:70
ro-ilk1-i5-650 total:224 pass:157 dwarn:0 dfail:1 fail:1 skip:65
ro-ivb-i7-3770 total:229 pass:188 dwarn:0 dfail:1 fail:0 skip:40
ro-skl3-i5-6260u total:229 pass:208 dwarn:1 dfail:1 fail:0 skip:19
ro-snb-i7-2620M total:229 pass:179 dwarn:0 dfail:1 fail:1 skip:48
ro-bdw-i7-5557U failed to connect after reboot
Results at /archive/results/CI_IGT_test/RO_Patchwork_1423/
2a6e307 drm-intel-nightly: 2016y-07m-05d-11h-50m-09s UTC integration manifest
33bde36 drm/i915: Separate out reset flags from the reset counter
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2016-07-05 16:02 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-05 15:22 [PATCH] drm/i915: Separate out reset flags from the reset counter Chris Wilson
2016-07-05 16:02 ` ✗ Ro.CI.BAT: failure for " Patchwork
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.