From: Matthew Brost <matthew.brost@intel.com> To: <intel-gfx@lists.freedesktop.org>, <dri-devel@lists.freedesktop.org> Subject: [RFC PATCH 21/42] drm/i915/guc: Add hang check to GuC submit engine Date: Tue, 20 Jul 2021 13:57:41 -0700 [thread overview] Message-ID: <20210720205802.39610-22-matthew.brost@intel.com> (raw) In-Reply-To: <20210720205802.39610-1-matthew.brost@intel.com> The heartbeat uses a single instance of a GuC submit engine (GSE) to do the hang check. As such if a different GSE's state machine hangs, the heartbeat cannot detect this hang. Add timer to each GSE which in turn can disable all submissions if it is hung. Cc: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Matthew Brost <matthew.brost@intel.com> --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 36 +++++++++++++++++++ .../i915/gt/uc/intel_guc_submission_types.h | 3 ++ 2 files changed, 39 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index d8be5a41d0ca..4cf233d39bea 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -105,15 +105,21 @@ static bool tasklet_blocked(struct guc_submit_engine *gse) return test_bit(GSE_STATE_TASKLET_BLOCKED, &gse->flags); } +/* 2 seconds seems like a reasonable timeout waiting for a G2H */ +#define MAX_TASKLET_BLOCKED_NS 2000000000 static void set_tasklet_blocked(struct guc_submit_engine *gse) { lockdep_assert_held(&gse->sched_engine.lock); + hrtimer_start_range_ns(&gse->hang_timer, + ns_to_ktime(MAX_TASKLET_BLOCKED_NS), 0, + HRTIMER_MODE_REL_PINNED); set_bit(GSE_STATE_TASKLET_BLOCKED, &gse->flags); } static void __clr_tasklet_blocked(struct guc_submit_engine *gse) { lockdep_assert_held(&gse->sched_engine.lock); + hrtimer_cancel(&gse->hang_timer); clear_bit(GSE_STATE_TASKLET_BLOCKED, &gse->flags); } @@ -1021,6 +1027,7 @@ static void disable_submission(struct intel_guc *guc) if 
(__tasklet_is_enabled(&sched_engine->tasklet)) { GEM_BUG_ON(!guc->ct.enabled); __tasklet_disable_sync_once(&sched_engine->tasklet); + hrtimer_try_to_cancel(&guc->gse[i]->hang_timer); sched_engine->tasklet.callback = NULL; } } @@ -3716,6 +3723,33 @@ static void guc_sched_engine_destroy(struct kref *kref) kfree(gse); } +static enum hrtimer_restart gse_hang(struct hrtimer *hrtimer) +{ + struct guc_submit_engine *gse = + container_of(hrtimer, struct guc_submit_engine, hang_timer); + struct intel_guc *guc = gse->sched_engine.private_data; + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) + if (guc->gse_hang_expected) + drm_dbg(&guc_to_gt(guc)->i915->drm, + "GSE[%i] hung, disabling submission", gse->id); + else + drm_err(&guc_to_gt(guc)->i915->drm, + "GSE[%i] hung, disabling submission", gse->id); +#else + drm_err(&guc_to_gt(guc)->i915->drm, + "GSE[%i] hung, disabling submission", gse->id); +#endif + + /* + * Tasklet not making forward progress, disable submission which in turn + * will kick in the heartbeat to do a full GPU reset. 
+ */ + disable_submission(guc); + + return HRTIMER_NORESTART; +} + static void guc_submit_engine_init(struct intel_guc *guc, struct guc_submit_engine *gse, int id) @@ -3733,6 +3767,8 @@ static void guc_submit_engine_init(struct intel_guc *guc, sched_engine->retire_inflight_request_prio = guc_retire_inflight_request_prio; sched_engine->private_data = guc; + hrtimer_init(&gse->hang_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gse->hang_timer.function = gse_hang; gse->id = id; } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h index a5933e07bdd2..eae2e9725ede 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h @@ -6,6 +6,8 @@ #ifndef _INTEL_GUC_SUBMISSION_TYPES_H_ #define _INTEL_GUC_SUBMISSION_TYPES_H_ +#include <linux/xarray.h> + #include "gt/intel_engine_types.h" #include "gt/intel_context_types.h" #include "i915_scheduler_types.h" @@ -41,6 +43,7 @@ struct guc_submit_engine { unsigned long flags; int total_num_rq_with_no_guc_id; atomic_t num_guc_ids_not_ready; + struct hrtimer hang_timer; int id; /* -- 2.28.0
WARNING: multiple messages have this Message-ID (diff)
From: Matthew Brost <matthew.brost@intel.com> To: <intel-gfx@lists.freedesktop.org>, <dri-devel@lists.freedesktop.org> Subject: [Intel-gfx] [RFC PATCH 21/42] drm/i915/guc: Add hang check to GuC submit engine Date: Tue, 20 Jul 2021 13:57:41 -0700 [thread overview] Message-ID: <20210720205802.39610-22-matthew.brost@intel.com> (raw) In-Reply-To: <20210720205802.39610-1-matthew.brost@intel.com> The heartbeat uses a single instance of a GuC submit engine (GSE) to do the hang check. As such if a different GSE's state machine hangs, the heartbeat cannot detect this hang. Add timer to each GSE which in turn can disable all submissions if it is hung. Cc: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Matthew Brost <matthew.brost@intel.com> --- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 36 +++++++++++++++++++ .../i915/gt/uc/intel_guc_submission_types.h | 3 ++ 2 files changed, 39 insertions(+) diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index d8be5a41d0ca..4cf233d39bea 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -105,15 +105,21 @@ static bool tasklet_blocked(struct guc_submit_engine *gse) return test_bit(GSE_STATE_TASKLET_BLOCKED, &gse->flags); } +/* 2 seconds seems like a reasonable timeout waiting for a G2H */ +#define MAX_TASKLET_BLOCKED_NS 2000000000 static void set_tasklet_blocked(struct guc_submit_engine *gse) { lockdep_assert_held(&gse->sched_engine.lock); + hrtimer_start_range_ns(&gse->hang_timer, + ns_to_ktime(MAX_TASKLET_BLOCKED_NS), 0, + HRTIMER_MODE_REL_PINNED); set_bit(GSE_STATE_TASKLET_BLOCKED, &gse->flags); } static void __clr_tasklet_blocked(struct guc_submit_engine *gse) { lockdep_assert_held(&gse->sched_engine.lock); + hrtimer_cancel(&gse->hang_timer); clear_bit(GSE_STATE_TASKLET_BLOCKED, &gse->flags); } @@ -1021,6 +1027,7 @@ static void disable_submission(struct intel_guc *guc) if 
(__tasklet_is_enabled(&sched_engine->tasklet)) { GEM_BUG_ON(!guc->ct.enabled); __tasklet_disable_sync_once(&sched_engine->tasklet); + hrtimer_try_to_cancel(&guc->gse[i]->hang_timer); sched_engine->tasklet.callback = NULL; } } @@ -3716,6 +3723,33 @@ static void guc_sched_engine_destroy(struct kref *kref) kfree(gse); } +static enum hrtimer_restart gse_hang(struct hrtimer *hrtimer) +{ + struct guc_submit_engine *gse = + container_of(hrtimer, struct guc_submit_engine, hang_timer); + struct intel_guc *guc = gse->sched_engine.private_data; + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) + if (guc->gse_hang_expected) + drm_dbg(&guc_to_gt(guc)->i915->drm, + "GSE[%i] hung, disabling submission", gse->id); + else + drm_err(&guc_to_gt(guc)->i915->drm, + "GSE[%i] hung, disabling submission", gse->id); +#else + drm_err(&guc_to_gt(guc)->i915->drm, + "GSE[%i] hung, disabling submission", gse->id); +#endif + + /* + * Tasklet not making forward progress, disable submission which in turn + * will kick in the heartbeat to do a full GPU reset. 
+ */ + disable_submission(guc); + + return HRTIMER_NORESTART; +} + static void guc_submit_engine_init(struct intel_guc *guc, struct guc_submit_engine *gse, int id) @@ -3733,6 +3767,8 @@ static void guc_submit_engine_init(struct intel_guc *guc, sched_engine->retire_inflight_request_prio = guc_retire_inflight_request_prio; sched_engine->private_data = guc; + hrtimer_init(&gse->hang_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + gse->hang_timer.function = gse_hang; gse->id = id; } diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h index a5933e07bdd2..eae2e9725ede 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission_types.h @@ -6,6 +6,8 @@ #ifndef _INTEL_GUC_SUBMISSION_TYPES_H_ #define _INTEL_GUC_SUBMISSION_TYPES_H_ +#include <linux/xarray.h> + #include "gt/intel_engine_types.h" #include "gt/intel_context_types.h" #include "i915_scheduler_types.h" @@ -41,6 +43,7 @@ struct guc_submit_engine { unsigned long flags; int total_num_rq_with_no_guc_id; atomic_t num_guc_ids_not_ready; + struct hrtimer hang_timer; int id; /* -- 2.28.0 _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
next prev parent reply other threads:[~2021-07-20 20:41 UTC|newest] Thread overview: 88+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-07-20 20:57 [RFC PATCH 00/42] Parallel submission aka multi-bb execbuf Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] ✗ Fi.CI.BUILD: failure for " Patchwork 2021-07-20 20:57 ` [RFC PATCH 01/42] drm/i915/guc: GuC submission squashed into single patch Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-28 12:57 ` kernel test robot 2021-07-20 20:57 ` [RFC PATCH 02/42] drm/i915/guc: Allow flexible number of context ids Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 03/42] drm/i915/guc: Connect the number of guc_ids to debugfs Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 04/42] drm/i915/guc: Don't return -EAGAIN to user when guc_ids exhausted Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 05/42] drm/i915/guc: Don't allow requests not ready to consume all guc_ids Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 06/42] drm/i915/guc: Introduce guc_submit_engine object Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 07/42] drm/i915/guc: Check return of __xa_store when registering a context Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 08/42] drm/i915/guc: Non-static lrc descriptor registration buffer Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 09/42] drm/i915/guc: Take GT PM ref when deregistering context Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 10/42] drm/i915: Add GT PM unpark worker Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 11/42] drm/i915/guc: Take engine PM when 
a context is pinned with GuC submission Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 12/42] drm/i915/guc: Don't call switch_to_kernel_context " Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 13/42] drm/i915/guc: Selftest for GuC flow control Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 14/42] drm/i915: Add logical engine mapping Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 15/42] drm/i915: Expose logical engine instance to user Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 16/42] drm/i915/guc: Introduce context parent-child relationship Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 17/42] drm/i915/guc: Implement GuC parent-child context pin / unpin functions Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 18/42] drm/i915/guc: Add multi-lrc context registration Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 19/42] drm/i915/guc: Ensure GuC schedule operations do not operate on child contexts Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 20/42] drm/i915/guc: Assign contexts in parent-child relationship consecutive guc_ids Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` Matthew Brost [this message] 2021-07-20 20:57 ` [Intel-gfx] [RFC PATCH 21/42] drm/i915/guc: Add hang check to GuC submit engine Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 22/42] drm/i915/guc: Add guc_child_context_destroy Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 23/42] drm/i915/guc: Implement multi-lrc submission Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 24/42] drm/i915/guc: 
Insert submit fences between requests in parent-child relationship Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 25/42] drm/i915/guc: Implement multi-lrc reset Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 26/42] drm/i915/guc: Update debugfs for GuC multi-lrc Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 27/42] drm/i915: Connect UAPI to GuC multi-lrc interface Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 28/42] drm/i915/guc: Add basic GuC multi-lrc selftest Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 29/42] drm/i915/guc: Implement BB boundary preemption for multi-lrc Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 30/42] i915/drm: Move secure execbuf check to execbuf2 Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 31/42] drm/i915: Move input/exec fence handling to i915_gem_execbuffer2 Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 32/42] drm/i915: Move output " Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 33/42] drm/i915: Return output fence from i915_gem_do_execbuffer Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 34/42] drm/i915: Store batch index in struct i915_execbuffer Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 35/42] drm/i915: Allow callers of i915_gem_do_execbuffer to override the batch index Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 36/42] drm/i915: Teach execbuf there can be more than one batch in the objects list Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 37/42] drm/i915: 
Only track object dependencies on first request Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 38/42] drm/i915: Force parallel contexts to use copy engine for reloc Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:57 ` [RFC PATCH 39/42] drm/i915: Multi-batch execbuffer2 Matthew Brost 2021-07-20 20:57 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:58 ` [RFC PATCH 40/42] drm/i915: Eliminate unnecessary VMA calls for multi-BB submission Matthew Brost 2021-07-20 20:58 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:58 ` [RFC PATCH 41/42] drm/i915: Enable multi-bb execbuf Matthew Brost 2021-07-20 20:58 ` [Intel-gfx] " Matthew Brost 2021-07-20 20:58 ` [RFC PATCH 42/42] drm/i915/execlists: Parallel submission support for execlists Matthew Brost 2021-07-20 20:58 ` [Intel-gfx] " Matthew Brost
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20210720205802.39610-22-matthew.brost@intel.com \ --to=matthew.brost@intel.com \ --cc=dri-devel@lists.freedesktop.org \ --cc=intel-gfx@lists.freedesktop.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.