All of lore.kernel.org
 help / color / mirror / Atom feed
From: John.C.Harrison@Intel.com
To: Intel-GFX@Lists.FreeDesktop.Org
Cc: DRI-Devel@Lists.FreeDesktop.Org,
	Rahul Kumar Singh <rahul.kumar.singh@intel.com>
Subject: [PATCH 08/12] drm/i915/guc: Add selftest for a hung GuC
Date: Tue, 12 Jul 2022 16:31:32 -0700	[thread overview]
Message-ID: <20220712233136.1044951-9-John.C.Harrison@Intel.com> (raw)
In-Reply-To: <20220712233136.1044951-1-John.C.Harrison@Intel.com>

From: Rahul Kumar Singh <rahul.kumar.singh@intel.com>

Add a test to check that the hangcheck will recover from a submission
hang in the GuC.

Signed-off-by: Rahul Kumar Singh <rahul.kumar.singh@intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   1 +
 .../drm/i915/gt/uc/selftest_guc_hangcheck.c   | 159 ++++++++++++++++++
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 3 files changed, 161 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 7537459080278..72832a4f4bac7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4937,4 +4937,5 @@ bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve)
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftest_guc.c"
 #include "selftest_guc_multi_lrc.c"
+#include "selftest_guc_hangcheck.c"
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c b/drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c
new file mode 100644
index 0000000000000..af913c4b09d37
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright �� 2019 Intel Corporation
+ */
+
+#include "selftests/igt_spinner.h"
+#include "selftests/igt_reset.h"
+#include "selftests/intel_scheduler_helpers.h"
+#include "gt/intel_engine_heartbeat.h"
+#include "gem/selftests/mock_context.h"
+
+#define BEAT_INTERVAL	100
+
+static struct i915_request *nop_request(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	rq = intel_engine_create_kernel_request(engine);
+	if (IS_ERR(rq))
+		return rq;
+
+	i915_request_get(rq);
+	i915_request_add(rq);
+
+	return rq;
+}
+
+static int intel_hang_guc(void *arg)
+{
+	struct intel_gt *gt = arg;
+	int ret = 0;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	struct igt_spinner spin;
+	struct i915_request *rq;
+	intel_wakeref_t wakeref;
+	struct i915_gpu_error *global = &gt->i915->gpu_error;
+	struct intel_engine_cs *engine;
+	unsigned int reset_count;
+	u32 guc_status;
+	u32 old_beat;
+
+	ctx = kernel_context(gt->i915, NULL);
+	if (IS_ERR(ctx)) {
+		pr_err("Failed get kernel context: %ld\n", PTR_ERR(ctx));
+		return PTR_ERR(ctx);
+	}
+
+	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
+
+	ce = intel_context_create(gt->engine[BCS0]);
+	if (IS_ERR(ce)) {
+		ret = PTR_ERR(ce);
+		pr_err("Failed to create spinner request: %d\n", ret);
+		goto err;
+	}
+
+	engine = ce->engine;
+	reset_count = i915_reset_count(global);
+
+	old_beat = engine->props.heartbeat_interval_ms;
+	ret = intel_engine_set_heartbeat(engine, BEAT_INTERVAL);
+	if (ret) {
+		pr_err("Failed to boost heatbeat interval: %d\n", ret);
+		goto err;
+	}
+
+	ret = igt_spinner_init(&spin, engine->gt);
+	if (ret) {
+		pr_err("Failed to create spinner: %d\n", ret);
+		goto err;
+	}
+
+	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
+	intel_context_put(ce);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+		pr_err("Failed to create spinner request: %d\n", ret);
+		goto err_spin;
+	}
+
+	ret = request_add_spin(rq, &spin);
+	if (ret) {
+		i915_request_put(rq);
+		pr_err("Failed to add Spinner request: %d\n", ret);
+		goto err_spin;
+	}
+
+	ret = intel_reset_guc(gt);
+	if (ret) {
+		i915_request_put(rq);
+		pr_err("Failed to reset GuC, ret = %d\n", ret);
+		goto err_spin;
+	}
+
+	guc_status = intel_uncore_read(gt->uncore, GUC_STATUS);
+	if (!(guc_status & GS_MIA_IN_RESET)) {
+		i915_request_put(rq);
+		pr_err("GuC failed to reset: status = 0x%08X\n", guc_status);
+		ret = -EIO;
+		goto err_spin;
+	}
+
+	/* Wait for the heartbeat to cause a reset */
+	ret = intel_selftest_wait_for_rq(rq);
+	i915_request_put(rq);
+	if (ret) {
+		pr_err("Request failed to complete: %d\n", ret);
+		goto err_spin;
+	}
+
+	if (i915_reset_count(global) == reset_count) {
+		pr_err("Failed to record a GPU reset\n");
+		ret = -EINVAL;
+		goto err_spin;
+	}
+
+err_spin:
+	igt_spinner_end(&spin);
+	igt_spinner_fini(&spin);
+	intel_engine_set_heartbeat(engine, old_beat);
+
+	if (ret == 0) {
+		rq = nop_request(engine);
+		if (IS_ERR(rq)) {
+			ret = PTR_ERR(rq);
+			goto err;
+		}
+
+		ret = intel_selftest_wait_for_rq(rq);
+		i915_request_put(rq);
+		if (ret) {
+			pr_err("No-op failed to complete: %d\n", ret);
+			goto err;
+		}
+	}
+
+err:
+	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
+	kernel_context_close(ctx);
+
+	return ret;
+}
+
+int intel_guc_hang_check(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(intel_hang_guc),
+	};
+	struct intel_gt *gt = to_gt(i915);
+
+	if (intel_gt_is_wedged(gt))
+		return 0;
+
+	if (!intel_uc_uses_guc_submission(&gt->uc))
+		return 0;
+
+	return intel_gt_live_subtests(tests, gt);
+}
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index bdd290f2bf3cd..aaf8a380e5c78 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -49,5 +49,6 @@ selftest(perf, i915_perf_live_selftests)
 selftest(slpc, intel_slpc_live_selftests)
 selftest(guc, intel_guc_live_selftests)
 selftest(guc_multi_lrc, intel_guc_multi_lrc_live_selftests)
+selftest(guc_hang, intel_guc_hang_check)
 /* Here be dragons: keep last to run last! */
 selftest(late_gt_pm, intel_gt_pm_late_selftests)
-- 
2.36.0


WARNING: multiple messages have this Message-ID (diff)
From: John.C.Harrison@Intel.com
To: Intel-GFX@Lists.FreeDesktop.Org
Cc: DRI-Devel@Lists.FreeDesktop.Org,
	Rahul Kumar Singh <rahul.kumar.singh@intel.com>
Subject: [Intel-gfx] [PATCH 08/12] drm/i915/guc: Add selftest for a hung GuC
Date: Tue, 12 Jul 2022 16:31:32 -0700	[thread overview]
Message-ID: <20220712233136.1044951-9-John.C.Harrison@Intel.com> (raw)
In-Reply-To: <20220712233136.1044951-1-John.C.Harrison@Intel.com>

From: Rahul Kumar Singh <rahul.kumar.singh@intel.com>

Add a test to check that the hangcheck will recover from a submission
hang in the GuC.

Signed-off-by: Rahul Kumar Singh <rahul.kumar.singh@intel.com>
---
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   1 +
 .../drm/i915/gt/uc/selftest_guc_hangcheck.c   | 159 ++++++++++++++++++
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 3 files changed, 161 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 7537459080278..72832a4f4bac7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4937,4 +4937,5 @@ bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve)
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftest_guc.c"
 #include "selftest_guc_multi_lrc.c"
+#include "selftest_guc_hangcheck.c"
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c b/drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c
new file mode 100644
index 0000000000000..af913c4b09d37
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/uc/selftest_guc_hangcheck.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright �� 2019 Intel Corporation
+ */
+
+#include "selftests/igt_spinner.h"
+#include "selftests/igt_reset.h"
+#include "selftests/intel_scheduler_helpers.h"
+#include "gt/intel_engine_heartbeat.h"
+#include "gem/selftests/mock_context.h"
+
+#define BEAT_INTERVAL	100
+
+static struct i915_request *nop_request(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	rq = intel_engine_create_kernel_request(engine);
+	if (IS_ERR(rq))
+		return rq;
+
+	i915_request_get(rq);
+	i915_request_add(rq);
+
+	return rq;
+}
+
+static int intel_hang_guc(void *arg)
+{
+	struct intel_gt *gt = arg;
+	int ret = 0;
+	struct i915_gem_context *ctx;
+	struct intel_context *ce;
+	struct igt_spinner spin;
+	struct i915_request *rq;
+	intel_wakeref_t wakeref;
+	struct i915_gpu_error *global = &gt->i915->gpu_error;
+	struct intel_engine_cs *engine;
+	unsigned int reset_count;
+	u32 guc_status;
+	u32 old_beat;
+
+	ctx = kernel_context(gt->i915, NULL);
+	if (IS_ERR(ctx)) {
+		pr_err("Failed get kernel context: %ld\n", PTR_ERR(ctx));
+		return PTR_ERR(ctx);
+	}
+
+	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
+
+	ce = intel_context_create(gt->engine[BCS0]);
+	if (IS_ERR(ce)) {
+		ret = PTR_ERR(ce);
+		pr_err("Failed to create spinner request: %d\n", ret);
+		goto err;
+	}
+
+	engine = ce->engine;
+	reset_count = i915_reset_count(global);
+
+	old_beat = engine->props.heartbeat_interval_ms;
+	ret = intel_engine_set_heartbeat(engine, BEAT_INTERVAL);
+	if (ret) {
+		pr_err("Failed to boost heatbeat interval: %d\n", ret);
+		goto err;
+	}
+
+	ret = igt_spinner_init(&spin, engine->gt);
+	if (ret) {
+		pr_err("Failed to create spinner: %d\n", ret);
+		goto err;
+	}
+
+	rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
+	intel_context_put(ce);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+		pr_err("Failed to create spinner request: %d\n", ret);
+		goto err_spin;
+	}
+
+	ret = request_add_spin(rq, &spin);
+	if (ret) {
+		i915_request_put(rq);
+		pr_err("Failed to add Spinner request: %d\n", ret);
+		goto err_spin;
+	}
+
+	ret = intel_reset_guc(gt);
+	if (ret) {
+		i915_request_put(rq);
+		pr_err("Failed to reset GuC, ret = %d\n", ret);
+		goto err_spin;
+	}
+
+	guc_status = intel_uncore_read(gt->uncore, GUC_STATUS);
+	if (!(guc_status & GS_MIA_IN_RESET)) {
+		i915_request_put(rq);
+		pr_err("GuC failed to reset: status = 0x%08X\n", guc_status);
+		ret = -EIO;
+		goto err_spin;
+	}
+
+	/* Wait for the heartbeat to cause a reset */
+	ret = intel_selftest_wait_for_rq(rq);
+	i915_request_put(rq);
+	if (ret) {
+		pr_err("Request failed to complete: %d\n", ret);
+		goto err_spin;
+	}
+
+	if (i915_reset_count(global) == reset_count) {
+		pr_err("Failed to record a GPU reset\n");
+		ret = -EINVAL;
+		goto err_spin;
+	}
+
+err_spin:
+	igt_spinner_end(&spin);
+	igt_spinner_fini(&spin);
+	intel_engine_set_heartbeat(engine, old_beat);
+
+	if (ret == 0) {
+		rq = nop_request(engine);
+		if (IS_ERR(rq)) {
+			ret = PTR_ERR(rq);
+			goto err;
+		}
+
+		ret = intel_selftest_wait_for_rq(rq);
+		i915_request_put(rq);
+		if (ret) {
+			pr_err("No-op failed to complete: %d\n", ret);
+			goto err;
+		}
+	}
+
+err:
+	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
+	kernel_context_close(ctx);
+
+	return ret;
+}
+
+int intel_guc_hang_check(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(intel_hang_guc),
+	};
+	struct intel_gt *gt = to_gt(i915);
+
+	if (intel_gt_is_wedged(gt))
+		return 0;
+
+	if (!intel_uc_uses_guc_submission(&gt->uc))
+		return 0;
+
+	return intel_gt_live_subtests(tests, gt);
+}
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index bdd290f2bf3cd..aaf8a380e5c78 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -49,5 +49,6 @@ selftest(perf, i915_perf_live_selftests)
 selftest(slpc, intel_slpc_live_selftests)
 selftest(guc, intel_guc_live_selftests)
 selftest(guc_multi_lrc, intel_guc_multi_lrc_live_selftests)
+selftest(guc_hang, intel_guc_hang_check)
 /* Here be dragons: keep last to run last! */
 selftest(late_gt_pm, intel_gt_pm_late_selftests)
-- 
2.36.0


  parent reply	other threads:[~2022-07-12 23:31 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-12 23:31 [PATCH 00/12] Random assortment of (mostly) GuC related patches John.C.Harrison
2022-07-12 23:31 ` [Intel-gfx] " John.C.Harrison
2022-07-12 23:31 ` [PATCH 01/12] drm/i915: Remove bogus GEM_BUG_ON in unpark John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-18 12:15   ` Tvrtko Ursulin
2022-07-19  0:05     ` John Harrison
2022-07-19  9:42       ` Tvrtko Ursulin
2022-07-21  0:54         ` John Harrison
2022-07-21  9:24           ` Tvrtko Ursulin
2022-07-22 19:09             ` John Harrison
2022-07-12 23:31 ` [PATCH 02/12] drm/i915/guc: Don't call ring_is_idle in GuC submission John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-18 12:26   ` Tvrtko Ursulin
2022-07-19  0:09     ` John Harrison
2022-07-19  9:49       ` Tvrtko Ursulin
2022-07-19 10:14         ` Tvrtko Ursulin
2022-07-12 23:31 ` [PATCH 03/12] drm/i915/guc: Fix issues with live_preempt_cancel John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-12 23:31 ` [PATCH 04/12] drm/i915/guc: Add GuC <-> kernel time stamp translation information John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-12 23:31 ` [PATCH 05/12] drm/i915/guc: Record CTB info in error logs John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-12 23:31 ` [PATCH 06/12] drm/i915/guc: Use streaming loads to speed up dumping the guc log John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-22 20:05   ` John Harrison
2022-07-12 23:31 ` [PATCH 07/12] drm/i915/guc: Route semaphores to GuC for Gen12+ John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-13  0:51   ` Matthew Brost
2022-07-13  0:51     ` [Intel-gfx] " Matthew Brost
2022-07-15 17:21   ` Ceraolo Spurio, Daniele
2022-07-15 17:21     ` [Intel-gfx] " Ceraolo Spurio, Daniele
2022-07-12 23:31 ` John.C.Harrison [this message]
2022-07-12 23:31   ` [Intel-gfx] [PATCH 08/12] drm/i915/guc: Add selftest for a hung GuC John.C.Harrison
2022-07-12 23:31 ` [PATCH 09/12] drm/i915/selftest: Cope with not having an RCS engine John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-13  0:48   ` Matthew Brost
2022-07-13  0:48     ` [Intel-gfx] " Matthew Brost
2022-07-12 23:31 ` [PATCH 10/12] drm/i915/guc: Support larger contexts on newer hardware John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-18 12:35   ` Tvrtko Ursulin
2022-07-19  0:13     ` John Harrison
2022-07-19  9:56       ` Tvrtko Ursulin
2022-07-22 19:32         ` John Harrison
2022-07-25 11:24           ` Tvrtko Ursulin
2022-07-12 23:31 ` [PATCH 11/12] drm/i915/guc: Don't abort on CTB_UNUSED status John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-18 12:36   ` Tvrtko Ursulin
2022-07-19  0:16     ` John Harrison
2022-07-12 23:31 ` [PATCH 12/12] drm/i915/guc: Add a helper for log buffer size John.C.Harrison
2022-07-12 23:31   ` [Intel-gfx] " John.C.Harrison
2022-07-13  0:46   ` Matthew Brost
2022-07-13  0:46     ` [Intel-gfx] " Matthew Brost
2022-07-13  0:31 ` [Intel-gfx] ✗ Fi.CI.BUILD: warning for Random assortment of (mostly) GuC related patches Patchwork
2022-07-13 20:09 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Random assortment of (mostly) GuC related patches (rev2) Patchwork
2022-07-13 20:27 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2022-07-14  1:41 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220712233136.1044951-9-John.C.Harrison@Intel.com \
    --to=john.c.harrison@intel.com \
    --cc=DRI-Devel@Lists.FreeDesktop.Org \
    --cc=Intel-GFX@Lists.FreeDesktop.Org \
    --cc=rahul.kumar.singh@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.