dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/4] More fixes/tweaks to GuC support
@ 2021-12-11  6:58 John.C.Harrison
  2021-12-11  6:58 ` [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps John.C.Harrison
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: John.C.Harrison @ 2021-12-11  6:58 UTC (permalink / raw)
  To: Intel-GFX; +Cc: John Harrison, DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

Allow engine resets on RCS, report problems with engine resets,
improve GuC log usability.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>


John Harrison (4):
  drm/i915/guc: Speed up GuC log dumps
  drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM
  drm/i915/guc: Flag an error if an engine reset fails
  drm/i915: Improve long running OCL w/a for GuC submission

 drivers/gpu/drm/i915/gt/intel_engine_cs.c     | 42 +++++++++++++-
 drivers/gpu/drm/i915/gt/intel_gt_debugfs.h    | 21 +++++--
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.h    |  5 +-
 .../drm/i915/gt/uc/intel_guc_log_debugfs.c    | 58 ++++++++++++++++++-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 ++++-
 5 files changed, 125 insertions(+), 15 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps
  2021-12-11  6:58 [PATCH 0/4] More fixes/tweaks to GuC support John.C.Harrison
@ 2021-12-11  6:58 ` John.C.Harrison
  2021-12-20 14:43   ` Daniele Ceraolo Spurio
  2021-12-11  6:58 ` [PATCH 2/4] drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM John.C.Harrison
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 8+ messages in thread
From: John.C.Harrison @ 2021-12-11  6:58 UTC (permalink / raw)
  To: Intel-GFX; +Cc: John Harrison, DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

Add support for telling the debugfs interface the size of the GuC log
dump in advance. Without that, the underlying framework keeps calling
the 'show' function with larger and larger buffer allocations until it
fits. That means reading the log from graphics memory many times - 16
times with the full 18MB log size.

v2: Don't return error codes from size query. Report overflow in the
error dump as well (review feedback from Daniele).

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/intel_gt_debugfs.h    | 21 +++++--
 .../drm/i915/gt/uc/intel_guc_log_debugfs.c    | 58 ++++++++++++++++++-
 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
index e307ceb99031..17e79b735cfe 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
@@ -10,11 +10,7 @@
 
 struct intel_gt;
 
-#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name)				\
-	static int __name ## _open(struct inode *inode, struct file *file) \
-{									\
-	return single_open(file, __name ## _show, inode->i_private);	\
-}									\
+#define __GT_DEBUGFS_ATTRIBUTE_FOPS(__name)				\
 static const struct file_operations __name ## _fops = {			\
 	.owner = THIS_MODULE,						\
 	.open = __name ## _open,					\
@@ -23,6 +19,21 @@ static const struct file_operations __name ## _fops = {			\
 	.release = single_release,					\
 }
 
+#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name)			\
+static int __name ## _open(struct inode *inode, struct file *file)	\
+{									\
+	return single_open(file, __name ## _show, inode->i_private);	\
+}									\
+__GT_DEBUGFS_ATTRIBUTE_FOPS(__name)
+
+#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(__name, __size_vf)		\
+static int __name ## _open(struct inode *inode, struct file *file)		\
+{										\
+	return single_open_size(file, __name ## _show, inode->i_private,	\
+			    __size_vf(inode->i_private));			\
+}										\
+__GT_DEBUGFS_ATTRIBUTE_FOPS(__name)
+
 void intel_gt_debugfs_register(struct intel_gt *gt);
 
 struct intel_gt_debugfs_file {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c
index 8fd068049376..ddfbe334689f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c
@@ -10,22 +10,74 @@
 #include "intel_guc.h"
 #include "intel_guc_log.h"
 #include "intel_guc_log_debugfs.h"
+#include "intel_uc.h"
+
+static u32 obj_to_guc_log_dump_size(struct drm_i915_gem_object *obj)
+{
+	u32 size;
+
+	if (!obj)
+		return PAGE_SIZE;
+
+	/* "0x%08x 0x%08x 0x%08x 0x%08x\n" => 16 bytes -> 44 chars => x2.75 */
+	size = ((obj->base.size * 11) + 3) / 4;
+
+	/* Add padding for final blank line, any extra header info, etc. */
+	size = PAGE_ALIGN(size + PAGE_SIZE);
+
+	return size;
+}
+
+static u32 guc_log_dump_size(struct intel_guc_log *log)
+{
+	struct intel_guc *guc = log_to_guc(log);
+
+	if (!intel_guc_is_supported(guc))
+		return PAGE_SIZE;
+
+	if (!log->vma)
+		return PAGE_SIZE;
+
+	return obj_to_guc_log_dump_size(log->vma->obj);
+}
 
 static int guc_log_dump_show(struct seq_file *m, void *data)
 {
 	struct drm_printer p = drm_seq_file_printer(m);
+	int ret;
 
-	return intel_guc_log_dump(m->private, &p, false);
+	ret = intel_guc_log_dump(m->private, &p, false);
+
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m))
+		pr_warn_once("preallocated size:%zx for %s exceeded\n",
+			     m->size, __func__);
+
+	return ret;
+}
+DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_log_dump, guc_log_dump_size);
+
+static u32 guc_load_err_dump_size(struct intel_guc_log *log)
+{
+	struct intel_guc *guc = log_to_guc(log);
+	struct intel_uc *uc = container_of(guc, struct intel_uc, guc);
+
+	if (!intel_guc_is_supported(guc))
+		return PAGE_SIZE;
+
+	return obj_to_guc_log_dump_size(uc->load_err_log);
 }
-DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_log_dump);
 
 static int guc_load_err_log_dump_show(struct seq_file *m, void *data)
 {
 	struct drm_printer p = drm_seq_file_printer(m);
 
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m))
+		pr_warn_once("preallocated size:%zx for %s exceeded\n",
+			     m->size, __func__);
+
 	return intel_guc_log_dump(m->private, &p, true);
 }
-DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_load_err_log_dump);
+DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_load_err_log_dump, guc_load_err_dump_size);
 
 static int guc_log_level_get(void *data, u64 *val)
 {
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/4] drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM
  2021-12-11  6:58 [PATCH 0/4] More fixes/tweaks to GuC support John.C.Harrison
  2021-12-11  6:58 ` [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps John.C.Harrison
@ 2021-12-11  6:58 ` John.C.Harrison
  2021-12-11  6:58 ` [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails John.C.Harrison
  2021-12-11  6:58 ` [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission John.C.Harrison
  3 siblings, 0 replies; 8+ messages in thread
From: John.C.Harrison @ 2021-12-11  6:58 UTC (permalink / raw)
  To: Intel-GFX; +Cc: Matthew Brost, John Harrison, DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

Lots of testing is done with the DEBUG_GEM config option enabled but
not the DEBUG_GUC option. That means we only get teeny-tiny GuC logs
which are not hugely useful. Enabling full DEBUG_GUC also spews lots
of other detailed output that is not generally desired. However,
bigger GuC logs are extremely useful for almost any regression debug.
So enable bigger logs for DEBUG_GEM builds as well.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
index ac1ee1d5ce10..fe6ab7550a14 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.h
@@ -15,9 +15,12 @@
 
 struct intel_guc;
 
-#ifdef CONFIG_DRM_I915_DEBUG_GUC
+#if defined(CONFIG_DRM_I915_DEBUG_GUC)
 #define CRASH_BUFFER_SIZE	SZ_2M
 #define DEBUG_BUFFER_SIZE	SZ_16M
+#elif defined(CONFIG_DRM_I915_DEBUG_GEM)
+#define CRASH_BUFFER_SIZE	SZ_1M
+#define DEBUG_BUFFER_SIZE	SZ_2M
 #else
 #define CRASH_BUFFER_SIZE	SZ_8K
 #define DEBUG_BUFFER_SIZE	SZ_64K
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails
  2021-12-11  6:58 [PATCH 0/4] More fixes/tweaks to GuC support John.C.Harrison
  2021-12-11  6:58 ` [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps John.C.Harrison
  2021-12-11  6:58 ` [PATCH 2/4] drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM John.C.Harrison
@ 2021-12-11  6:58 ` John.C.Harrison
  2021-12-16  1:16   ` [Intel-gfx] " Daniele Ceraolo Spurio
  2021-12-11  6:58 ` [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission John.C.Harrison
  3 siblings, 1 reply; 8+ messages in thread
From: John.C.Harrison @ 2021-12-11  6:58 UTC (permalink / raw)
  To: Intel-GFX; +Cc: John Harrison, DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

If GuC encounters an error during engine reset, the i915 driver
promotes to full GT reset. This includes an info message about why the
reset is happening. However, that is not treated as a failure by any
of the CI systems because resets are an expected occurrance during
testing. This kind of failure is a major problem and should never
happen. So, complain more loudly and make sure CI notices.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 97311119da6f..6015815f1da0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -4018,11 +4018,12 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
 					 const u32 *msg, u32 len)
 {
 	struct intel_engine_cs *engine;
+	struct intel_gt *gt = guc_to_gt(guc);
 	u8 guc_class, instance;
 	u32 reason;
 
 	if (unlikely(len != 3)) {
-		drm_err(&guc_to_gt(guc)->i915->drm, "Invalid length %u", len);
+		drm_err(&gt->i915->drm, "Invalid length %u", len);
 		return -EPROTO;
 	}
 
@@ -4032,12 +4033,19 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
 
 	engine = guc_lookup_engine(guc, guc_class, instance);
 	if (unlikely(!engine)) {
-		drm_err(&guc_to_gt(guc)->i915->drm,
+		drm_err(&gt->i915->drm,
 			"Invalid engine %d:%d", guc_class, instance);
 		return -EPROTO;
 	}
 
-	intel_gt_handle_error(guc_to_gt(guc), engine->mask,
+	/*
+	 * This is an unexpected failure of a hardware feature. So, log a real
+	 * error message not just the informational that comes with the reset.
+	 */
+	drm_err(&gt->i915->drm, "GuC engine reset request failed on %d:%d (%s) because %d",
+		guc_class, instance, engine->name, reason);
+
+	intel_gt_handle_error(gt, engine->mask,
 			      I915_ERROR_CAPTURE,
 			      "GuC failed to reset %s (reason=0x%08x)\n",
 			      engine->name, reason);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission
  2021-12-11  6:58 [PATCH 0/4] More fixes/tweaks to GuC support John.C.Harrison
                   ` (2 preceding siblings ...)
  2021-12-11  6:58 ` [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails John.C.Harrison
@ 2021-12-11  6:58 ` John.C.Harrison
  2021-12-16  1:11   ` [Intel-gfx] " Daniele Ceraolo Spurio
  3 siblings, 1 reply; 8+ messages in thread
From: John.C.Harrison @ 2021-12-11  6:58 UTC (permalink / raw)
  To: Intel-GFX; +Cc: John Harrison, DRI-Devel

From: John Harrison <John.C.Harrison@Intel.com>

A workaround was added to the driver to allow OpenCL workloads to run
'forever' by disabling pre-emption on the RCS engine for Gen12.
It is not totally unbound as the heartbeat will kick in eventually
and cause a reset of the hung engine.

However, this does not work well in GuC submission mode. In GuC mode,
the pre-emption timeout is how GuC detects hung contexts and triggers
a per engine reset. Thus, disabling the timeout means also losing all
per engine reset ability. A full GT reset will still occur when the
heartbeat finally expires, but that is a much more destructive and
undesirable mechanism.

The purpose of the workaround is actually to give OpenCL tasks longer
to reach a pre-emption point after a pre-emption request has been
issued. This is necessary because Gen12 does not support mid-thread
pre-emption and OpenCL can have long running threads.

So, rather than disabling the timeout completely, just set it to a
'long' value. Likewise, bump the heartbeat interval. That gives the
OpenCL thread sufficient time to reach a pre-emption point without
being killed off either by the GuC or by the heartbeat.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c | 42 +++++++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 352254e001b4..26af8d60fe2b 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -382,9 +382,45 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id,
 	engine->props.timeslice_duration_ms =
 		CONFIG_DRM_I915_TIMESLICE_DURATION;
 
-	/* Override to uninterruptible for OpenCL workloads. */
-	if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS)
-		engine->props.preempt_timeout_ms = 0;
+	/*
+	 * Mid-thread pre-emption is not available in Gen12. Unfortunately,
+	 * some OpenCL workloads run quite long threads. That means they get
+	 * reset due to not pre-empting in a timely manner.
+	 * The execlist solution was to disable pre-emption completely.
+	 * However, pre-emption timeouts are the way GuC detects hung contexts
+	 * and triggers engine resets. Thus, without pre-emption, there is no
+	 * per engine reset. And full GT reset is much more intrusive. So keep
+	 * the timeout for GuC submission platforms and just bump it to be
+	 * much larger. Also bump the heartbeat timeout to match, otherwise
+	 * the heartbeat can expire before the pre-emption can timeout and
+	 * thus trigger a full GT reset anyway.
+	 */
+	if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS) {
+		if (intel_uc_wants_guc_submission(&gt->uc)) {
+			const unsigned long min_preempt = 7500;
+			const unsigned long min_beat = 5000;
+
+			if (engine->props.preempt_timeout_ms &&
+			    engine->props.preempt_timeout_ms < min_preempt) {
+				drm_info(&gt->i915->drm, "Bumping pre-emption timeout from %ld to %ld on %s to allow slow compute pre-emption\n",
+					 engine->props.preempt_timeout_ms,
+					 min_preempt, engine->name);
+
+				engine->props.preempt_timeout_ms = min_preempt;
+			}
+
+			if (engine->props.heartbeat_interval_ms &&
+			    engine->props.heartbeat_interval_ms < min_beat) {
+				drm_info(&gt->i915->drm, "Bumping heartbeat interval from %ld to %ld on %s to allow slow compute pre-emption\n",
+					 engine->props.heartbeat_interval_ms,
+					 min_beat, engine->name);
+
+				engine->props.heartbeat_interval_ms = min_beat;
+			}
+		} else {
+			engine->props.preempt_timeout_ms = 0;
+		}
+	}
 
 	engine->defaults = engine->props; /* never to change again */
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [Intel-gfx] [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission
  2021-12-11  6:58 ` [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission John.C.Harrison
@ 2021-12-16  1:11   ` Daniele Ceraolo Spurio
  0 siblings, 0 replies; 8+ messages in thread
From: Daniele Ceraolo Spurio @ 2021-12-16  1:11 UTC (permalink / raw)
  To: John.C.Harrison, Intel-GFX; +Cc: DRI-Devel



On 12/10/2021 10:58 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> A workaround was added to the driver to allow OpenCL workloads to run
> 'forever' by disabling pre-emption on the RCS engine for Gen12.
> It is not totally unbound as the heartbeat will kick in eventually
> and cause a reset of the hung engine.
>
> However, this does not work well in GuC submission mode. In GuC mode,
> the pre-emption timeout is how GuC detects hung contexts and triggers
> a per engine reset. Thus, disabling the timeout means also losing all
> per engine reset ability. A full GT reset will still occur when the
> heartbeat finally expires, but that is a much more destructive and
> undesirable mechanism.
>
> The purpose of the workaround is actually to give OpenCL tasks longer
> to reach a pre-emption point after a pre-emption request has been
> issued. This is necessary because Gen12 does not support mid-thread
> pre-emption and OpenCL can have long running threads.
>
> So, rather than disabling the timeout completely, just set it to a
> 'long' value. Likewise, bump the heartbeat interval. That gives the
> OpenCL thread sufficient time to reach a pre-emption point without
> being killed off either by the GuC or by the heartbeat.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>

On the approach and the code:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Please get an ack from the interested parties on the actual numbers used 
for the timeouts (they look big enough to me, but I'm not familiar with 
the compute use-case).

Daniele

> ---
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c | 42 +++++++++++++++++++++--
>   1 file changed, 39 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index 352254e001b4..26af8d60fe2b 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -382,9 +382,45 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id,
>   	engine->props.timeslice_duration_ms =
>   		CONFIG_DRM_I915_TIMESLICE_DURATION;
>   
> -	/* Override to uninterruptible for OpenCL workloads. */
> -	if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS)
> -		engine->props.preempt_timeout_ms = 0;
> +	/*
> +	 * Mid-thread pre-emption is not available in Gen12. Unfortunately,
> +	 * some OpenCL workloads run quite long threads. That means they get
> +	 * reset due to not pre-empting in a timely manner.
> +	 * The execlist solution was to disable pre-emption completely.
> +	 * However, pre-emption timeouts are the way GuC detects hung contexts
> +	 * and triggers engine resets. Thus, without pre-emption, there is no
> +	 * per engine reset. And full GT reset is much more intrusive. So keep
> +	 * the timeout for GuC submission platforms and just bump it to be
> +	 * much larger. Also bump the heartbeat timeout to match, otherwise
> +	 * the heartbeat can expire before the pre-emption can timeout and
> +	 * thus trigger a full GT reset anyway.
> +	 */
> +	if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS) {
> +		if (intel_uc_wants_guc_submission(&gt->uc)) {
> +			const unsigned long min_preempt = 7500;
> +			const unsigned long min_beat = 5000;
> +
> +			if (engine->props.preempt_timeout_ms &&
> +			    engine->props.preempt_timeout_ms < min_preempt) {
> +				drm_info(&gt->i915->drm, "Bumping pre-emption timeout from %ld to %ld on %s to allow slow compute pre-emption\n",
> +					 engine->props.preempt_timeout_ms,
> +					 min_preempt, engine->name);
> +
> +				engine->props.preempt_timeout_ms = min_preempt;
> +			}
> +
> +			if (engine->props.heartbeat_interval_ms &&
> +			    engine->props.heartbeat_interval_ms < min_beat) {
> +				drm_info(&gt->i915->drm, "Bumping heartbeat interval from %ld to %ld on %s to allow slow compute pre-emption\n",
> +					 engine->props.heartbeat_interval_ms,
> +					 min_beat, engine->name);
> +
> +				engine->props.heartbeat_interval_ms = min_beat;
> +			}
> +		} else {
> +			engine->props.preempt_timeout_ms = 0;
> +		}
> +	}
>   
>   	engine->defaults = engine->props; /* never to change again */
>   


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [Intel-gfx] [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails
  2021-12-11  6:58 ` [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails John.C.Harrison
@ 2021-12-16  1:16   ` Daniele Ceraolo Spurio
  0 siblings, 0 replies; 8+ messages in thread
From: Daniele Ceraolo Spurio @ 2021-12-16  1:16 UTC (permalink / raw)
  To: John.C.Harrison, Intel-GFX; +Cc: DRI-Devel



On 12/10/2021 10:58 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> If GuC encounters an error during engine reset, the i915 driver
> promotes to full GT reset. This includes an info message about why the
> reset is happening. However, that is not treated as a failure by any
> of the CI systems because resets are an expected occurrance during
> testing. This kind of failure is a major problem and should never
> happen. So, complain more loudly and make sure CI notices.
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c | 14 +++++++++++---
>   1 file changed, 11 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 97311119da6f..6015815f1da0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -4018,11 +4018,12 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
>   					 const u32 *msg, u32 len)
>   {
>   	struct intel_engine_cs *engine;
> +	struct intel_gt *gt = guc_to_gt(guc);
>   	u8 guc_class, instance;
>   	u32 reason;
>   
>   	if (unlikely(len != 3)) {
> -		drm_err(&guc_to_gt(guc)->i915->drm, "Invalid length %u", len);
> +		drm_err(&gt->i915->drm, "Invalid length %u", len);
>   		return -EPROTO;
>   	}
>   
> @@ -4032,12 +4033,19 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
>   
>   	engine = guc_lookup_engine(guc, guc_class, instance);
>   	if (unlikely(!engine)) {
> -		drm_err(&guc_to_gt(guc)->i915->drm,
> +		drm_err(&gt->i915->drm,
>   			"Invalid engine %d:%d", guc_class, instance);
>   		return -EPROTO;
>   	}
>   
> -	intel_gt_handle_error(guc_to_gt(guc), engine->mask,
> +	/*
> +	 * This is an unexpected failure of a hardware feature. So, log a real
> +	 * error message not just the informational that comes with the reset.
> +	 */
> +	drm_err(&gt->i915->drm, "GuC engine reset request failed on %d:%d (%s) because %d",

In the error handling called below, the reason is logged as 0x%08x, so 
IMO we should do the same here for consistency.
With that:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> +		guc_class, instance, engine->name, reason);
> +
> +	intel_gt_handle_error(gt, engine->mask,
>   			      I915_ERROR_CAPTURE,
>   			      "GuC failed to reset %s (reason=0x%08x)\n",
>   			      engine->name, reason);


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps
  2021-12-11  6:58 ` [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps John.C.Harrison
@ 2021-12-20 14:43   ` Daniele Ceraolo Spurio
  0 siblings, 0 replies; 8+ messages in thread
From: Daniele Ceraolo Spurio @ 2021-12-20 14:43 UTC (permalink / raw)
  To: John.C.Harrison, Intel-GFX; +Cc: DRI-Devel



On 12/10/2021 10:58 PM, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
>
> Add support for telling the debugfs interface the size of the GuC log
> dump in advance. Without that, the underlying framework keeps calling
> the 'show' function with larger and larger buffer allocations until it
> fits. That means reading the log from graphics memory many times - 16
> times with the full 18MB log size.
>
> v2: Don't return error codes from size query. Report overflow in the
> error dump as well (review feedback from Daniele).
>
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> ---
>   drivers/gpu/drm/i915/gt/intel_gt_debugfs.h    | 21 +++++--
>   .../drm/i915/gt/uc/intel_guc_log_debugfs.c    | 58 ++++++++++++++++++-
>   2 files changed, 71 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
> index e307ceb99031..17e79b735cfe 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_debugfs.h
> @@ -10,11 +10,7 @@
>   
>   struct intel_gt;
>   
> -#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name)				\
> -	static int __name ## _open(struct inode *inode, struct file *file) \
> -{									\
> -	return single_open(file, __name ## _show, inode->i_private);	\
> -}									\
> +#define __GT_DEBUGFS_ATTRIBUTE_FOPS(__name)				\
>   static const struct file_operations __name ## _fops = {			\
>   	.owner = THIS_MODULE,						\
>   	.open = __name ## _open,					\
> @@ -23,6 +19,21 @@ static const struct file_operations __name ## _fops = {			\
>   	.release = single_release,					\
>   }
>   
> +#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(__name)			\
> +static int __name ## _open(struct inode *inode, struct file *file)	\
> +{									\
> +	return single_open(file, __name ## _show, inode->i_private);	\
> +}									\
> +__GT_DEBUGFS_ATTRIBUTE_FOPS(__name)
> +
> +#define DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(__name, __size_vf)		\
> +static int __name ## _open(struct inode *inode, struct file *file)		\
> +{										\
> +	return single_open_size(file, __name ## _show, inode->i_private,	\
> +			    __size_vf(inode->i_private));			\
> +}										\
> +__GT_DEBUGFS_ATTRIBUTE_FOPS(__name)
> +
>   void intel_gt_debugfs_register(struct intel_gt *gt);
>   
>   struct intel_gt_debugfs_file {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c
> index 8fd068049376..ddfbe334689f 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log_debugfs.c
> @@ -10,22 +10,74 @@
>   #include "intel_guc.h"
>   #include "intel_guc_log.h"
>   #include "intel_guc_log_debugfs.h"
> +#include "intel_uc.h"
> +
> +static u32 obj_to_guc_log_dump_size(struct drm_i915_gem_object *obj)
> +{
> +	u32 size;
> +
> +	if (!obj)
> +		return PAGE_SIZE;
> +
> +	/* "0x%08x 0x%08x 0x%08x 0x%08x\n" => 16 bytes -> 44 chars => x2.75 */
> +	size = ((obj->base.size * 11) + 3) / 4;
> +
> +	/* Add padding for final blank line, any extra header info, etc. */
> +	size = PAGE_ALIGN(size + PAGE_SIZE);
> +
> +	return size;
> +}
> +
> +static u32 guc_log_dump_size(struct intel_guc_log *log)
> +{
> +	struct intel_guc *guc = log_to_guc(log);
> +
> +	if (!intel_guc_is_supported(guc))
> +		return PAGE_SIZE;
> +
> +	if (!log->vma)
> +		return PAGE_SIZE;
> +
> +	return obj_to_guc_log_dump_size(log->vma->obj);
> +}
>   
>   static int guc_log_dump_show(struct seq_file *m, void *data)
>   {
>   	struct drm_printer p = drm_seq_file_printer(m);
> +	int ret;
>   
> -	return intel_guc_log_dump(m->private, &p, false);
> +	ret = intel_guc_log_dump(m->private, &p, false);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m))
> +		pr_warn_once("preallocated size:%zx for %s exceeded\n",
> +			     m->size, __func__);
> +
> +	return ret;
> +}
> +DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_log_dump, guc_log_dump_size);
> +
> +static u32 guc_load_err_dump_size(struct intel_guc_log *log)
> +{
> +	struct intel_guc *guc = log_to_guc(log);
> +	struct intel_uc *uc = container_of(guc, struct intel_uc, guc);
> +
> +	if (!intel_guc_is_supported(guc))
> +		return PAGE_SIZE;
> +
> +	return obj_to_guc_log_dump_size(uc->load_err_log);
>   }
> -DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_log_dump);
>   
>   static int guc_load_err_log_dump_show(struct seq_file *m, void *data)
>   {
>   	struct drm_printer p = drm_seq_file_printer(m);
>   
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) && seq_has_overflowed(m))
> +		pr_warn_once("preallocated size:%zx for %s exceeded\n",
> +			     m->size, __func__);
> +
>   	return intel_guc_log_dump(m->private, &p, true);
>   }
> -DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE(guc_load_err_log_dump);
> +DEFINE_INTEL_GT_DEBUGFS_ATTRIBUTE_WITH_SIZE(guc_load_err_log_dump, guc_load_err_dump_size);
>   
>   static int guc_log_level_get(void *data, u64 *val)
>   {


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-12-20 14:43 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-11  6:58 [PATCH 0/4] More fixes/tweaks to GuC support John.C.Harrison
2021-12-11  6:58 ` [PATCH 1/4] drm/i915/guc: Speed up GuC log dumps John.C.Harrison
2021-12-20 14:43   ` Daniele Ceraolo Spurio
2021-12-11  6:58 ` [PATCH 2/4] drm/i915/guc: Increase GuC log size for CONFIG_DEBUG_GEM John.C.Harrison
2021-12-11  6:58 ` [PATCH 3/4] drm/i915/guc: Flag an error if an engine reset fails John.C.Harrison
2021-12-16  1:16   ` [Intel-gfx] " Daniele Ceraolo Spurio
2021-12-11  6:58 ` [PATCH 4/4] drm/i915: Improve long running OCL w/a for GuC submission John.C.Harrison
2021-12-16  1:11   ` [Intel-gfx] " Daniele Ceraolo Spurio

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).