Re: [Intel-gfx] [PATCH v2 1/3] drm/i915/guc: Limit scheduling properties to avoid overflow

From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: John.C.Harrison@Intel.com, Intel-GFX@Lists.FreeDesktop.Org
Cc: DRI-Devel@Lists.FreeDesktop.Org
Subject: Re: [Intel-gfx] [PATCH v2 1/3] drm/i915/guc: Limit scheduling properties to avoid overflow
Date: Wed, 2 Mar 2022 09:49:03 +0000	[thread overview]
Message-ID: <abf830db-d613-8374-038f-c0d2a93df73f@linux.intel.com> (raw)
In-Reply-To: <20220225204151.2248027-2-John.C.Harrison@Intel.com>


On 25/02/2022 20:41, John.C.Harrison@Intel.com wrote:
> From: John Harrison <John.C.Harrison@Intel.com>
> 
> GuC converts the pre-emption timeout and timeslice quantum values into
> clock ticks internally. That significantly reduces the point of 32bit
> overflow. On current platforms, worst case scenario is approximately
> 110 seconds. Rather than allowing the user to set higher values and
> then get confused by early timeouts, add limits when setting these
> values.
> 
> v2: Add helper functins for clamping (review feedback from Tvrtko).
> 
> Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> (v1)
> ---
>   drivers/gpu/drm/i915/gt/intel_engine.h      |  6 ++
>   drivers/gpu/drm/i915/gt/intel_engine_cs.c   | 69 +++++++++++++++++++++
>   drivers/gpu/drm/i915/gt/sysfs_engines.c     | 25 +++++---
>   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h |  9 +++
>   4 files changed, 99 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> index be4b1e65442f..5a9186f784c4 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> @@ -349,4 +349,10 @@ intel_engine_get_hung_context(struct intel_engine_cs *engine)
>   	return engine->hung_ce;
>   }
>   
> +u64 intel_clamp_heartbeat_interval_ms(struct intel_engine_cs *engine, u64 value);
> +u64 intel_clamp_max_busywait_duration_ns(struct intel_engine_cs *engine, u64 value);
> +u64 intel_clamp_preempt_timeout_ms(struct intel_engine_cs *engine, u64 value);
> +u64 intel_clamp_stop_timeout_ms(struct intel_engine_cs *engine, u64 value);
> +u64 intel_clamp_timeslice_duration_ms(struct intel_engine_cs *engine, u64 value);
> +
>   #endif /* _INTEL_RINGBUFFER_H_ */
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> index e855c801ba28..7ad9e6006656 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
> @@ -399,6 +399,26 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id,
>   	if (GRAPHICS_VER(i915) == 12 && engine->class == RENDER_CLASS)
>   		engine->props.preempt_timeout_ms = 0;
>   
> +	/* Cap properties according to any system limits */
> +#define CLAMP_PROP(field) \
> +	do { \
> +		u64 clamp = intel_clamp_##field(engine, engine->props.field); \
> +		if (clamp != engine->props.field) { \
> +			drm_notice(&engine->i915->drm, \
> +				   "Warning, clamping %s to %lld to prevent overflow\n", \
> +				   #field, clamp); \
> +			engine->props.field = clamp; \
> +		} \
> +	} while (0)
> +
> +	CLAMP_PROP(heartbeat_interval_ms);
> +	CLAMP_PROP(max_busywait_duration_ns);
> +	CLAMP_PROP(preempt_timeout_ms);
> +	CLAMP_PROP(stop_timeout_ms);
> +	CLAMP_PROP(timeslice_duration_ms);
> +
> +#undef CLAMP_PROP
> +
>   	engine->defaults = engine->props; /* never to change again */
>   
>   	engine->context_size = intel_engine_context_size(gt, engine->class);
> @@ -421,6 +441,55 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id,
>   	return 0;
>   }
>   
> +u64 intel_clamp_heartbeat_interval_ms(struct intel_engine_cs *engine, u64 value)
> +{
> +	value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
> +
> +	return value;
> +}
> +
> +u64 intel_clamp_max_busywait_duration_ns(struct intel_engine_cs *engine, u64 value)
> +{
> +	value = min(value, jiffies_to_nsecs(2));
> +
> +	return value;
> +}
> +
> +u64 intel_clamp_preempt_timeout_ms(struct intel_engine_cs *engine, u64 value)
> +{
> +	/*
> +	 * NB: The GuC API only supports 32bit values. However, the limit is further
> +	 * reduced due to internal calculations which would otherwise overflow.
> +	 */
> +	if (intel_guc_submission_is_wanted(&engine->gt->uc.guc))
> +		value = min_t(u64, value, GUC_POLICY_MAX_PREEMPT_TIMEOUT_MS);
> +
> +	value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
> +
> +	return value;
> +}
> +
> +u64 intel_clamp_stop_timeout_ms(struct intel_engine_cs *engine, u64 value)
> +{
> +	value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
> +
> +	return value;
> +}
> +
> +u64 intel_clamp_timeslice_duration_ms(struct intel_engine_cs *engine, u64 value)
> +{
> +	/*
> +	 * NB: The GuC API only supports 32bit values. However, the limit is further
> +	 * reduced due to internal calculations which would otherwise overflow.
> +	 */
> +	if (intel_guc_submission_is_wanted(&engine->gt->uc.guc))
> +		value = min_t(u64, value, GUC_POLICY_MAX_EXEC_QUANTUM_MS);
> +
> +	value = min_t(u64, value, jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT));
> +
> +	return value;
> +}
> +
>   static void __setup_engine_capabilities(struct intel_engine_cs *engine)
>   {
>   	struct drm_i915_private *i915 = engine->i915;
> diff --git a/drivers/gpu/drm/i915/gt/sysfs_engines.c b/drivers/gpu/drm/i915/gt/sysfs_engines.c
> index 967031056202..f2d9858d827c 100644
> --- a/drivers/gpu/drm/i915/gt/sysfs_engines.c
> +++ b/drivers/gpu/drm/i915/gt/sysfs_engines.c
> @@ -144,7 +144,7 @@ max_spin_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	       const char *buf, size_t count)
>   {
>   	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> -	unsigned long long duration;
> +	unsigned long long duration, clamped;
>   	int err;
>   
>   	/*
> @@ -168,7 +168,8 @@ max_spin_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	if (err)
>   		return err;
>   
> -	if (duration > jiffies_to_nsecs(2))
> +	clamped = intel_clamp_max_busywait_duration_ns(engine, duration);
> +	if (duration != clamped)
>   		return -EINVAL;
>   
>   	WRITE_ONCE(engine->props.max_busywait_duration_ns, duration);
> @@ -203,7 +204,7 @@ timeslice_store(struct kobject *kobj, struct kobj_attribute *attr,
>   		const char *buf, size_t count)
>   {
>   	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> -	unsigned long long duration;
> +	unsigned long long duration, clamped;
>   	int err;
>   
>   	/*
> @@ -218,7 +219,8 @@ timeslice_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	if (err)
>   		return err;
>   
> -	if (duration > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +	clamped = intel_clamp_timeslice_duration_ms(engine, duration);
> +	if (duration != clamped)
>   		return -EINVAL;
>   
>   	WRITE_ONCE(engine->props.timeslice_duration_ms, duration);
> @@ -256,7 +258,7 @@ stop_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	   const char *buf, size_t count)
>   {
>   	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> -	unsigned long long duration;
> +	unsigned long long duration, clamped;
>   	int err;
>   
>   	/*
> @@ -272,7 +274,8 @@ stop_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	if (err)
>   		return err;
>   
> -	if (duration > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +	clamped = intel_clamp_stop_timeout_ms(engine, duration);
> +	if (duration != clamped)
>   		return -EINVAL;
>   
>   	WRITE_ONCE(engine->props.stop_timeout_ms, duration);
> @@ -306,7 +309,7 @@ preempt_timeout_store(struct kobject *kobj, struct kobj_attribute *attr,
>   		      const char *buf, size_t count)
>   {
>   	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> -	unsigned long long timeout;
> +	unsigned long long timeout, clamped;
>   	int err;
>   
>   	/*
> @@ -322,7 +325,8 @@ preempt_timeout_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	if (err)
>   		return err;
>   
> -	if (timeout > jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +	clamped = intel_clamp_preempt_timeout_ms(engine, timeout);
> +	if (timeout != clamped)
>   		return -EINVAL;
>   
>   	WRITE_ONCE(engine->props.preempt_timeout_ms, timeout);
> @@ -362,7 +366,7 @@ heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
>   		const char *buf, size_t count)
>   {
>   	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> -	unsigned long long delay;
> +	unsigned long long delay, clamped;
>   	int err;
>   
>   	/*
> @@ -379,7 +383,8 @@ heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
>   	if (err)
>   		return err;
>   
> -	if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +	clamped = intel_clamp_heartbeat_interval_ms(engine, delay);
> +	if (delay != clamped)
>   		return -EINVAL;
>   
>   	err = intel_engine_set_heartbeat(engine, delay);
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> index 6a4612a852e2..ad131092f8df 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h
> @@ -248,6 +248,15 @@ struct guc_lrc_desc {
>   
>   #define GLOBAL_POLICY_DEFAULT_DPC_PROMOTE_TIME_US 500000
>   
> +/*
> + * GuC converts the timeout to clock ticks internally. Different platforms have
> + * different GuC clocks. Thus, the maximum value before overflow is platform
> + * dependent. Current worst case scenario is about 110s. So, limit to 100s to be
> + * safe.
> + */
> +#define GUC_POLICY_MAX_EXEC_QUANTUM_MS		(100 * 1000)
> +#define GUC_POLICY_MAX_PREEMPT_TIMEOUT_MS	(100 * 1000)
> +
>   struct guc_policies {
>   	u32 submission_queue_depth[GUC_MAX_ENGINE_CLASSES];
>   	/* In micro seconds. How much time to allow before DPC processing is

LGTM. Pretty please:

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index b3a429a92c0d..8208164c25e7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2218,13 +2218,24 @@ static inline u32 get_children_join_value(struct intel_context *ce,
  static void guc_context_policy_init(struct intel_engine_cs *engine,
                                     struct guc_lrc_desc *desc)
  {
+       struct drm_device *drm = &engine->i915->drm;
+
         desc->policy_flags = 0;
  
         if (engine->flags & I915_ENGINE_WANT_FORCED_PREEMPTION)
                 desc->policy_flags |= CONTEXT_POLICY_FLAG_PREEMPT_TO_IDLE;
  
         /* NB: For both of these, zero means disabled. */
+       if (overflows_type(engine->props.timeslice_duration_ms * 1000,
+                          desc->execution_quantum))
+               drm_warn_once(drm, "GuC interface cannot support %lums timeslice!\n",
+                             engine->props.timeslice_duration_ms);
         desc->execution_quantum = engine->props.timeslice_duration_ms * 1000;
+
+       if (overflows_type(engine->props.preempt_timeout_ms * 1000,
+                          desc->preemption_timeout))
+               drm_warn_once(drm, "GuC interface cannot support %lums preemption timeout!\n",
+                             engine->props.preempt_timeout_ms);
         desc->preemption_timeout = engine->props.preempt_timeout_ms * 1000;
  }
  

With that:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Any idea what happened with the CI run? It's full of odd failures.

Regards,

Tvrtko