[PATCH 0/3] drm/i915/gt: RPS tuning for light media playback

dri-devel.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 0/3] drm/i915/gt: RPS tuning for light media playback
@ 2021-11-17 22:49 Vinay Belgaumkar
  2021-11-17 22:49 ` [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines Vinay Belgaumkar
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Vinay Belgaumkar @ 2021-11-17 22:49 UTC (permalink / raw)
  To: intel-gfx, dri-devel; +Cc: Vinay Belgaumkar, Chris Wilson, Tvrtko Ursulin

Switching from tgl to adl sees one particular media decode pipeline fit
into a single vcs engine on adl, whereas it took two on tgl. However, it
was observed that the power consumption for adl remained higher than for
tgl. One contribution is that each engine is treated individually for rps
evaluation, another is that it appears that we prefer to avoid low
frequencies (with no rc6) and use slightly higher frequencies (with lots
of rc6). So let's try tweaking the balancer to smear busy virtual
contexts across multiple engines (trying to make adl look more like
tgl), and tweak the rps evaluation to "race to idle" harder.

Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Chris Wilson (3):
  drm/i915/gt: Spread virtual engines over idle engines
  drm/i915/gt: Compare average group occupancy for RPS evaluation
  drm/i915/gt: Improve "race-to-idle" at low frequencies

 .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
 drivers/gpu/drm/i915/gt/intel_rps.c           | 79 +++++++++++++-----
 2 files changed, 112 insertions(+), 47 deletions(-)

-- 
2.34.0


^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines
  2021-11-17 22:49 [PATCH 0/3] drm/i915/gt: RPS tuning for light media playback Vinay Belgaumkar
@ 2021-11-17 22:49 ` Vinay Belgaumkar
  2021-11-23  9:39   ` Tvrtko Ursulin
  2021-11-17 22:49 ` [PATCH 2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation Vinay Belgaumkar
  2021-11-17 22:49 ` [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies Vinay Belgaumkar
  2 siblings, 1 reply; 14+ messages in thread
From: Vinay Belgaumkar @ 2021-11-17 22:49 UTC (permalink / raw)
  To: intel-gfx, dri-devel; +Cc: Vinay Belgaumkar, Tvrtko Ursulin, Chris Wilson

From: Chris Wilson <chris@chris-wilson.co.uk>

Every time we come to the end of a virtual engine's context, re-randomise
its siblings[]. As we schedule the siblings' tasklets in the order they
are in the array, earlier entries are executed first (when idle) and so
will be preferred when scheduling the next virtual request. Currently,
we only update the array when switching onto a new idle engine, so we
prefer to stick on the last executed engine, keeping the work compact.
However, it can be beneficial to spread the work out across idle
engines, so choose another sibling as our preferred target at the end of
the context's execution.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
 1 file changed, 52 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index ca03880fa7e4..b95bbc8fb91a 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -539,6 +539,41 @@ static void execlists_schedule_in(struct i915_request *rq, int idx)
 	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
 }
 
+static void virtual_xfer_context(struct virtual_engine *ve,
+				 struct intel_engine_cs *engine)
+{
+	unsigned int n;
+
+	if (likely(engine == ve->siblings[0]))
+		return;
+
+	if (!intel_engine_has_relative_mmio(engine))
+		lrc_update_offsets(&ve->context, engine);
+
+	/*
+	 * Move the bound engine to the top of the list for
+	 * future execution. We then kick this tasklet first
+	 * before checking others, so that we preferentially
+	 * reuse this set of bound registers.
+	 */
+	for (n = 1; n < ve->num_siblings; n++) {
+		if (ve->siblings[n] == engine) {
+			swap(ve->siblings[n], ve->siblings[0]);
+			break;
+		}
+	}
+}
+
+static int ve_random_sibling(struct virtual_engine *ve)
+{
+	return prandom_u32_max(ve->num_siblings);
+}
+
+static int ve_random_other_sibling(struct virtual_engine *ve)
+{
+	return 1 + prandom_u32_max(ve->num_siblings - 1);
+}
+
 static void
 resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
 {
@@ -578,8 +613,23 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
 	    rq->execution_mask != engine->mask)
 		resubmit_virtual_request(rq, ve);
 
-	if (READ_ONCE(ve->request))
+	/*
+	 * Reschedule with a new "preferred" sibling.
+	 *
+	 * The tasklets are executed in the order of ve->siblings[], so
+	 * siblings[0] receives preferential treatment of greedily checking
+	 * for execution of the virtual engine. At this point, the virtual
+	 * engine is no longer in the current GPU cache due to idleness or
+	 * contention, so it can be executed on any without penalty. We
+	 * re-randomise at this point in order to spread light loads across
+	 * the system, heavy overlapping loads will continue to be greedily
+	 * executed by the first available engine.
+	 */
+	if (READ_ONCE(ve->request)) {
+		virtual_xfer_context(ve,
+				     ve->siblings[ve_random_other_sibling(ve)]);
 		tasklet_hi_schedule(&ve->base.sched_engine->tasklet);
+	}
 }
 
 static void __execlists_schedule_out(struct i915_request * const rq,
@@ -1030,32 +1080,6 @@ first_virtual_engine(struct intel_engine_cs *engine)
 	return NULL;
 }
 
-static void virtual_xfer_context(struct virtual_engine *ve,
-				 struct intel_engine_cs *engine)
-{
-	unsigned int n;
-
-	if (likely(engine == ve->siblings[0]))
-		return;
-
-	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
-	if (!intel_engine_has_relative_mmio(engine))
-		lrc_update_offsets(&ve->context, engine);
-
-	/*
-	 * Move the bound engine to the top of the list for
-	 * future execution. We then kick this tasklet first
-	 * before checking others, so that we preferentially
-	 * reuse this set of bound registers.
-	 */
-	for (n = 1; n < ve->num_siblings; n++) {
-		if (ve->siblings[n] == engine) {
-			swap(ve->siblings[n], ve->siblings[0]);
-			break;
-		}
-	}
-}
-
 static void defer_request(struct i915_request *rq, struct list_head * const pl)
 {
 	LIST_HEAD(list);
@@ -3590,7 +3614,7 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
 	 * NB This does not force us to execute on this engine, it will just
 	 * typically be the first we inspect for submission.
 	 */
-	swp = prandom_u32_max(ve->num_siblings);
+	swp = ve_random_sibling(ve);
 	if (swp)
 		swap(ve->siblings[swp], ve->siblings[0]);
 }
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation
  2021-11-17 22:49 [PATCH 0/3] drm/i915/gt: RPS tuning for light media playback Vinay Belgaumkar
  2021-11-17 22:49 ` [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines Vinay Belgaumkar
@ 2021-11-17 22:49 ` Vinay Belgaumkar
  2021-11-23 17:35   ` Belgaumkar, Vinay
  2021-11-17 22:49 ` [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies Vinay Belgaumkar
  2 siblings, 1 reply; 14+ messages in thread
From: Vinay Belgaumkar @ 2021-11-17 22:49 UTC (permalink / raw)
  To: intel-gfx, dri-devel; +Cc: Vinay Belgaumkar, Chris Wilson, Tvrtko Ursulin

From: Chris Wilson <chris.p.wilson@intel.com>

Currently, we inspect each engine individually and measure the occupancy
of that engine over the last evaluation interval. If that exceeds our
busyness thresholds, we decide to increase the GPU frequency. However,
under a load balancer, we should consider the occupancy of entire engine
groups, as work may be spread out across the group. In doing so, we
prefer wide over fast, as power consumption is approximately proportional
to the square of the frequency. However, since the load balancer is greedy
(the first idle engine gets all the work) and preferentially reuses the
last active engine, under light loads all work is assigned to one
engine, and so that engine appears very busy. But if the work happened
to overlap slightly, the workload would spread across multiple engines,
reducing each individual engine's runtime, and so reducing the rps
contribution, keeping the frequency low. Instead, when considering the
contribution, consider the contribution over the entire engine group
(capacity).

Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 48 ++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 07ff7ba7b2b7..3675ac93ded0 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -7,6 +7,7 @@
 
 #include "i915_drv.h"
 #include "intel_breadcrumbs.h"
+#include "intel_engine_pm.h"
 #include "intel_gt.h"
 #include "intel_gt_clock_utils.h"
 #include "intel_gt_irq.h"
@@ -65,26 +66,45 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
 static void rps_timer(struct timer_list *t)
 {
 	struct intel_rps *rps = from_timer(rps, t, timer);
-	struct intel_engine_cs *engine;
-	ktime_t dt, last, timestamp;
-	enum intel_engine_id id;
+	struct intel_gt *gt = rps_to_gt(rps);
+	ktime_t dt, last, timestamp = 0;
 	s64 max_busy[3] = {};
+	int i, j;
 
-	timestamp = 0;
-	for_each_engine(engine, rps_to_gt(rps), id) {
-		s64 busy;
-		int i;
+	/* Compare average occupancy over each engine group */
+	for (i = 0; i < ARRAY_SIZE(gt->engine_class); i++) {
+		s64 busy = 0;
+		int count = 0;
+
+		for (j = 0; j < ARRAY_SIZE(gt->engine_class[i]); j++) {
+			struct intel_engine_cs *engine;
 
-		dt = intel_engine_get_busy_time(engine, &timestamp);
-		last = engine->stats.rps;
-		engine->stats.rps = dt;
+			engine = gt->engine_class[i][j];
+			if (!engine)
+				continue;
 
-		busy = ktime_to_ns(ktime_sub(dt, last));
-		for (i = 0; i < ARRAY_SIZE(max_busy); i++) {
-			if (busy > max_busy[i])
-				swap(busy, max_busy[i]);
+			dt = intel_engine_get_busy_time(engine, &timestamp);
+			last = engine->stats.rps;
+			engine->stats.rps = dt;
+
+			if (!intel_engine_pm_is_awake(engine))
+				continue;
+
+			busy += ktime_to_ns(ktime_sub(dt, last));
+			count++;
+		}
+
+		if (count > 1)
+			busy = div_u64(busy, count);
+		if (busy <= max_busy[ARRAY_SIZE(max_busy) - 1])
+			continue;
+
+		for (j = 0; j < ARRAY_SIZE(max_busy); j++) {
+			if (busy > max_busy[j])
+				swap(busy, max_busy[j]);
 		}
 	}
+
 	last = rps->pm_timestamp;
 	rps->pm_timestamp = timestamp;
 
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies
  2021-11-17 22:49 [PATCH 0/3] drm/i915/gt: RPS tuning for light media playback Vinay Belgaumkar
  2021-11-17 22:49 ` [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines Vinay Belgaumkar
  2021-11-17 22:49 ` [PATCH 2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation Vinay Belgaumkar
@ 2021-11-17 22:49 ` Vinay Belgaumkar
  2021-11-22 18:44   ` Rodrigo Vivi
  2021-11-23 17:37   ` Belgaumkar, Vinay
  2 siblings, 2 replies; 14+ messages in thread
From: Vinay Belgaumkar @ 2021-11-17 22:49 UTC (permalink / raw)
  To: intel-gfx, dri-devel; +Cc: Vinay Belgaumkar, Tvrtko Ursulin, Chris Wilson

From: Chris Wilson <chris@chris-wilson.co.uk>

While the power consumption is proportional to the frequency, there is
also a static draw for active gates. The longer we are able to powergate
(rc6), the lower the static draw. Thus there is a sweetspot in the
frequency/power curve where we run at higher frequency in order to sleep
longer, aka race-to-idle. This is more evident at lower frequencies, so
let's look to bump the frequency if we think we will benefit by sleeping
longer at the higher frequency and so conserving power.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 31 ++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 3675ac93ded0..6af3231982af 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -63,6 +63,22 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
 	intel_uncore_write_fw(uncore, reg, val);
 }
 
+static bool race_to_idle(struct intel_rps *rps, u64 busy, u64 dt)
+{
+	unsigned int this = rps->cur_freq;
+	unsigned int next = rps->cur_freq + 1;
+	u64 next_dt = next * max(busy, dt);
+
+	/*
+	 * Compare estimated time spent in rc6 at the next power bin. If
+	 * we expect to sleep longer than the estimated increased power
+	 * cost of running at a higher frequency, it will be reduced power
+	 * consumption overall.
+	 */
+	return (((next_dt - this * busy) >> 10) * this * this >
+		((next_dt - next * busy) >> 10) * next * next);
+}
+
 static void rps_timer(struct timer_list *t)
 {
 	struct intel_rps *rps = from_timer(rps, t, timer);
@@ -133,7 +149,7 @@ static void rps_timer(struct timer_list *t)
 			if (!max_busy[i])
 				break;
 
-			busy += div_u64(max_busy[i], 1 << i);
+			busy += max_busy[i] >> i;
 		}
 		GT_TRACE(rps_to_gt(rps),
 			 "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
@@ -141,13 +157,18 @@ static void rps_timer(struct timer_list *t)
 			 max_busy[0], max_busy[1], max_busy[2],
 			 rps->pm_interval);
 
-		if (100 * busy > rps->power.up_threshold * dt &&
-		    rps->cur_freq < rps->max_freq_softlimit) {
+		if (rps->cur_freq < rps->max_freq_softlimit &&
+		    race_to_idle(rps, max_busy[0], dt)) {
+			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
+			rps->pm_interval = 1;
+			schedule_work(&rps->work);
+		} else if (rps->cur_freq < rps->max_freq_softlimit &&
+			   100 * busy > rps->power.up_threshold * dt) {
 			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
 			rps->pm_interval = 1;
 			schedule_work(&rps->work);
-		} else if (100 * busy < rps->power.down_threshold * dt &&
-			   rps->cur_freq > rps->min_freq_softlimit) {
+		} else if (rps->cur_freq > rps->min_freq_softlimit &&
+			   100 * busy < rps->power.down_threshold * dt) {
 			rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
 			rps->pm_interval = 1;
 			schedule_work(&rps->work);
-- 
2.34.0


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies
  2021-11-17 22:49 ` [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies Vinay Belgaumkar
@ 2021-11-22 18:44   ` Rodrigo Vivi
  2021-11-23  9:17     ` Tvrtko Ursulin
  2021-11-23 17:37   ` Belgaumkar, Vinay
  1 sibling, 1 reply; 14+ messages in thread
From: Rodrigo Vivi @ 2021-11-22 18:44 UTC (permalink / raw)
  To: Vinay Belgaumkar; +Cc: Tvrtko Ursulin, intel-gfx, dri-devel, Chris Wilson

On Wed, Nov 17, 2021 at 02:49:55PM -0800, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> While the power consumption is proportional to the frequency, there is
> also a static draw for active gates. The longer we are able to powergate
> (rc6), the lower the static draw. Thus there is a sweetspot in the
> frequency/power curve where we run at higher frequency in order to sleep
> longer, aka race-to-idle. This is more evident at lower frequencies, so
> let's look to bump the frequency if we think we will benefit by sleeping
> longer at the higher frequency and so conserving power.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>

Please let's not increase the complexity here, unless we have a very good
and documented reason.

Before trying to implement anything smart like this in the driver I'd like
to see data, power and performance results in different platforms and with
different workloads.

Thanks,
Rodrigo.

> ---
>  drivers/gpu/drm/i915/gt/intel_rps.c | 31 ++++++++++++++++++++++++-----
>  1 file changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 3675ac93ded0..6af3231982af 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -63,6 +63,22 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
>  	intel_uncore_write_fw(uncore, reg, val);
>  }
>  
> +static bool race_to_idle(struct intel_rps *rps, u64 busy, u64 dt)
> +{
> +	unsigned int this = rps->cur_freq;
> +	unsigned int next = rps->cur_freq + 1;
> +	u64 next_dt = next * max(busy, dt);
> +
> +	/*
> +	 * Compare estimated time spent in rc6 at the next power bin. If
> +	 * we expect to sleep longer than the estimated increased power
> +	 * cost of running at a higher frequency, it will be reduced power
> +	 * consumption overall.
> +	 */
> +	return (((next_dt - this * busy) >> 10) * this * this >
> +		((next_dt - next * busy) >> 10) * next * next);
> +}
> +
>  static void rps_timer(struct timer_list *t)
>  {
>  	struct intel_rps *rps = from_timer(rps, t, timer);
> @@ -133,7 +149,7 @@ static void rps_timer(struct timer_list *t)
>  			if (!max_busy[i])
>  				break;
>  
> -			busy += div_u64(max_busy[i], 1 << i);
> +			busy += max_busy[i] >> i;
>  		}
>  		GT_TRACE(rps_to_gt(rps),
>  			 "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
> @@ -141,13 +157,18 @@ static void rps_timer(struct timer_list *t)
>  			 max_busy[0], max_busy[1], max_busy[2],
>  			 rps->pm_interval);
>  
> -		if (100 * busy > rps->power.up_threshold * dt &&
> -		    rps->cur_freq < rps->max_freq_softlimit) {
> +		if (rps->cur_freq < rps->max_freq_softlimit &&
> +		    race_to_idle(rps, max_busy[0], dt)) {
> +			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
> +			rps->pm_interval = 1;
> +			schedule_work(&rps->work);
> +		} else if (rps->cur_freq < rps->max_freq_softlimit &&
> +			   100 * busy > rps->power.up_threshold * dt) {
>  			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
>  			rps->pm_interval = 1;
>  			schedule_work(&rps->work);
> -		} else if (100 * busy < rps->power.down_threshold * dt &&
> -			   rps->cur_freq > rps->min_freq_softlimit) {
> +		} else if (rps->cur_freq > rps->min_freq_softlimit &&
> +			   100 * busy < rps->power.down_threshold * dt) {
>  			rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
>  			rps->pm_interval = 1;
>  			schedule_work(&rps->work);
> -- 
> 2.34.0
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies
  2021-11-22 18:44   ` Rodrigo Vivi
@ 2021-11-23  9:17     ` Tvrtko Ursulin
  2021-11-23 16:53       ` Vivi, Rodrigo
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2021-11-23  9:17 UTC (permalink / raw)
  To: Rodrigo Vivi, Vinay Belgaumkar; +Cc: intel-gfx, dri-devel, Chris Wilson


On 22/11/2021 18:44, Rodrigo Vivi wrote:
> On Wed, Nov 17, 2021 at 02:49:55PM -0800, Vinay Belgaumkar wrote:
>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>
>> While the power consumption is proportional to the frequency, there is
>> also a static draw for active gates. The longer we are able to powergate
>> (rc6), the lower the static draw. Thus there is a sweetspot in the
>> frequency/power curve where we run at higher frequency in order to sleep
>> longer, aka race-to-idle. This is more evident at lower frequencies, so
>> let's look to bump the frequency if we think we will benefit by sleeping
>> longer at the higher frequency and so conserving power.
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> 
> Please let's not increase the complexity here, unless we have a very good
> and documented reason.
> 
> Before trying to implement anything smart like this in the driver I'd like
> to see data, power and performance results in different platforms and with
> different workloads.

Who has such a test suite and test farm that isn't focused on workloads
from a single customer? ;(

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines
  2021-11-17 22:49 ` [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines Vinay Belgaumkar
@ 2021-11-23  9:39   ` Tvrtko Ursulin
  2021-11-23 19:52     ` [Intel-gfx] " Rodrigo Vivi
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2021-11-23  9:39 UTC (permalink / raw)
  To: Vinay Belgaumkar, intel-gfx, dri-devel; +Cc: Chris Wilson


On 17/11/2021 22:49, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Everytime we come to the end of a virtual engine's context, re-randomise
> it's siblings[]. As we schedule the siblings' tasklets in the order they
> are in the array, earlier entries are executed first (when idle) and so
> will be preferred when scheduling the next virtual request. Currently,
> we only update the array when switching onto a new idle engine, so we
> prefer to stick on the last execute engine, keeping the work compact.
> However, it can be beneficial to spread the work out across idle
> engines, so choose another sibling as our preferred target at the end of
> the context's execution.

This partially brings back, from a different angle, the more dynamic 
scheduling behavior which has been lost since bugfix 90a987205c6c 
("drm/i915/gt: Only swap to a random sibling once upon creation").

One day we could experiment with using engine busyness as the criterion
(instead of random). Back in the day busyness was kind of the best
strategy, although sampled at submit, not at the trailing edge like 
here, but it still may be able to settle down to engine configuration 
better in some scenarios. Only testing could say.

Still, from memory random also wasn't that bad so this should be okay 
for now.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
>   1 file changed, 52 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index ca03880fa7e4..b95bbc8fb91a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -539,6 +539,41 @@ static void execlists_schedule_in(struct i915_request *rq, int idx)
>   	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
>   }
>   
> +static void virtual_xfer_context(struct virtual_engine *ve,
> +				 struct intel_engine_cs *engine)
> +{
> +	unsigned int n;
> +
> +	if (likely(engine == ve->siblings[0]))
> +		return;
> +
> +	if (!intel_engine_has_relative_mmio(engine))
> +		lrc_update_offsets(&ve->context, engine);
> +
> +	/*
> +	 * Move the bound engine to the top of the list for
> +	 * future execution. We then kick this tasklet first
> +	 * before checking others, so that we preferentially
> +	 * reuse this set of bound registers.
> +	 */
> +	for (n = 1; n < ve->num_siblings; n++) {
> +		if (ve->siblings[n] == engine) {
> +			swap(ve->siblings[n], ve->siblings[0]);
> +			break;
> +		}
> +	}
> +}
> +
> +static int ve_random_sibling(struct virtual_engine *ve)
> +{
> +	return prandom_u32_max(ve->num_siblings);
> +}
> +
> +static int ve_random_other_sibling(struct virtual_engine *ve)
> +{
> +	return 1 + prandom_u32_max(ve->num_siblings - 1);
> +}
> +
>   static void
>   resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
>   {
> @@ -578,8 +613,23 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>   	    rq->execution_mask != engine->mask)
>   		resubmit_virtual_request(rq, ve);
>   
> -	if (READ_ONCE(ve->request))
> +	/*
> +	 * Reschedule with a new "preferred" sibling.
> +	 *
> +	 * The tasklets are executed in the order of ve->siblings[], so
> +	 * siblings[0] receives preferrential treatment of greedily checking
> +	 * for execution of the virtual engine. At this point, the virtual
> +	 * engine is no longer in the current GPU cache due to idleness or
> +	 * contention, so it can be executed on any without penalty. We
> +	 * re-randomise at this point in order to spread light loads across
> +	 * the system, heavy overlapping loads will continue to be greedily
> +	 * executed by the first available engine.
> +	 */
> +	if (READ_ONCE(ve->request)) {
> +		virtual_xfer_context(ve,
> +				     ve->siblings[ve_random_other_sibling(ve)]);
>   		tasklet_hi_schedule(&ve->base.sched_engine->tasklet);
> +	}
>   }
>   
>   static void __execlists_schedule_out(struct i915_request * const rq,
> @@ -1030,32 +1080,6 @@ first_virtual_engine(struct intel_engine_cs *engine)
>   	return NULL;
>   }
>   
> -static void virtual_xfer_context(struct virtual_engine *ve,
> -				 struct intel_engine_cs *engine)
> -{
> -	unsigned int n;
> -
> -	if (likely(engine == ve->siblings[0]))
> -		return;
> -
> -	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
> -	if (!intel_engine_has_relative_mmio(engine))
> -		lrc_update_offsets(&ve->context, engine);
> -
> -	/*
> -	 * Move the bound engine to the top of the list for
> -	 * future execution. We then kick this tasklet first
> -	 * before checking others, so that we preferentially
> -	 * reuse this set of bound registers.
> -	 */
> -	for (n = 1; n < ve->num_siblings; n++) {
> -		if (ve->siblings[n] == engine) {
> -			swap(ve->siblings[n], ve->siblings[0]);
> -			break;
> -		}
> -	}
> -}
> -
>   static void defer_request(struct i915_request *rq, struct list_head * const pl)
>   {
>   	LIST_HEAD(list);
> @@ -3590,7 +3614,7 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
>   	 * NB This does not force us to execute on this engine, it will just
>   	 * typically be the first we inspect for submission.
>   	 */
> -	swp = prandom_u32_max(ve->num_siblings);
> +	swp = ve_random_sibling(ve);
>   	if (swp)
>   		swap(ve->siblings[swp], ve->siblings[0]);
>   }
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies
  2021-11-23  9:17     ` Tvrtko Ursulin
@ 2021-11-23 16:53       ` Vivi, Rodrigo
  0 siblings, 0 replies; 14+ messages in thread
From: Vivi, Rodrigo @ 2021-11-23 16:53 UTC (permalink / raw)
  To: tvrtko.ursulin, Belgaumkar, Vinay; +Cc: intel-gfx, dri-devel, chris

On Tue, 2021-11-23 at 09:17 +0000, Tvrtko Ursulin wrote:
> 
> On 22/11/2021 18:44, Rodrigo Vivi wrote:
> > On Wed, Nov 17, 2021 at 02:49:55PM -0800, Vinay Belgaumkar wrote:
> > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > While the power consumption is proportional to the frequency,
> > > there is
> > > also a static draw for active gates. The longer we are able to
> > > powergate
> > > (rc6), the lower the static draw. Thus there is a sweetspot in
> > > the
> > > frequency/power curve where we run at higher frequency in order
> > > to sleep
> > > longer, aka race-to-idle. This is more evident at lower
> > > frequencies, so
> > > let's look to bump the frequency if we think we will benefit by
> > > sleeping
> > > longer at the higher frequency and so conserving power.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > 
> > Please let's not increase the complexity here, unless we have a
> > very good
> > and documented reason.
> > 
> > Before trying to implement anything smart like this in the driver
> > I'd like
> > to see data, power and performance results in different platforms
> > and with
> > different workloads.
> 
> Who has such test suite and test farm which isn't focused to
> workloads 
> from a single customer? ;(

Okay, maybe we don't need to cover the world here. But without seeing any
data at all, it is hard to make this call.

> 
> Regards,
> 
> Tvrtko


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation
  2021-11-17 22:49 ` [PATCH 2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation Vinay Belgaumkar
@ 2021-11-23 17:35   ` Belgaumkar, Vinay
  0 siblings, 0 replies; 14+ messages in thread
From: Belgaumkar, Vinay @ 2021-11-23 17:35 UTC (permalink / raw)
  To: intel-gfx, dri-devel; +Cc: Tvrtko Ursulin, Chris Wilson



On 11/17/2021 2:49 PM, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> Currently, we inspect each engine individually and measure the occupancy
> of that engine over the last evaluation interval. If that exceeds our
> busyness thresholds, we decide to increase the GPU frequency. However,
> under a load balancer, we should consider the occupancy of entire engine
> groups, as work may be spread out across the group. In doing so, we
> prefer wide over fast, power consumption is approximately proportional to
> the square of the frequency. However, since the load balancer is greedy,
> the first idle engine gets all the work, and preferrentially reuses the
> last active engine, under light loads all work is assigned to one
> engine, and so that engine appears very busy. But if the work happened
> to overlap slightly, the workload would spread across multiple engines,
> reducing each individual engine's runtime, and so reducing the rps
> contribution, keeping the frequency low. Instead, when considering the
> contribution, consider the contribution over the entire engine group
> (capacity).
> 
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>

Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>

> ---
>   drivers/gpu/drm/i915/gt/intel_rps.c | 48 ++++++++++++++++++++---------
>   1 file changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 07ff7ba7b2b7..3675ac93ded0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -7,6 +7,7 @@
>   
>   #include "i915_drv.h"
>   #include "intel_breadcrumbs.h"
> +#include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   #include "intel_gt_clock_utils.h"
>   #include "intel_gt_irq.h"
> @@ -65,26 +66,45 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
>   static void rps_timer(struct timer_list *t)
>   {
>   	struct intel_rps *rps = from_timer(rps, t, timer);
> -	struct intel_engine_cs *engine;
> -	ktime_t dt, last, timestamp;
> -	enum intel_engine_id id;
> +	struct intel_gt *gt = rps_to_gt(rps);
> +	ktime_t dt, last, timestamp = 0;
>   	s64 max_busy[3] = {};
> +	int i, j;
>   
> -	timestamp = 0;
> -	for_each_engine(engine, rps_to_gt(rps), id) {
> -		s64 busy;
> -		int i;
> +	/* Compare average occupancy over each engine group */
> +	for (i = 0; i < ARRAY_SIZE(gt->engine_class); i++) {
> +		s64 busy = 0;
> +		int count = 0;
> +
> +		for (j = 0; j < ARRAY_SIZE(gt->engine_class[i]); j++) {
> +			struct intel_engine_cs *engine;
>   
> -		dt = intel_engine_get_busy_time(engine, &timestamp);
> -		last = engine->stats.rps;
> -		engine->stats.rps = dt;
> +			engine = gt->engine_class[i][j];
> +			if (!engine)
> +				continue;
>   
> -		busy = ktime_to_ns(ktime_sub(dt, last));
> -		for (i = 0; i < ARRAY_SIZE(max_busy); i++) {
> -			if (busy > max_busy[i])
> -				swap(busy, max_busy[i]);
> +			dt = intel_engine_get_busy_time(engine, &timestamp);
> +			last = engine->stats.rps;
> +			engine->stats.rps = dt;
> +
> +			if (!intel_engine_pm_is_awake(engine))
> +				continue;
> +
> +			busy += ktime_to_ns(ktime_sub(dt, last));
> +			count++;
> +		}
> +
> +		if (count > 1)
> +			busy = div_u64(busy, count);
> +		if (busy <= max_busy[ARRAY_SIZE(max_busy) - 1])
> +			continue;
> +
> +		for (j = 0; j < ARRAY_SIZE(max_busy); j++) {
> +			if (busy > max_busy[j])
> +				swap(busy, max_busy[j]);
>   		}
>   	}
> +
>   	last = rps->pm_timestamp;
>   	rps->pm_timestamp = timestamp;
>   
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies
  2021-11-17 22:49 ` [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies Vinay Belgaumkar
  2021-11-22 18:44   ` Rodrigo Vivi
@ 2021-11-23 17:37   ` Belgaumkar, Vinay
  1 sibling, 0 replies; 14+ messages in thread
From: Belgaumkar, Vinay @ 2021-11-23 17:37 UTC (permalink / raw)
  To: intel-gfx, dri-devel; +Cc: Tvrtko Ursulin, Chris Wilson



On 11/17/2021 2:49 PM, Vinay Belgaumkar wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> While the power consumption is proportional to the frequency, there is
> also a static draw for active gates. The longer we are able to powergate
> (rc6), the lower the static draw. Thus there is a sweetspot in the
> frequency/power curve where we run at higher frequency in order to sleep
> longer, aka race-to-idle. This is more evident at lower frequencies, so
> let's look to bump the frequency if we think we will benefit by sleeping
> longer at the higher frequency and so conserving power.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>

Data collected does show some power savings.

Reviewed-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_rps.c | 31 ++++++++++++++++++++++++-----
>   1 file changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 3675ac93ded0..6af3231982af 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -63,6 +63,22 @@ static void set(struct intel_uncore *uncore, i915_reg_t reg, u32 val)
>   	intel_uncore_write_fw(uncore, reg, val);
>   }
>   
> +static bool race_to_idle(struct intel_rps *rps, u64 busy, u64 dt)
> +{
> +	unsigned int this = rps->cur_freq;
> +	unsigned int next = rps->cur_freq + 1;
> +	u64 next_dt = next * max(busy, dt);
> +
> +	/*
> +	 * Compare estimated time spent in rc6 at the next power bin. If
> +	 * we expect to sleep longer than the estimated increased power
> +	 * cost of running at a higher frequency, it will be reduced power
> +	 * consumption overall.
> +	 */
> +	return (((next_dt - this * busy) >> 10) * this * this >
> +		((next_dt - next * busy) >> 10) * next * next);
> +}
> +
>   static void rps_timer(struct timer_list *t)
>   {
>   	struct intel_rps *rps = from_timer(rps, t, timer);
> @@ -133,7 +149,7 @@ static void rps_timer(struct timer_list *t)
>   			if (!max_busy[i])
>   				break;
>   
> -			busy += div_u64(max_busy[i], 1 << i);
> +			busy += max_busy[i] >> i;
>   		}
>   		GT_TRACE(rps_to_gt(rps),
>   			 "busy:%lld [%d%%], max:[%lld, %lld, %lld], interval:%d\n",
> @@ -141,13 +157,18 @@ static void rps_timer(struct timer_list *t)
>   			 max_busy[0], max_busy[1], max_busy[2],
>   			 rps->pm_interval);
>   
> -		if (100 * busy > rps->power.up_threshold * dt &&
> -		    rps->cur_freq < rps->max_freq_softlimit) {
> +		if (rps->cur_freq < rps->max_freq_softlimit &&
> +		    race_to_idle(rps, max_busy[0], dt)) {
> +			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
> +			rps->pm_interval = 1;
> +			schedule_work(&rps->work);
> +		} else if (rps->cur_freq < rps->max_freq_softlimit &&
> +			   100 * busy > rps->power.up_threshold * dt) {
>   			rps->pm_iir |= GEN6_PM_RP_UP_THRESHOLD;
>   			rps->pm_interval = 1;
>   			schedule_work(&rps->work);
> -		} else if (100 * busy < rps->power.down_threshold * dt &&
> -			   rps->cur_freq > rps->min_freq_softlimit) {
> +		} else if (rps->cur_freq > rps->min_freq_softlimit &&
> +			   100 * busy < rps->power.down_threshold * dt) {
>   			rps->pm_iir |= GEN6_PM_RP_DOWN_THRESHOLD;
>   			rps->pm_interval = 1;
>   			schedule_work(&rps->work);
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Intel-gfx] [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines
  2021-11-23  9:39   ` Tvrtko Ursulin
@ 2021-11-23 19:52     ` Rodrigo Vivi
  2021-11-24  8:56       ` Tvrtko Ursulin
  0 siblings, 1 reply; 14+ messages in thread
From: Rodrigo Vivi @ 2021-11-23 19:52 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Vinay Belgaumkar, intel-gfx, dri-devel, Chris Wilson

On Tue, Nov 23, 2021 at 09:39:25AM +0000, Tvrtko Ursulin wrote:
> 
> On 17/11/2021 22:49, Vinay Belgaumkar wrote:
> > From: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > Everytime we come to the end of a virtual engine's context, re-randomise
> > it's siblings[]. As we schedule the siblings' tasklets in the order they
> > are in the array, earlier entries are executed first (when idle) and so
> > will be preferred when scheduling the next virtual request. Currently,
> > we only update the array when switching onto a new idle engine, so we
> > prefer to stick on the last execute engine, keeping the work compact.
> > However, it can be beneficial to spread the work out across idle
> > engines, so choose another sibling as our preferred target at the end of
> > the context's execution.
> 
> This partially brings back, from a different angle, the more dynamic
> scheduling behavior which has been lost since bugfix 90a987205c6c
> ("drm/i915/gt: Only swap to a random sibling once upon creation").

Shouldn't we use a Fixes: tag here, since this aims to fix one
of the performance regressions introduced by that patch?

> 
> One day we could experiment with using engine busyness as criteria (instead
> of random). Back in the day busyness was kind of the best strategy, although
> sampled at submit, not at the trailing edge like here, but it still may be
> able to settle down to engine configuration better in some scenarios. Only
> testing could say.
> 
> Still, from memory random also wasn't that bad so this should be okay for
> now.
> 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

You reviewed it, and it looks to be a middle ground in terms
of when to balance (always, as in the initial implementation, vs
only once, as in 90a987205c6c).

If this one is really fixing the regression by itself:
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
on this patch here.

But I still don't want to take the risk of touching the freq with
race to idle until I am convinced that it is absolutely needed and
that we are not breaking the world out there.

> 
> Regards,
> 
> Tvrtko
> 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > ---
> >   .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
> >   1 file changed, 52 insertions(+), 28 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > index ca03880fa7e4..b95bbc8fb91a 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > @@ -539,6 +539,41 @@ static void execlists_schedule_in(struct i915_request *rq, int idx)
> >   	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> >   }
> > +static void virtual_xfer_context(struct virtual_engine *ve,
> > +				 struct intel_engine_cs *engine)
> > +{
> > +	unsigned int n;
> > +
> > +	if (likely(engine == ve->siblings[0]))
> > +		return;
> > +
> > +	if (!intel_engine_has_relative_mmio(engine))
> > +		lrc_update_offsets(&ve->context, engine);
> > +
> > +	/*
> > +	 * Move the bound engine to the top of the list for
> > +	 * future execution. We then kick this tasklet first
> > +	 * before checking others, so that we preferentially
> > +	 * reuse this set of bound registers.
> > +	 */
> > +	for (n = 1; n < ve->num_siblings; n++) {
> > +		if (ve->siblings[n] == engine) {
> > +			swap(ve->siblings[n], ve->siblings[0]);
> > +			break;
> > +		}
> > +	}
> > +}
> > +
> > +static int ve_random_sibling(struct virtual_engine *ve)
> > +{
> > +	return prandom_u32_max(ve->num_siblings);
> > +}
> > +
> > +static int ve_random_other_sibling(struct virtual_engine *ve)
> > +{
> > +	return 1 + prandom_u32_max(ve->num_siblings - 1);
> > +}
> > +
> >   static void
> >   resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
> >   {
> > @@ -578,8 +613,23 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
> >   	    rq->execution_mask != engine->mask)
> >   		resubmit_virtual_request(rq, ve);
> > -	if (READ_ONCE(ve->request))
> > +	/*
> > +	 * Reschedule with a new "preferred" sibling.
> > +	 *
> > +	 * The tasklets are executed in the order of ve->siblings[], so
> > +	 * siblings[0] receives preferrential treatment of greedily checking
> > +	 * for execution of the virtual engine. At this point, the virtual
> > +	 * engine is no longer in the current GPU cache due to idleness or
> > +	 * contention, so it can be executed on any without penalty. We
> > +	 * re-randomise at this point in order to spread light loads across
> > +	 * the system, heavy overlapping loads will continue to be greedily
> > +	 * executed by the first available engine.
> > +	 */
> > +	if (READ_ONCE(ve->request)) {
> > +		virtual_xfer_context(ve,
> > +				     ve->siblings[ve_random_other_sibling(ve)]);
> >   		tasklet_hi_schedule(&ve->base.sched_engine->tasklet);
> > +	}
> >   }
> >   static void __execlists_schedule_out(struct i915_request * const rq,
> > @@ -1030,32 +1080,6 @@ first_virtual_engine(struct intel_engine_cs *engine)
> >   	return NULL;
> >   }
> > -static void virtual_xfer_context(struct virtual_engine *ve,
> > -				 struct intel_engine_cs *engine)
> > -{
> > -	unsigned int n;
> > -
> > -	if (likely(engine == ve->siblings[0]))
> > -		return;
> > -
> > -	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
> > -	if (!intel_engine_has_relative_mmio(engine))
> > -		lrc_update_offsets(&ve->context, engine);
> > -
> > -	/*
> > -	 * Move the bound engine to the top of the list for
> > -	 * future execution. We then kick this tasklet first
> > -	 * before checking others, so that we preferentially
> > -	 * reuse this set of bound registers.
> > -	 */
> > -	for (n = 1; n < ve->num_siblings; n++) {
> > -		if (ve->siblings[n] == engine) {
> > -			swap(ve->siblings[n], ve->siblings[0]);
> > -			break;
> > -		}
> > -	}
> > -}
> > -
> >   static void defer_request(struct i915_request *rq, struct list_head * const pl)
> >   {
> >   	LIST_HEAD(list);
> > @@ -3590,7 +3614,7 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
> >   	 * NB This does not force us to execute on this engine, it will just
> >   	 * typically be the first we inspect for submission.
> >   	 */
> > -	swp = prandom_u32_max(ve->num_siblings);
> > +	swp = ve_random_sibling(ve);
> >   	if (swp)
> >   		swap(ve->siblings[swp], ve->siblings[0]);
> >   }
> > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Intel-gfx] [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines
  2021-11-23 19:52     ` [Intel-gfx] " Rodrigo Vivi
@ 2021-11-24  8:56       ` Tvrtko Ursulin
  2021-11-24 13:55         ` Rodrigo Vivi
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2021-11-24  8:56 UTC (permalink / raw)
  To: Rodrigo Vivi; +Cc: Vinay Belgaumkar, intel-gfx, dri-devel, Chris Wilson


On 23/11/2021 19:52, Rodrigo Vivi wrote:
> On Tue, Nov 23, 2021 at 09:39:25AM +0000, Tvrtko Ursulin wrote:
>>
>> On 17/11/2021 22:49, Vinay Belgaumkar wrote:
>>> From: Chris Wilson <chris@chris-wilson.co.uk>
>>>
>>> Everytime we come to the end of a virtual engine's context, re-randomise
>>> it's siblings[]. As we schedule the siblings' tasklets in the order they
>>> are in the array, earlier entries are executed first (when idle) and so
>>> will be preferred when scheduling the next virtual request. Currently,
>>> we only update the array when switching onto a new idle engine, so we
>>> prefer to stick on the last execute engine, keeping the work compact.
>>> However, it can be beneficial to spread the work out across idle
>>> engines, so choose another sibling as our preferred target at the end of
>>> the context's execution.
>>
>> This partially brings back, from a different angle, the more dynamic
>> scheduling behavior which has been lost since bugfix 90a987205c6c
>> ("drm/i915/gt: Only swap to a random sibling once upon creation").
> 
> Shouldn't we use the Fixes tag here since this is targeting to fix one
> of the performance regressions of this patch?

Probably not but hard to say. Note that it wasn't a performance 
regression that was reported but power.

And to go back to what we said elsewhere in the thread, I am actually 
with you in thinking that in the ideal world we need PnP testing across 
a variety of workloads and platforms. And "in the ideal world" should 
really be in the normal world. It is not professional to be reactive to 
isolated bug reports from users, without being able to see the overall 
picture.

>> One day we could experiment with using engine busyness as criteria (instead
>> of random). Back in the day busyness was kind of the best strategy, although
>> sampled at submit, not at the trailing edge like here, but it still may be
>> able to settle down to engine configuration better in some scenarios. Only
>> testing could say.
>>
>> Still, from memory random also wasn't that bad so this should be okay for
>> now.
>>
>> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Since you reviewed and it looks to be a middle ground point in terms
> of when to balancing (always like in the initial implementation vs
> only once like the in 90a987205c6c).
> 
> If this one is really fixing the regression by itself:
> Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> on this patch here.
> 
> But I still don't want to take the risk with touching the freq with
> race to idle, until not convinced that it is absolutely needed and
> that we are not breaking the world out there.

Yes agreed in principle, we have users with different priorities.

However the RPS patches in the series, definitely the 1st one which 
looks at classes versus individual engines, sound plausible to me. Given 
the absence of automated PnP testing mentioned above, in the past it was 
usually Chris who was making the above and beyond effort to evaluate 
changes like these on as many platforms as he could, and with different 
workloads. Not sure who has the mandate and drive to fill that space but 
something will need to happen.

Regards,

Tvrtko

>>
>> Regards,
>>
>> Tvrtko
>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>> ---
>>>    .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
>>>    1 file changed, 52 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> index ca03880fa7e4..b95bbc8fb91a 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
>>> @@ -539,6 +539,41 @@ static void execlists_schedule_in(struct i915_request *rq, int idx)
>>>    	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
>>>    }
>>> +static void virtual_xfer_context(struct virtual_engine *ve,
>>> +				 struct intel_engine_cs *engine)
>>> +{
>>> +	unsigned int n;
>>> +
>>> +	if (likely(engine == ve->siblings[0]))
>>> +		return;
>>> +
>>> +	if (!intel_engine_has_relative_mmio(engine))
>>> +		lrc_update_offsets(&ve->context, engine);
>>> +
>>> +	/*
>>> +	 * Move the bound engine to the top of the list for
>>> +	 * future execution. We then kick this tasklet first
>>> +	 * before checking others, so that we preferentially
>>> +	 * reuse this set of bound registers.
>>> +	 */
>>> +	for (n = 1; n < ve->num_siblings; n++) {
>>> +		if (ve->siblings[n] == engine) {
>>> +			swap(ve->siblings[n], ve->siblings[0]);
>>> +			break;
>>> +		}
>>> +	}
>>> +}
>>> +
>>> +static int ve_random_sibling(struct virtual_engine *ve)
>>> +{
>>> +	return prandom_u32_max(ve->num_siblings);
>>> +}
>>> +
>>> +static int ve_random_other_sibling(struct virtual_engine *ve)
>>> +{
>>> +	return 1 + prandom_u32_max(ve->num_siblings - 1);
>>> +}
>>> +
>>>    static void
>>>    resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
>>>    {
>>> @@ -578,8 +613,23 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>>>    	    rq->execution_mask != engine->mask)
>>>    		resubmit_virtual_request(rq, ve);
>>> -	if (READ_ONCE(ve->request))
>>> +	/*
>>> +	 * Reschedule with a new "preferred" sibling.
>>> +	 *
>>> +	 * The tasklets are executed in the order of ve->siblings[], so
>>> +	 * siblings[0] receives preferrential treatment of greedily checking
>>> +	 * for execution of the virtual engine. At this point, the virtual
>>> +	 * engine is no longer in the current GPU cache due to idleness or
>>> +	 * contention, so it can be executed on any without penalty. We
>>> +	 * re-randomise at this point in order to spread light loads across
>>> +	 * the system, heavy overlapping loads will continue to be greedily
>>> +	 * executed by the first available engine.
>>> +	 */
>>> +	if (READ_ONCE(ve->request)) {
>>> +		virtual_xfer_context(ve,
>>> +				     ve->siblings[ve_random_other_sibling(ve)]);
>>>    		tasklet_hi_schedule(&ve->base.sched_engine->tasklet);
>>> +	}
>>>    }
>>>    static void __execlists_schedule_out(struct i915_request * const rq,
>>> @@ -1030,32 +1080,6 @@ first_virtual_engine(struct intel_engine_cs *engine)
>>>    	return NULL;
>>>    }
>>> -static void virtual_xfer_context(struct virtual_engine *ve,
>>> -				 struct intel_engine_cs *engine)
>>> -{
>>> -	unsigned int n;
>>> -
>>> -	if (likely(engine == ve->siblings[0]))
>>> -		return;
>>> -
>>> -	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
>>> -	if (!intel_engine_has_relative_mmio(engine))
>>> -		lrc_update_offsets(&ve->context, engine);
>>> -
>>> -	/*
>>> -	 * Move the bound engine to the top of the list for
>>> -	 * future execution. We then kick this tasklet first
>>> -	 * before checking others, so that we preferentially
>>> -	 * reuse this set of bound registers.
>>> -	 */
>>> -	for (n = 1; n < ve->num_siblings; n++) {
>>> -		if (ve->siblings[n] == engine) {
>>> -			swap(ve->siblings[n], ve->siblings[0]);
>>> -			break;
>>> -		}
>>> -	}
>>> -}
>>> -
>>>    static void defer_request(struct i915_request *rq, struct list_head * const pl)
>>>    {
>>>    	LIST_HEAD(list);
>>> @@ -3590,7 +3614,7 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
>>>    	 * NB This does not force us to execute on this engine, it will just
>>>    	 * typically be the first we inspect for submission.
>>>    	 */
>>> -	swp = prandom_u32_max(ve->num_siblings);
>>> +	swp = ve_random_sibling(ve);
>>>    	if (swp)
>>>    		swap(ve->siblings[swp], ve->siblings[0]);
>>>    }
>>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Intel-gfx] [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines
  2021-11-24  8:56       ` Tvrtko Ursulin
@ 2021-11-24 13:55         ` Rodrigo Vivi
  2021-11-24 15:09           ` Rodrigo Vivi
  0 siblings, 1 reply; 14+ messages in thread
From: Rodrigo Vivi @ 2021-11-24 13:55 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx, dri-devel, Chris Wilson

On Wed, Nov 24, 2021 at 08:56:52AM +0000, Tvrtko Ursulin wrote:
> 
> On 23/11/2021 19:52, Rodrigo Vivi wrote:
> > On Tue, Nov 23, 2021 at 09:39:25AM +0000, Tvrtko Ursulin wrote:
> > > 
> > > On 17/11/2021 22:49, Vinay Belgaumkar wrote:
> > > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > > 
> > > > Everytime we come to the end of a virtual engine's context, re-randomise
> > > > it's siblings[]. As we schedule the siblings' tasklets in the order they
> > > > are in the array, earlier entries are executed first (when idle) and so
> > > > will be preferred when scheduling the next virtual request. Currently,
> > > > we only update the array when switching onto a new idle engine, so we
> > > > prefer to stick on the last execute engine, keeping the work compact.
> > > > However, it can be beneficial to spread the work out across idle
> > > > engines, so choose another sibling as our preferred target at the end of
> > > > the context's execution.
> > > 
> > > This partially brings back, from a different angle, the more dynamic
> > > scheduling behavior which has been lost since bugfix 90a987205c6c
> > > ("drm/i915/gt: Only swap to a random sibling once upon creation").
> > 
> > Shouldn't we use the Fixes tag here since this is targeting to fix one
> > of the performance regressions of this patch?
> 
> Probably not but hard to say. Note that it wasn't a performance regression
> that was reported but power.
> 
> And to go back to what we said elsewhere in the thread, I am actually with
> you in thinking that in the ideal world we need PnP testing across a variety
> of workloads and platforms. And "in the ideal world" should really be in the
> normal world. It is not professional to be reactive to isolated bug reports
> from users, without being able to see the overall picture.

We surely need to address the bug report from users. I'm just asking that we address
it with the smallest fix that we can backport and fit into the product milestones.

Instead, we are creating another optimization feature in a rush, without proper
validation.

I believe it is too risky to add an algorithm like that without broader testing.
I see a big risk of introducing corner cases that will result in more bug reports
from other users in the near future.

So, let's all be professionals and provide a smaller fix for the regression in
the load balancing scenario, and provide better validation with more data
to justify this new feature.

Thanks,
Rodrigo.

> 
> > > One day we could experiment with using engine busyness as criteria (instead
> > > of random). Back in the day busyness was kind of the best strategy, although
> > > sampled at submit, not at the trailing edge like here, but it still may be
> > > able to settle down to engine configuration better in some scenarios. Only
> > > testing could say.
> > > 
> > > Still, from memory random also wasn't that bad so this should be okay for
> > > now.
> > > 
> > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > 
> > Since you reviewed and it looks to be a middle ground point in terms
> > of when to balancing (always like in the initial implementation vs
> > only once like the in 90a987205c6c).
> > 
> > If this one is really fixing the regression by itself:
> > Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > on this patch here.
> > 
> > But I still don't want to take the risk with touching the freq with
> > race to idle, until not convinced that it is absolutely needed and
> > that we are not breaking the world out there.
> 
> Yes agreed in principle, we have users with different priorities.
> 
> However the RPS patches in the series, definitely the 1st one which looks at
> classes versus individual engines, sound plausible to me. Given the absence
> of automated PnP testing mentioned above, in the past it was usually Chris
> who was making the above and beyond effort to evaluate changes like these on
> as many platforms as he could, and with different workloads. Not sure who
> has the mandate and drive to fill that space but something will need to
> happen.
> 
> Regards,
> 
> Tvrtko
> 
> > > 
> > > Regards,
> > > 
> > > Tvrtko
> > > 
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > > ---
> > > >    .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
> > > >    1 file changed, 52 insertions(+), 28 deletions(-)
> > > > 
> > > > diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > > > index ca03880fa7e4..b95bbc8fb91a 100644
> > > > --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > > > +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > > > @@ -539,6 +539,41 @@ static void execlists_schedule_in(struct i915_request *rq, int idx)
> > > >    	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> > > >    }
> > > > +static void virtual_xfer_context(struct virtual_engine *ve,
> > > > +				 struct intel_engine_cs *engine)
> > > > +{
> > > > +	unsigned int n;
> > > > +
> > > > +	if (likely(engine == ve->siblings[0]))
> > > > +		return;
> > > > +
> > > > +	if (!intel_engine_has_relative_mmio(engine))
> > > > +		lrc_update_offsets(&ve->context, engine);
> > > > +
> > > > +	/*
> > > > +	 * Move the bound engine to the top of the list for
> > > > +	 * future execution. We then kick this tasklet first
> > > > +	 * before checking others, so that we preferentially
> > > > +	 * reuse this set of bound registers.
> > > > +	 */
> > > > +	for (n = 1; n < ve->num_siblings; n++) {
> > > > +		if (ve->siblings[n] == engine) {
> > > > +			swap(ve->siblings[n], ve->siblings[0]);
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +}
> > > > +
> > > > +static int ve_random_sibling(struct virtual_engine *ve)
> > > > +{
> > > > +	return prandom_u32_max(ve->num_siblings);
> > > > +}
> > > > +
> > > > +static int ve_random_other_sibling(struct virtual_engine *ve)
> > > > +{
> > > > +	return 1 + prandom_u32_max(ve->num_siblings - 1);
> > > > +}
> > > > +
> > > >    static void
> > > >    resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
> > > >    {
> > > > @@ -578,8 +613,23 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
> > > >    	    rq->execution_mask != engine->mask)
> > > >    		resubmit_virtual_request(rq, ve);
> > > > -	if (READ_ONCE(ve->request))
> > > > +	/*
> > > > +	 * Reschedule with a new "preferred" sibling.
> > > > +	 *
> > > > +	 * The tasklets are executed in the order of ve->siblings[], so
> > > > +	 * siblings[0] receives preferrential treatment of greedily checking
> > > > +	 * for execution of the virtual engine. At this point, the virtual
> > > > +	 * engine is no longer in the current GPU cache due to idleness or
> > > > +	 * contention, so it can be executed on any without penalty. We
> > > > +	 * re-randomise at this point in order to spread light loads across
> > > > +	 * the system, heavy overlapping loads will continue to be greedily
> > > > +	 * executed by the first available engine.
> > > > +	 */
> > > > +	if (READ_ONCE(ve->request)) {
> > > > +		virtual_xfer_context(ve,
> > > > +				     ve->siblings[ve_random_other_sibling(ve)]);
> > > >    		tasklet_hi_schedule(&ve->base.sched_engine->tasklet);
> > > > +	}
> > > >    }
> > > >    static void __execlists_schedule_out(struct i915_request * const rq,
> > > > @@ -1030,32 +1080,6 @@ first_virtual_engine(struct intel_engine_cs *engine)
> > > >    	return NULL;
> > > >    }
> > > > -static void virtual_xfer_context(struct virtual_engine *ve,
> > > > -				 struct intel_engine_cs *engine)
> > > > -{
> > > > -	unsigned int n;
> > > > -
> > > > -	if (likely(engine == ve->siblings[0]))
> > > > -		return;
> > > > -
> > > > -	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
> > > > -	if (!intel_engine_has_relative_mmio(engine))
> > > > -		lrc_update_offsets(&ve->context, engine);
> > > > -
> > > > -	/*
> > > > -	 * Move the bound engine to the top of the list for
> > > > -	 * future execution. We then kick this tasklet first
> > > > -	 * before checking others, so that we preferentially
> > > > -	 * reuse this set of bound registers.
> > > > -	 */
> > > > -	for (n = 1; n < ve->num_siblings; n++) {
> > > > -		if (ve->siblings[n] == engine) {
> > > > -			swap(ve->siblings[n], ve->siblings[0]);
> > > > -			break;
> > > > -		}
> > > > -	}
> > > > -}
> > > > -
> > > >    static void defer_request(struct i915_request *rq, struct list_head * const pl)
> > > >    {
> > > >    	LIST_HEAD(list);
> > > > @@ -3590,7 +3614,7 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
> > > >    	 * NB This does not force us to execute on this engine, it will just
> > > >    	 * typically be the first we inspect for submission.
> > > >    	 */
> > > > -	swp = prandom_u32_max(ve->num_siblings);
> > > > +	swp = ve_random_sibling(ve);
> > > >    	if (swp)
> > > >    		swap(ve->siblings[swp], ve->siblings[0]);
> > > >    }
> > > > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Intel-gfx] [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines
  2021-11-24 13:55         ` Rodrigo Vivi
@ 2021-11-24 15:09           ` Rodrigo Vivi
  0 siblings, 0 replies; 14+ messages in thread
From: Rodrigo Vivi @ 2021-11-24 15:09 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx, dri-devel, Chris Wilson

On Wed, Nov 24, 2021 at 08:55:39AM -0500, Rodrigo Vivi wrote:
> On Wed, Nov 24, 2021 at 08:56:52AM +0000, Tvrtko Ursulin wrote:
> > 
> > On 23/11/2021 19:52, Rodrigo Vivi wrote:
> > > On Tue, Nov 23, 2021 at 09:39:25AM +0000, Tvrtko Ursulin wrote:
> > > > 
> > > > On 17/11/2021 22:49, Vinay Belgaumkar wrote:
> > > > > From: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > 
> > > > > Every time we come to the end of a virtual engine's context, re-randomise
> > > > > its siblings[]. As we schedule the siblings' tasklets in the order they
> > > > > are in the array, earlier entries are executed first (when idle) and so
> > > > > will be preferred when scheduling the next virtual request. Currently,
> > > > > we only update the array when switching onto a new idle engine, so we
> > > > > prefer to stick on the last executed engine, keeping the work compact.
> > > > > However, it can be beneficial to spread the work out across idle
> > > > > engines, so choose another sibling as our preferred target at the end of
> > > > > the context's execution.
> > > > 
> > > > This partially brings back, from a different angle, the more dynamic
> > > > scheduling behavior which has been lost since bugfix 90a987205c6c
> > > > ("drm/i915/gt: Only swap to a random sibling once upon creation").
> > > 
> > > Shouldn't we use the Fixes tag here since this is targeting to fix one
> > > of the performance regressions of this patch?
> > 
> > Probably not but hard to say. Note that it wasn't a performance regression
> > that was reported but power.
> > 
> > And to go back to what we said elsewhere in the thread, I am actually with
> > you in thinking that in the ideal world we need PnP testing across a variety
> > of workloads and platforms. And "in the ideal world" should really be in the
> > normal world. It is not professional to be reactive to isolated bug reports
> > from users, without being able to see the overall picture.
> 
> We surely need to address the bug reports from users. I'm just asking to address
> that with the smallest fix that we can backport and fit into the product milestones.
> 
> Instead, we are creating another optimization feature in a rush, without proper
> validation.
> 
> I believe it is too risky to add an algorithm like that without a broader test.
> I see a big risk of introducing corner cases that will result in more bug reports
> from other users in the near future.
> 
> So, let's all be professionals and provide a smaller fix for a regression on
> the load balancing scenario and provide a better validation with more data
> to justify this new feature.

Okay, after more IRC discussions I see that patch 2 is also part of the solution
and probably safe.

Let me be clear that my biggest complaint, and the biggest risk, is with race-to-idle
in patch 3: trying to predict the rc6 behavior and increasing the freq to try to
complete the job faster and then get to rc6 faster... That one would need a lot
more validation.

> 
> Thanks,
> Rodrigo.
> 
> > 
> > > > One day we could experiment with using engine busyness as criteria (instead
> > > > of random). Back in the day busyness was kind of the best strategy, although
> > > > sampled at submit, not at the trailing edge like here, but it still may be
> > > > able to settle down to engine configuration better in some scenarios. Only
> > > > testing could say.
> > > > 
> > > > Still, from memory random also wasn't that bad so this should be okay for
> > > > now.
> > > > 
> > > > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > 
> > > Since you reviewed and it looks to be a middle ground point in terms
> > > of when to balance (always, like in the initial implementation, vs
> > > only once, like in 90a987205c6c).
> > > 
> > > If this one is really fixing the regression by itself:
> > > Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
> > > on this patch here.
> > > 
> > > But I still don't want to take the risk of touching the freq with
> > > race to idle until I am convinced that it is absolutely needed and
> > > that we are not breaking the world out there.
> > 
> > Yes agreed in principle, we have users with different priorities.
> > 
> > However the RPS patches in the series, definitely the 1st one which looks at
> > classes versus individual engines, sound plausible to me. Given the absence
> > of automated PnP testing mentioned above, in the past it was usually Chris
> > who was making the above and beyond effort to evaluate changes like these on
> > as many platforms as he could, and with different workloads. Not sure who
> > has the mandate and drive to fill that space but something will need to
> > happen.
> > 
> > Regards,
> > 
> > Tvrtko
> > 
> > > > 
> > > > Regards,
> > > > 
> > > > Tvrtko
> > > > 
> > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > > Cc: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
> > > > > Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> > > > > ---
> > > > >    .../drm/i915/gt/intel_execlists_submission.c  | 80 ++++++++++++-------
> > > > >    1 file changed, 52 insertions(+), 28 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > > > > index ca03880fa7e4..b95bbc8fb91a 100644
> > > > > --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > > > > +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> > > > > @@ -539,6 +539,41 @@ static void execlists_schedule_in(struct i915_request *rq, int idx)
> > > > >    	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
> > > > >    }
> > > > > +static void virtual_xfer_context(struct virtual_engine *ve,
> > > > > +				 struct intel_engine_cs *engine)
> > > > > +{
> > > > > +	unsigned int n;
> > > > > +
> > > > > +	if (likely(engine == ve->siblings[0]))
> > > > > +		return;
> > > > > +
> > > > > +	if (!intel_engine_has_relative_mmio(engine))
> > > > > +		lrc_update_offsets(&ve->context, engine);
> > > > > +
> > > > > +	/*
> > > > > +	 * Move the bound engine to the top of the list for
> > > > > +	 * future execution. We then kick this tasklet first
> > > > > +	 * before checking others, so that we preferentially
> > > > > +	 * reuse this set of bound registers.
> > > > > +	 */
> > > > > +	for (n = 1; n < ve->num_siblings; n++) {
> > > > > +		if (ve->siblings[n] == engine) {
> > > > > +			swap(ve->siblings[n], ve->siblings[0]);
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +}
> > > > > +
> > > > > +static int ve_random_sibling(struct virtual_engine *ve)
> > > > > +{
> > > > > +	return prandom_u32_max(ve->num_siblings);
> > > > > +}
> > > > > +
> > > > > +static int ve_random_other_sibling(struct virtual_engine *ve)
> > > > > +{
> > > > > +	return 1 + prandom_u32_max(ve->num_siblings - 1);
> > > > > +}
> > > > > +
> > > > >    static void
> > > > >    resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
> > > > >    {
> > > > > @@ -578,8 +613,23 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
> > > > >    	    rq->execution_mask != engine->mask)
> > > > >    		resubmit_virtual_request(rq, ve);
> > > > > -	if (READ_ONCE(ve->request))
> > > > > +	/*
> > > > > +	 * Reschedule with a new "preferred" sibling.
> > > > > +	 *
> > > > > +	 * The tasklets are executed in the order of ve->siblings[], so
> > > > > +	 * siblings[0] receives preferrential treatment of greedily checking
> > > > > +	 * for execution of the virtual engine. At this point, the virtual
> > > > > +	 * engine is no longer in the current GPU cache due to idleness or
> > > > > +	 * contention, so it can be executed on any without penalty. We
> > > > > +	 * re-randomise at this point in order to spread light loads across
> > > > > +	 * the system, heavy overlapping loads will continue to be greedily
> > > > > +	 * executed by the first available engine.
> > > > > +	 */
> > > > > +	if (READ_ONCE(ve->request)) {
> > > > > +		virtual_xfer_context(ve,
> > > > > +				     ve->siblings[ve_random_other_sibling(ve)]);
> > > > >    		tasklet_hi_schedule(&ve->base.sched_engine->tasklet);
> > > > > +	}
> > > > >    }
> > > > >    static void __execlists_schedule_out(struct i915_request * const rq,
> > > > > @@ -1030,32 +1080,6 @@ first_virtual_engine(struct intel_engine_cs *engine)
> > > > >    	return NULL;
> > > > >    }
> > > > > -static void virtual_xfer_context(struct virtual_engine *ve,
> > > > > -				 struct intel_engine_cs *engine)
> > > > > -{
> > > > > -	unsigned int n;
> > > > > -
> > > > > -	if (likely(engine == ve->siblings[0]))
> > > > > -		return;
> > > > > -
> > > > > -	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
> > > > > -	if (!intel_engine_has_relative_mmio(engine))
> > > > > -		lrc_update_offsets(&ve->context, engine);
> > > > > -
> > > > > -	/*
> > > > > -	 * Move the bound engine to the top of the list for
> > > > > -	 * future execution. We then kick this tasklet first
> > > > > -	 * before checking others, so that we preferentially
> > > > > -	 * reuse this set of bound registers.
> > > > > -	 */
> > > > > -	for (n = 1; n < ve->num_siblings; n++) {
> > > > > -		if (ve->siblings[n] == engine) {
> > > > > -			swap(ve->siblings[n], ve->siblings[0]);
> > > > > -			break;
> > > > > -		}
> > > > > -	}
> > > > > -}
> > > > > -
> > > > >    static void defer_request(struct i915_request *rq, struct list_head * const pl)
> > > > >    {
> > > > >    	LIST_HEAD(list);
> > > > > @@ -3590,7 +3614,7 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve)
> > > > >    	 * NB This does not force us to execute on this engine, it will just
> > > > >    	 * typically be the first we inspect for submission.
> > > > >    	 */
> > > > > -	swp = prandom_u32_max(ve->num_siblings);
> > > > > +	swp = ve_random_sibling(ve);
> > > > >    	if (swp)
> > > > >    		swap(ve->siblings[swp], ve->siblings[0]);
> > > > >    }
> > > > > 

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-11-24 15:09 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-17 22:49 [PATCH 0/3] drm/i915/gt: RPS tuning for light media playback Vinay Belgaumkar
2021-11-17 22:49 ` [PATCH 1/3] drm/i915/gt: Spread virtual engines over idle engines Vinay Belgaumkar
2021-11-23  9:39   ` Tvrtko Ursulin
2021-11-23 19:52     ` [Intel-gfx] " Rodrigo Vivi
2021-11-24  8:56       ` Tvrtko Ursulin
2021-11-24 13:55         ` Rodrigo Vivi
2021-11-24 15:09           ` Rodrigo Vivi
2021-11-17 22:49 ` [PATCH 2/3] drm/i915/gt: Compare average group occupancy for RPS evaluation Vinay Belgaumkar
2021-11-23 17:35   ` Belgaumkar, Vinay
2021-11-17 22:49 ` [PATCH 3/3] drm/i915/gt: Improve "race-to-idle" at low frequencies Vinay Belgaumkar
2021-11-22 18:44   ` Rodrigo Vivi
2021-11-23  9:17     ` Tvrtko Ursulin
2021-11-23 16:53       ` Vivi, Rodrigo
2021-11-23 17:37   ` Belgaumkar, Vinay

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).