* [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes
@ 2020-04-14 16:14 Chris Wilson
  2020-04-14 16:14   ` [Intel-gfx] " Chris Wilson
                   ` (5 more replies)
  0 siblings, 6 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-14 16:14 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

By the time we respond to the RPS interrupt [inside a worker], the GPU
may be running a different workload. As we look to make the evaluation
intervals shorter, these spikes are more likely to occur. Let's try to
smooth over the spikes in the workload by comparing the EI interrupt
[up/down events] with the most recently completed EI; if both say up,
then increase the clocks; if they disagree, stay the same. In principle,
this means we now take 2 up EIs to increase into the next bin, and
similarly 2 down EIs to decrease. However, if the worker runs fast
enough, the previous EI in the registers will be the same one that
triggered the interrupt, so responsiveness remains unaffected. [Under the
current scheme where EIs are on the order of 10ms, it is likely that this
is true and we compare the interrupt with the EI that caused it.]

As usual, Valleyview just likes to be different; and there, since we are
manually evaluating the threshold, we cannot sample the previous EI
registers.
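
For illustration only, a minimal self-contained userspace sketch of the
agreement filter described above; the names and numbers are hypothetical
stand-ins rather than the i915 helpers (the real logic is the
confirm_up()/confirm_down() added in the diff below):

#include <stdio.h>

/* Outcome of one evaluation interval (EI). */
enum ei_event { EI_NONE, EI_UP, EI_DOWN };

/*
 * Only act when the interrupt (the EI that just expired) and the most
 * recently completed EI read back from the hardware agree; otherwise
 * hold the current frequency.  If the worker runs promptly, both samples
 * are the same EI and responsiveness is unchanged; a lone spike that
 * disagrees with the completed EI is smoothed over.
 */
static int apply_rps_event(enum ei_event irq, enum ei_event prev_ei,
			   int cur_freq, int step)
{
	if (irq == EI_UP && prev_ei == EI_UP)
		return cur_freq + step;		/* both say up: raise clocks */
	if (irq == EI_DOWN && prev_ei == EI_DOWN)
		return cur_freq - step;		/* both say down: lower clocks */
	return cur_freq;			/* disagreement: hold */
}

int main(void)
{
	int freq = 300;	/* pretend MHz, purely illustrative */

	/* Spike: interrupt says up, the completed EI still says down. */
	freq = apply_rps_event(EI_UP, EI_DOWN, freq, 50);	/* held at 300 */
	/* Sustained load: both samples agree, so we upclock. */
	freq = apply_rps_event(EI_UP, EI_UP, freq, 50);		/* now 350 */
	printf("final frequency: %d MHz\n", freq);
	return 0;
}

The point of the filter is simply that a single EI which disagrees with
the last completed one never changes the frequency on its own.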

References: https://gitlab.freedesktop.org/drm/intel/-/issues/1698
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Andi Shyti <andi.shyti@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 59 ++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 86110458e2a7..367132092bed 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -1416,6 +1416,11 @@ static void vlv_c0_read(struct intel_uncore *uncore, struct intel_rps_ei *ei)
 	ei->media_c0 = intel_uncore_read(uncore, VLV_MEDIA_C0_COUNT);
 }
 
+static bool vlv_manual_ei(u32 pm_iir)
+{
+	return pm_iir & GEN6_PM_RP_UP_EI_EXPIRED;
+}
+
 static u32 vlv_wa_c0_ei(struct intel_rps *rps, u32 pm_iir)
 {
 	struct intel_uncore *uncore = rps_to_uncore(rps);
@@ -1423,7 +1428,7 @@ static u32 vlv_wa_c0_ei(struct intel_rps *rps, u32 pm_iir)
 	struct intel_rps_ei now;
 	u32 events = 0;
 
-	if ((pm_iir & GEN6_PM_RP_UP_EI_EXPIRED) == 0)
+	if (!vlv_manual_ei(pm_iir))
 		return 0;
 
 	vlv_c0_read(uncore, &now);
@@ -1456,6 +1461,37 @@ static u32 vlv_wa_c0_ei(struct intel_rps *rps, u32 pm_iir)
 	return events;
 }
 
+static bool __confirm_ei(struct intel_rps *rps,
+			 i915_reg_t ei_sample,
+			 i915_reg_t ei_threshold)
+{
+	struct intel_uncore *uncore = rps_to_uncore(rps);
+	u32 threshold, sample;
+
+	sample = intel_uncore_read(uncore, ei_sample);
+	threshold = intel_uncore_read(uncore, ei_threshold);
+
+	sample &= GEN6_CURBSYTAVG_MASK;
+
+	return sample > threshold;
+}
+
+static bool confirm_up(struct intel_rps *rps, u32 pm_iir)
+{
+	if (vlv_manual_ei(pm_iir))
+		return true;
+
+	return __confirm_ei(rps, GEN6_RP_PREV_UP, GEN6_RP_UP_THRESHOLD);
+}
+
+static bool confirm_down(struct intel_rps *rps, u32 pm_iir)
+{
+	if (vlv_manual_ei(pm_iir))
+		return true;
+
+	return !__confirm_ei(rps, GEN6_RP_PREV_UP, GEN6_RP_UP_THRESHOLD);
+}
+
 static void rps_work(struct work_struct *work)
 {
 	struct intel_rps *rps = container_of(work, typeof(*rps), work);
@@ -1484,10 +1520,11 @@ static void rps_work(struct work_struct *work)
 	max = rps->max_freq_softlimit;
 	if (client_boost)
 		max = rps->max_freq;
-	if (client_boost && new_freq < rps->boost_freq) {
+	if (client_boost && new_freq <= rps->boost_freq) {
 		new_freq = rps->boost_freq;
 		adj = 0;
-	} else if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) {
+	} else if (pm_iir & GEN6_PM_RP_UP_THRESHOLD &&
+		   confirm_up(rps, pm_iir)) {
 		if (adj > 0)
 			adj *= 2;
 		else /* CHV needs even encode values */
@@ -1497,13 +1534,15 @@ static void rps_work(struct work_struct *work)
 			adj = 0;
 	} else if (client_boost) {
 		adj = 0;
-	} else if (pm_iir & GEN6_PM_RP_DOWN_TIMEOUT) {
+	} else if (pm_iir & GEN6_PM_RP_DOWN_TIMEOUT &&
+		   confirm_down(rps, pm_iir)) {
 		if (rps->cur_freq > rps->efficient_freq)
 			new_freq = rps->efficient_freq;
 		else if (rps->cur_freq > rps->min_freq_softlimit)
 			new_freq = rps->min_freq_softlimit;
 		adj = 0;
-	} else if (pm_iir & GEN6_PM_RP_DOWN_THRESHOLD) {
+	} else if (pm_iir & GEN6_PM_RP_DOWN_THRESHOLD &&
+		   confirm_down(rps, pm_iir)) {
 		if (adj < 0)
 			adj *= 2;
 		else /* CHV needs even encode values */
@@ -1511,8 +1550,8 @@ static void rps_work(struct work_struct *work)
 
 		if (new_freq <= rps->min_freq_softlimit)
 			adj = 0;
-	} else { /* unknown event */
-		adj = 0;
+	} else { /* unknown event, or unwanted */
+		goto unlock;
 	}
 
 	rps->last_adj = adj;
@@ -1529,8 +1568,9 @@ static void rps_work(struct work_struct *work)
 	    (adj > 0 && rps->power.mode == LOW_POWER))
 		rps->last_adj = 0;
 
-	/* sysfs frequency interfaces may have snuck in while servicing the
-	 * interrupt
+	/*
+	 * sysfs frequency limits may have snuck in while
+	 * servicing the interrupt
 	 */
 	new_freq += adj;
 	new_freq = clamp_t(int, new_freq, min, max);
@@ -1540,6 +1580,7 @@ static void rps_work(struct work_struct *work)
 		rps->last_adj = 0;
 	}
 
+unlock:
 	mutex_unlock(&rps->lock);
 
 out:
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 34+ messages in thread

* [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
@ 2020-04-14 16:14   ` Chris Wilson
  2020-04-15  0:56 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes Patchwork
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-14 16:14 UTC (permalink / raw)
  To: intel-gfx
  Cc: Chris Wilson, Mika Kuoppala, Andi Shyti, Lyude Paul,
	Francisco Jerez, stable

Try to make RPS dramatically more responsive by shrinking the evaluation
intervals by a factor of 100! The issue is that, as we now park the GPU
rapidly upon idling, a short or bursty workload such as the composited
desktop never sustains enough work to fill and complete an evaluation
window. As such, the frequency we program remains stuck. This was first
reported as once boosted, we never relinquished the boost [see commit
21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
it equally applies in the other direction for bursty workloads that
*need* low latency, like desktop animations.

What we could try is to preserve the incomplete EI history across idling;
it is not clear whether that would be effective, nor whether the
presumption of continuous workloads is accurate. A clearer path seems to
be to treat it as symptomatic that we fail to handle bursty workloads with
the current EI, and seek to address that by shrinking the EI so the
evaluations are run much more often.

This will likely entail more frequent interrupts, and by the time we
process the interrupt in the bottom half [from inside a worker], the
workload on the GPU has changed. To address the changeable nature, in
the previous patch we compared the previous complete EI with the
interrupt request and only up/downclock if both agree. The impact of
asking for, and presumably receiving, more interrupts is still to be
determined and mitigations sought. The first idea is to differentiate
between up/down responsiveness and make upclocking more responsive than
downclocking. This should both help thwart jitter on bursty workloads by
making it easier to increase than it is to decrease frequencies, and
reduce the number of interrupts we would need to process.
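
For reference, a quick conversion of the old and new settings using the
1280ns unit noted in rps_set_power() (a back-of-the-envelope check, not
driver code; the in-code comments quote the nominal microsecond figures):

#include <stdio.h>

int main(void)
{
	const double unit_us = 1.28;			/* 1280 ns per EI unit */
	const int old_up = 16000, new_up = 160;		/* LOW_POWER upclock EI */
	const int old_down = 32000, new_down = 1600;	/* downclock EI */

	printf("upclock EI:   %.1f ms -> %.1f us\n",
	       old_up * unit_us / 1000.0, new_up * unit_us);
	printf("downclock EI: %.1f ms -> %.2f ms\n",
	       old_down * unit_us / 1000.0, new_down * unit_us / 1000.0);
	/* prints: upclock EI:   20.5 ms -> 204.8 us
	 *         downclock EI: 41.0 ms -> 2.05 ms */
	return 0;
}

So the nominal "160us" upclock window is roughly 205us of wall-clock
time, against roughly 13-20ms before.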

Fixes: 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1698
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Andi Shyti <andi.shyti@intel.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Francisco Jerez <currojerez@riseup.net>
Cc: <stable@vger.kernel.org> # v5.5+
---
 drivers/gpu/drm/i915/gt/intel_rps.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 367132092bed..47ddb25edc97 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -542,37 +542,38 @@ static void rps_set_power(struct intel_rps *rps, int new_power)
 	/* Note the units here are not exactly 1us, but 1280ns. */
 	switch (new_power) {
 	case LOW_POWER:
-		/* Upclock if more than 95% busy over 16ms */
-		ei_up = 16000;
+		/* Upclock if more than 95% busy over 160us */
+		ei_up = 160;
 		threshold_up = 95;
 
-		/* Downclock if less than 85% busy over 32ms */
-		ei_down = 32000;
+		/* Downclock if less than 85% busy over 1600us */
+		ei_down = 1600;
 		threshold_down = 85;
 		break;
 
 	case BETWEEN:
-		/* Upclock if more than 90% busy over 13ms */
-		ei_up = 13000;
+		/* Upclock if more than 90% busy over 160us */
+		ei_up = 160;
 		threshold_up = 90;
 
-		/* Downclock if less than 75% busy over 32ms */
-		ei_down = 32000;
+		/* Downclock if less than 75% busy over 1600us */
+		ei_down = 1600;
 		threshold_down = 75;
 		break;
 
 	case HIGH_POWER:
-		/* Upclock if more than 85% busy over 10ms */
-		ei_up = 10000;
+		/* Upclock if more than 85% busy over 160us */
+		ei_up = 160;
 		threshold_up = 85;
 
-		/* Downclock if less than 60% busy over 32ms */
-		ei_down = 32000;
+		/* Downclock if less than 60% busy over 1600us */
+		ei_down = 1600;
 		threshold_down = 60;
 		break;
 	}
 
-	/* When byt can survive without system hang with dynamic
+	/*
+	 * When byt can survive without system hang with dynamic
 	 * sw freq adjustments, this restriction can be lifted.
 	 */
 	if (IS_VALLEYVIEW(i915))
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 16:14   ` [Intel-gfx] " Chris Wilson
@ 2020-04-14 16:35     ` Chris Wilson
  -1 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-14 16:35 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala, Andi Shyti, Lyude Paul, Francisco Jerez, stable

Quoting Chris Wilson (2020-04-14 17:14:23)
> Try to make RPS dramatically more responsive by shrinking the evaluation
> intervals by a factor of 100! The issue is that, as we now park the GPU
> rapidly upon idling, a short or bursty workload such as the composited
> desktop never sustains enough work to fill and complete an evaluation
> window. As such, the frequency we program remains stuck. This was first
> reported as once boosted, we never relinquished the boost [see commit
> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> it equally applies in the other direction for bursty workloads that
> *need* low latency, like desktop animations.
> 
> What we could try is to preserve the incomplete EI history across idling;
> it is not clear whether that would be effective, nor whether the
> presumption of continuous workloads is accurate. A clearer path seems to
> be to treat it as symptomatic that we fail to handle bursty workloads with
> the current EI, and seek to address that by shrinking the EI so the
> evaluations are run much more often.
> 
> This will likely entail more frequent interrupts, and by the time we
> process the interrupt in the bottom half [from inside a worker], the
> workload on the GPU has changed. To address the changeable nature, in
> the previous patch we compared the previous complete EI with the
> interrupt request and only up/downclock if both agree. The impact of
> asking for, and presumably receiving, more interrupts is still to be
> determined and mitigations sought. The first idea is to differentiate
> between up/down responsiveness and make upclocking more responsive than
> downclocking. This should both help thwart jitter on bursty workloads by
> making it easier to increase than it is to decrease frequencies, and
> reduce the number of interrupts we would need to process.

Another worry I'd like to raise is that by reducing the EI we risk
unstable evaluations. I'm not sure how accurate the HW is, and I worry
about borderline workloads (if that is possible), but mainly the worry is
how the HW is sampling.

The other unmentioned unknown is the latency in reprogramming the
frequency. At what point does it start to become a significant factor?
I'm presuming the RPS evaluation itself is free, until it has to talk
across the chip to send an interrupt.
-Chris

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 16:35     ` [Intel-gfx] " Chris Wilson
@ 2020-04-14 19:39       ` Francisco Jerez
  -1 siblings, 0 replies; 34+ messages in thread
From: Francisco Jerez @ 2020-04-14 19:39 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala, Andi Shyti, Lyude Paul, stable


Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Chris Wilson (2020-04-14 17:14:23)
>> Try to make RPS dramatically more responsive by shrinking the evaluation
>> intervals by a factor of 100! The issue is that, as we now park the GPU
>> rapidly upon idling, a short or bursty workload such as the composited
>> desktop never sustains enough work to fill and complete an evaluation
>> window. As such, the frequency we program remains stuck. This was first
>> reported as once boosted, we never relinquished the boost [see commit
>> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
>> it equally applies in the other direction for bursty workloads that
>> *need* low latency, like desktop animations.
>> 
>> What we could try is to preserve the incomplete EI history across idling;
>> it is not clear whether that would be effective, nor whether the
>> presumption of continuous workloads is accurate. A clearer path seems to
>> be to treat it as symptomatic that we fail to handle bursty workloads with
>> the current EI, and seek to address that by shrinking the EI so the
>> evaluations are run much more often.
>> 
>> This will likely entail more frequent interrupts, and by the time we
>> process the interrupt in the bottom half [from inside a worker], the
>> workload on the GPU has changed. To address the changeable nature, in
>> the previous patch we compared the previous complete EI with the
>> interrupt request and only up/downclock if both agree. The impact of
>> asking for, and presumably receiving, more interrupts is still to be
>> determined and mitigations sought. The first idea is to differentiate
>> between up/down responsiveness and make upclocking more responsive than
>> downclocking. This should both help thwart jitter on bursty workloads by
>> making it easier to increase than it is to decrease frequencies, and
>> reduce the number of interrupts we would need to process.
>
> Another worry I'd like to raise, is that by reducing the EI we risk
> unstable evaluations. I'm not sure how accurate the HW is, and I worry
> about borderline workloads (if that is possible) but mainly the worry is
> how the HW is sampling.
>
> The other unmentioned unknown is the latency in reprogramming the
> frequency. At what point does it start to become a significant factor?
> I'm presuming the RPS evaluation itself is free, until it has to talk
> across the chip to send an interrupt.
> -Chris

At least on ICL the problem which this patch and 21abf0bf168d were
working around seems to have to do with RPS interrupt delivery being
inadvertently blocked for extended periods of time.  Looking at the GPU
utilization and RPS events on a graph I could see the GPU being stuck at
low frequency without any RPS interrupts firing, for a time interval
orders of magnitude greater than the EI we're theoretically programming
today.  IOW it seems like the real problem isn't that our EIs are too
long, but that we're missing a bunch of them.

The solution I was suggesting for this on IRC during the last couple of
days wouldn't have any of the drawbacks you mention above, I'll send it
to this list in a moment if the general approach seems okay to you:

https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86

That said it *might* be helpful to reduce the EIs we use right now in
addition, but a factor of 100 seems over the top since that will cause
the evaluation interval to be roughly two orders of magnitude shorter
than the rendering time of a typical frame, which can lead to massive
oscillations even in workloads that use a small fraction of the GPU time
to render a single frame.  Maybe we want something in between?
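
To put rough numbers on the oscillation concern, a toy model (all values
are illustrative assumptions: a 60Hz vblank period, 4ms of rendering per
frame, and a single ~205us window with 95%/85% thresholds for both
decisions, whereas the patch actually programs a longer downclock window):

#include <stdio.h>

int main(void)
{
	const double frame_us = 16700.0;	/* 60 Hz vblank period */
	const double busy_us = 4000.0;		/* GPU busy ~24% of the frame */
	const double ei_us = 204.8;		/* 160 units x 1.28 us */
	double t;
	int up = 0, down = 0;

	for (t = 0.0; t + ei_us <= frame_us; t += ei_us) {
		double busy_in_ei = 0.0, load;

		if (t < busy_us)
			busy_in_ei = (busy_us - t < ei_us) ? busy_us - t : ei_us;

		load = busy_in_ei / ei_us;
		if (load > 0.95)
			up++;		/* window qualifies as an up event */
		else if (load < 0.85)
			down++;		/* window qualifies as a down event */
	}
	/* prints "per frame: 19 up, 62 down": a burst of upclock events
	 * followed by a long run of downclock events, every single frame. */
	printf("per frame: %d up, %d down\n", up, down);
	return 0;
}

Even at ~24% average utilisation, every frame produces a burst of ">95%
busy" windows followed by a long run of idle windows, so the requested
frequency is pushed up and back down once per frame instead of settling.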


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 19:39       ` [Intel-gfx] " Francisco Jerez
@ 2020-04-14 20:13         ` Chris Wilson
  -1 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-14 20:13 UTC (permalink / raw)
  To: Francisco Jerez, intel-gfx; +Cc: stable

Quoting Francisco Jerez (2020-04-14 20:39:48)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Chris Wilson (2020-04-14 17:14:23)
> >> Try to make RPS dramatically more responsive by shrinking the evaluation
> >> intervals by a factor of 100! The issue is that, as we now park the GPU
> >> rapidly upon idling, a short or bursty workload such as the composited
> >> desktop never sustains enough work to fill and complete an evaluation
> >> window. As such, the frequency we program remains stuck. This was first
> >> reported as once boosted, we never relinquished the boost [see commit
> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> >> it equally applies in the other direction for bursty workloads that
> >> *need* low latency, like desktop animations.
> >> 
> >> What we could try is to preserve the incomplete EI history across idling;
> >> it is not clear whether that would be effective, nor whether the
> >> presumption of continuous workloads is accurate. A clearer path seems to
> >> be to treat it as symptomatic that we fail to handle bursty workloads with
> >> the current EI, and seek to address that by shrinking the EI so the
> >> evaluations are run much more often.
> >> 
> >> This will likely entail more frequent interrupts, and by the time we
> >> process the interrupt in the bottom half [from inside a worker], the
> >> workload on the GPU has changed. To address the changeable nature, in
> >> the previous patch we compared the previous complete EI with the
> >> interrupt request and only up/downclock if both agree. The impact of
> >> asking for, and presumably receiving, more interrupts is still to be
> >> determined and mitigations sought. The first idea is to differentiate
> >> between up/down responsiveness and make upclocking more responsive than
> >> downclocking. This should both help thwart jitter on bursty workloads by
> >> making it easier to increase than it is to decrease frequencies, and
> >> reduce the number of interrupts we would need to process.
> >
> > Another worry I'd like to raise, is that by reducing the EI we risk
> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
> > about borderline workloads (if that is possible) but mainly the worry is
> > how the HW is sampling.
> >
> > The other unmentioned unknown is the latency in reprogramming the
> > frequency. At what point does it start to become a significant factor?
> > I'm presuming the RPS evaluation itself is free, until it has to talk
> > across the chip to send an interrupt.
> > -Chris
> 
> At least on ICL the problem which this patch and 21abf0bf168d were
> working around seems to have to do with RPS interrupt delivery being
> inadvertently blocked for extended periods of time.  Looking at the GPU
> utilization and RPS events on a graph I could see the GPU being stuck at
> low frequency without any RPS interrupts firing, for a time interval
> orders of magnitude greater than the EI we're theoretically programming
> today.  IOW it seems like the real problem isn't that our EIs are too
> long, but that we're missing a bunch of them.
> 
> The solution I was suggesting for this on IRC during the last couple of
> days wouldn't have any of the drawbacks you mention above, I'll send it
> to this list in a moment if the general approach seems okay to you:
> 
> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86

We were explicitly told to mask the interrupt generation at source
to conserve power. So I would hope for a statement as to whether that is
still a requirement from the HW architects; but I can't see why we would
not apply the mask and that this is just paper. If the observation about
forcewake tallies, would this not suggest that it is still conserving
power on icl?

I haven't looked at whether I see the same phenomenon as you [missing
interrupts on icl] locally, but I was expecting something like the bug
report since the observation that render times are less than EI was
causing the clocks to stay high. And I noticed your problem statement
and was hopeful for a link.

They sound like two different problems. (Underclocking for animations is
not icl specific.)

> That said it *might* be helpful to reduce the EIs we use right now in
> addition, but a factor of 100 seems over the top since that will cause
> the evaluation interval to be roughly two orders of magnitude shorter
> than the rendering time of a typical frame, which can lead to massive
> oscillations even in workloads that use a small fraction of the GPU time
> to render a single frame.  Maybe we want something in between?

Probably; as you can guess these were pulled out of nowhere based on the
observation that the frame lengths are much shorter than the current EI
and that ramping up to maxclocks within a single frame of animation would
take about 4 samples per frame. Based on the reporter's observations, we
do have to ramp up very quickly for a single frame of rendering in order
to hit the vblank, as we are ramping down afterwards.

With a target of 4 samples within say 1ms, 160us isn't too far off the
mark. (We have to allow some extra time to catch up rendering.)

As for steady state workloads, I'm optimistic the smoothing helps. (It's
harder to find steady state, unthrottled workloads!)
-Chris
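
A rough sanity check of the numbers above, assuming the 1280ns unit from
rps_set_power() and the adj doubling on consecutive up events in
rps_work() (indicative only; the RPn..RP0 span and bin size vary by
platform):

#include <stdio.h>

int main(void)
{
	const double unit_us = 1.28;	/* 1280 ns per programmed EI unit */
	const int samples = 4, ei_units = 160;
	/* adj doubles on each consecutive up event (starting at 1 on
	 * most parts): +1 +2 +4 +8 */
	const int bins = 1 + 2 + 4 + 8;

	printf("%d up EIs take %.0f us and step %d bins\n",
	       samples, samples * ei_units * unit_us, bins);
	/* prints: 4 up EIs take 819 us and step 15 bins */
	return 0;
}

So four upclock evaluations fit within the stated 1ms budget, with a
cumulative step of 15 frequency bins.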

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 20:13         ` Chris Wilson
@ 2020-04-14 21:00           ` Francisco Jerez
  -1 siblings, 0 replies; 34+ messages in thread
From: Francisco Jerez @ 2020-04-14 21:00 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: stable, Wilson, Chris P


Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Francisco Jerez (2020-04-14 20:39:48)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Quoting Chris Wilson (2020-04-14 17:14:23)
>> >> Try to make RPS dramatically more responsive by shrinking the evaluation
>> >> intervals by a factor of 100! The issue is that, as we now park the GPU
>> >> rapidly upon idling, a short or bursty workload such as the composited
>> >> desktop never sustains enough work to fill and complete an evaluation
>> >> window. As such, the frequency we program remains stuck. This was first
>> >> reported as once boosted, we never relinquished the boost [see commit
>> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
>> >> it equally applies in the other direction for bursty workloads that
>> >> *need* low latency, like desktop animations.
>> >> 
>> >> What we could try is to preserve the incomplete EI history across idling;
>> >> it is not clear whether that would be effective, nor whether the
>> >> presumption of continuous workloads is accurate. A clearer path seems to
>> >> be to treat it as symptomatic that we fail to handle bursty workloads with
>> >> the current EI, and seek to address that by shrinking the EI so the
>> >> evaluations are run much more often.
>> >> 
>> >> This will likely entail more frequent interrupts, and by the time we
>> >> process the interrupt in the bottom half [from inside a worker], the
>> >> workload on the GPU has changed. To address the changeable nature, in
>> >> the previous patch we compared the previous complete EI with the
>> >> interrupt request and only up/downclock if both agree. The impact of
>> >> asking for, and presumably receiving, more interrupts is still to be
>> >> determined and mitigations sought. The first idea is to differentiate
>> >> between up/down responsiveness and make upclocking more responsive than
>> >> downclocking. This should both help thwart jitter on bursty workloads by
>> >> making it easier to increase than it is to decrease frequencies, and
>> >> reduce the number of interrupts we would need to process.
>> >
>> > Another worry I'd like to raise, is that by reducing the EI we risk
>> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
>> > about borderline workloads (if that is possible) but mainly the worry is
>> > how the HW is sampling.
>> >
>> > The other unmentioned unknown is the latency in reprogramming the
>> > frequency. At what point does it start to become a significant factor?
>> > I'm presuming the RPS evaluation itself is free, until it has to talk
>> > across the chip to send an interrupt.
>> > -Chris
>> 
>> At least on ICL the problem which this patch and 21abf0bf168d were
>> working around seems to have to do with RPS interrupt delivery being
>> inadvertently blocked for extended periods of time.  Looking at the GPU
>> utilization and RPS events on a graph I could see the GPU being stuck at
>> low frequency without any RPS interrupts firing, for a time interval
>> orders of magnitude greater than the EI we're theoretically programming
>> today.  IOW it seems like the real problem isn't that our EIs are too
>> long, but that we're missing a bunch of them.
>> 
>> The solution I was suggesting for this on IRC during the last couple of
>> days wouldn't have any of the drawbacks you mention above, I'll send it
>> to this list in a moment if the general approach seems okay to you:
>> 
>> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86
>
> We were explicitly told to mask the interrupt generation at source
> to conserve power. So I would hope for a statement as to whether that is
> still a requirement from the HW architects; but I can't see why we would
> not apply the mask and that this is just paper. If the observation about
> forcewake tallies, would this not suggest that it is still conserving
> power on icl?
>

Yeah, it's hard to see how disabling interrupt generation could save any
additional power in a unit which is powered off -- At least on ICL where
even the interrupt masking register is powered off...

> I haven't looked at whether I see the same phenomenon as you [missing
> interrupts on icl] locally, but I was expecting something like the bug
> report since the observation that render times are less than EI was
> causing the clocks to stay high. And I noticed your problem statement
> and was hopeful for a link.
>

Right.  In the workloads I was looking at last week the GPU would often
be active for periods of time several times greater than the EI, and we
would still fail to clock up.

> They sound like two different problems. (Underclocking for animations is
> not icl specific.)
>

Sure.  But it seems like the underclocking problem has been greatly
exacerbated by 21abf0bf168d, which may have been mitigating the same ICL
problem I was looking at leading to RPS interrupt loss.  Maybe
21abf0bf168d wouldn't be necessary with working RPS interrupt delivery?
And without 21abf0bf168d platforms earlier than ICL wouldn't have as
much of an underclocking problem either.

>> That said it *might* be helpful to reduce the EIs we use right now in
>> addition, but a factor of 100 seems over the top since that will cause
>> the evaluation interval to be roughly two orders of magnitude shorter
>> than the rendering time of a typical frame, which can lead to massive
>> oscillations even in workloads that use a small fraction of the GPU time
>> to render a single frame.  Maybe we want something in between?
>
> Probably; as you can guess these were pulled out of nowhere based on the
> observation that the frame lengths are much shorter than the current EI
> and that ramping up to maxclocks within a single frame of animation would
> take about 4 samples per frame. Based on the reporter's observations, we
> do have to ramp up very quickly for a single frame of rendering in order
> to hit the vblank, as we are ramping down afterwards.
>
> With a target of 4 samples within say 1ms, 160us isn't too far off the
> mark. (We have to allow some extra time to catch up rendering.)


How about we stop ramping down after the rendering of a single frame?
It's not like we save any power by doing that, since the GPU seems to be
forced to the minimum frequency for as long as it remains parked anyway.
If the GPU remains idle long enough for the RPS utilization counters to
drop below the threshold and qualify for a ramp-down the RPS should send
us an interrupt, at which point we will ramp down the frequency.

Unconditionally ramping down on parking seems to disturb the accuracy of
that RPS feedback loop, which then needs to be worked around by reducing
the averaging window of the RPS to a tiny fraction of the oscillation
period of any typical GPU workload, which is going to prevent the RPS
from seeing a whole oscillation period before it reacts, which is almost
guaranteed to have a serious energy-efficiency cost.

>
> As for steady state workloads, I'm optimistic the smoothing helps. (It's
> harder to find steady state, unthrottled workloads!)

I'm curious, how is that smoothing you do in PATCH 1 better than simply
setting 2x the EIs? (Which would also mean half the interrupt processing
overhead as in this series)

> -Chris
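
As a rough way to frame that question, a tiny arithmetic sketch of the
trade-off implied by the PATCH 1 description (T is a hypothetical EI
length; this is a reading of the patches, not a measurement):

#include <stdio.h>

int main(void)
{
	const double T = 204.8;	/* hypothetical EI length in microseconds */

	/*
	 * PATCH 1 smoothing: EI = T, act only when the interrupt and the
	 * previously completed EI agree.  Best case the worker reads back
	 * the very EI that raised the interrupt (one window); worst case
	 * it needs the next window as well.
	 */
	printf("smoothing: best %.0f us, worst %.0f us, ~%.1f irqs/ms\n",
	       T, 2 * T, 1000.0 / T);

	/*
	 * Doubling the EI instead: every decision waits two windows' worth
	 * of time, but only half as many interrupts are raised.
	 */
	printf("2x EI:     always %.0f us, ~%.1f irqs/ms\n",
	       2 * T, 1000.0 / (2 * T));
	return 0;
}

Both schemes reject an isolated spike and share the same worst case; the
difference is the single-window best case versus half the interrupt rate.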


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 19:39       ` [Intel-gfx] " Francisco Jerez
@ 2020-04-14 21:35         ` Chris Wilson
  -1 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-14 21:35 UTC (permalink / raw)
  To: Francisco Jerez, intel-gfx; +Cc: stable

Quoting Francisco Jerez (2020-04-14 20:39:48)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Chris Wilson (2020-04-14 17:14:23)
> >> Try to make RPS dramatically more responsive by shrinking the evaluation
> >> intervales by a factor of 100! The issue is as we now park the GPU
> >> rapidly upon idling, a short or bursty workload such as the composited
> >> desktop never sustains enough work to fill and complete an evaluation
> >> window. As such, the frequency we program remains stuck. This was first
> >> reported as once boosted, we never relinquished the boost [see commit
> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> >> it equally applies in the other direction for bursty workloads that
> >> *need* low latency, like desktop animations.
> >> 
> >> What we could try is preserve the incomplete EI history across idling,
> >> it is not clear whether that would be effective, nor whether the
> >> presumption of continuous workloads is accurate. A clearer path seems to
> >> treat it as symptomatic that we fail to handle bursty workloads with the
> >> current EI, and seek to address that by shrinking the EI so the
> >> evaluations are run much more often.
> >> 
> >> This will likely entail more frequent interrupts, and by the time we
> >> process the interrupt in the bottom half [from inside a worker], the
> >> workload on the GPU has changed. To address the changeable nature, in
> >> the previous patch we compared the previous complete EI with the
> >> interrupt request and only up/down clock if both agree. The impact of
> >> asking for, and presumably, receiving more interrupts is still to be
> >> determined and mitigations sought. The first idea is to differentiate
> >> between up/down responsiveness and make upclocking more responsive than
> >> downclocking. This should both help thwart jitter on bursty workloads by
> >> making it easier to increase than it is to decrease frequencies, and
> >> reduce the number of interrupts we would need to process.
> >
> > Another worry I'd like to raise, is that by reducing the EI we risk
> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
> > about borderline workloads (if that is possible) but mainly the worry is
> > how the HW is sampling.
> >
> > The other unmentioned unknown is the latency in reprogramming the
> > frequency. At what point does it start to become a significant factor?
> > I'm presuming the RPS evaluation itself is free, until it has to talk
> > across the chip to send an interrupt.
> > -Chris
> 
> At least on ICL the problem which this patch and 21abf0bf168d were
> working around seems to have to do with RPS interrupt delivery being
> inadvertently blocked for extended periods of time.  Looking at the GPU
> utilization and RPS events on a graph I could see the GPU being stuck at
> low frequency without any RPS interrupts firing, for a time interval
> orders of magnitude greater than the EI we're theoretically programming
> today.  IOW it seems like the real problem isn't that our EIs are too
> long, but that we're missing a bunch of them.

Just stuck a pr_err() into gen11_handle_rps_events(), and moments
before we were throttled (and so capped at 100% load), interrupts were
being delivered:

[  887.521727] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.538039] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.538253] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.538555] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.554731] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.554857] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.555604] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.571373] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.571496] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.571646] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.588199] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.588380] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.588692] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.604718] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.604937] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.621591] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.621755] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.637988] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.638166] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.638803] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.654812] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.655029] gen11_rps_irq_handler: { iir:20, events:20 }
[  887.671423] gen11_rps_irq_handler: { iir:10, events:10 }
[  887.671649] gen11_rps_irq_handler: { iir:20, events:20 }

That looks within expectations for the short EI settings. So many
interrupts are a drag, and I would be tempted to remove the bottom-half
processing.
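(For reference, the debug hook behind that log was nothing more than a
one-line print in the interrupt path; a sketch of its shape, with
approximate names -- this is not the actual diff:)

#include <linux/printk.h>
#include <linux/types.h>

/* Throwaway debug print of the kind used to produce the log above. */
static void debug_rps_irq(const char *handler, u32 iir, u32 events)
{
	pr_err("%s: { iir:%x, events:%x }\n", handler, iir, events);
}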

Oh well, I should check how many of those are translated into frequency
updates. I just wanted to check first whether, on this first try, I had
stumbled into the same loss-of-interrupts issue.
-Chris

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 21:00           ` Francisco Jerez
@ 2020-04-14 21:52             ` Chris Wilson
  -1 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-14 21:52 UTC (permalink / raw)
  To: Francisco Jerez, intel-gfx; +Cc: stable, Wilson, Chris P

Quoting Francisco Jerez (2020-04-14 22:00:25)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Francisco Jerez (2020-04-14 20:39:48)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> 
> >> > Quoting Chris Wilson (2020-04-14 17:14:23)
> >> >> Try to make RPS dramatically more responsive by shrinking the evaluation
> >> >> intervals by a factor of 100! The issue is that, as we now park the GPU
> >> >> rapidly upon idling, a short or bursty workload such as the composited
> >> >> desktop never sustains enough work to fill and complete an evaluation
> >> >> window. As such, the frequency we program remains stuck. This was first
> >> >> reported as once boosted, we never relinquished the boost [see commit
> >> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> >> >> it equally applies in the other direction for bursty workloads that
> >> >> *need* low latency, like desktop animations.
> >> >> 
> >> >> What we could try is preserve the incomplete EI history across idling,
> >> >> it is not clear whether that would be effective, nor whether the
> >> >> presumption of continuous workloads is accurate. A clearer path seems to
> >> >> treat it as symptomatic that we fail to handle bursty workloads with the
> >> >> current EI, and seek to address that by shrinking the EI so the
> >> >> evaluations are run much more often.
> >> >> 
> >> >> This will likely entail more frequent interrupts, and by the time we
> >> >> process the interrupt in the bottom half [from inside a worker], the
> >> >> workload on the GPU has changed. To address the changeable nature, in
> >> >> the previous patch we compared the previous complete EI with the
> >> >> interrupt request and only up/down clock if both agree. The impact of
> >> >> asking for, and presumably, receiving more interrupts is still to be
> >> >> determined and mitigations sought. The first idea is to differentiate
> >> >> between up/down responsiveness and make upclocking more responsive than
> >> >> downclocking. This should both help thwart jitter on bursty workloads by
> >> >> making it easier to increase than it is to decrease frequencies, and
> >> >> reduce the number of interrupts we would need to process.
> >> >
> >> > Another worry I'd like to raise, is that by reducing the EI we risk
> >> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
> >> > about borderline workloads (if that is possible) but mainly the worry is
> >> > how the HW is sampling.
> >> >
> >> > The other unmentioned unknown is the latency in reprogramming the
> >> > frequency. At what point does it start to become a significant factor?
> >> > I'm presuming the RPS evaluation itself is free, until it has to talk
> >> > across the chip to send an interrupt.
> >> > -Chris
> >> 
> >> At least on ICL the problem which this patch and 21abf0bf168d were
> >> working around seems to have to do with RPS interrupt delivery being
> >> inadvertently blocked for extended periods of time.  Looking at the GPU
> >> utilization and RPS events on a graph I could see the GPU being stuck at
> >> low frequency without any RPS interrupts firing, for a time interval
> >> orders of magnitude greater than the EI we're theoretically programming
> >> today.  IOW it seems like the real problem isn't that our EIs are too
> >> long, but that we're missing a bunch of them.
> >> 
> >> The solution I was suggesting for this on IRC during the last couple of
> >> days wouldn't have any of the drawbacks you mention above, I'll send it
> >> to this list in a moment if the general approach seems okay to you:
> >> 
> >> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86
> >
> > We were explicitly told to mask the interrupt generation at source
> > to conserve power. So I would hope for a statement as to whether that is
> > still a requirement from the HW architects; but I can't see why we would
> > not apply the mask and that this is just paper. If the observation about
> > forcewake tallies, would this not suggest that it is still conserving
> > power on icl?
> >
> 
> Yeah, it's hard to see how disabling interrupt generation could save any
> additional power in a unit which is powered off -- At least on ICL where
> even the interrupt masking register is powered off...
> 
> > I haven't looked at whether I see the same phenomenon as you [missing
> > interrupts on icl] locally, but I was expecting something like the bug
> > report since the observation that render times are less than EI was
> > causing the clocks to stay high. And I noticed your problem statement
> > and was hopeful for a link.
> >
> 
> Right.  In the workloads I was looking at last week the GPU would often
> be active for periods of time several times greater than the EI, and we
> would still fail to clock up.
> 
> > They sound like two different problems. (Underclocking for animations is
> > not icl specific.)
> >
> 
> Sure.  But it seems like the underclocking problem has been greatly
> exacerbated by 21abf0bf168d, which may have been mitigating the same ICL
> problem I was looking at leading to RPS interrupt loss.  Maybe
> 21abf0bf168d wouldn't be necessary with working RPS interrupt delivery?
> And without 21abf0bf168d platforms earlier than ICL wouldn't have as
> much of an underclocking problem either.

21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")

is necessary because we can set a boost frequency and then never run
the RPS worker due to short activity cycles. See
igt/i915_pm_rc6_residency for rc6-idle; the bg_load is essentially just

for (;;) {
	execbuf(&nop);
	sleep(.1);
}

Without 21abf0bf168d if you trigger a waitboost just before, it never
recovers and power utilisation is measurably higher.
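(A self-contained approximation of that loop, for illustration only --
this is a sketch, not the IGT source; submit_nop() is a stub standing in
for submitting an empty batch through DRM_IOCTL_I915_GEM_EXECBUFFER2:)

#include <unistd.h>

static void submit_nop(void)
{
	/* placeholder: the real background load submits an empty batch
	 * via the execbuf ioctl; stubbed out here so the skeleton compiles */
}

int main(void)
{
	for (;;) {
		submit_nop();		/* barely any GPU work per wakeup */
		usleep(100 * 1000);	/* the sleep(.1) above: ~100 ms idle */
	}
}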

> >> That said it *might* be helpful to reduce the EIs we use right now in
> >> addition, but a factor of 100 seems over the top since that will cause
> >> the evaluation interval to be roughly two orders of magnitude shorter
> >> than the rendering time of a typical frame, which can lead to massive
> >> oscillations even in workloads that use a small fraction of the GPU time
> >> to render a single frame.  Maybe we want something in between?
> >
> > Probably; as you can guess these were pulled out of nowhere based on the
> > observation that the frame lengths are much shorter than the current EI
> > and that ramping up to max clocks within a single frame of animation
> > would take about 4 samples per frame. Based on the reporter's
> > observations, we do have to ramp up very quickly for a single frame of
> > rendering in order to hit the vblank, as we are ramping down afterwards.
> >
> > With a target of 4 samples within say 1ms, 160us isn't too far off the
> > mark. (We have to allow some extra time to catch up rendering.)
> 
> 
> How about we stop ramping down after the rendering of a single frame?
> It's not like we save any power by doing that, since the GPU seems to be
> forced to the minimum frequency for as long as it remains parked anyway.
> If the GPU remains idle long enough for the RPS utilization counters to
> drop below the threshold and qualify for a ramp-down, the RPS should send
> us an interrupt, at which point we will ramp down the frequency.

Because it demonstrably and quite dramatically reduces power consumption
for very light desktop workloads.

> Unconditionally ramping down on parking seems to disturb the accuracy of
> that RPS feedback loop, which then needs to be worked around by reducing
> the averaging window of the RPS to a tiny fraction of the oscillation
> period of any typical GPU workload, which is going to prevent the RPS
> from seeing a whole oscillation period before it reacts, which is almost
> guaranteed to have a serious energy-efficiency cost.

There is no feedback loop in these workloads. There are no completed RPS
workers; they are all cancelled, if they were even scheduled. (Now we
might be tempted to look at the results if they did get scheduled and
take that into consideration instead of unconditionally downclocking.)

> > As for steady state workloads, I'm optimistic the smoothing helps. (It's
> > harder to find steady state, unthrottled workloads!)
> 
> I'm curious, how is that smoothing you do in PATCH 1 better than simply
> setting 2x the EIs? (Which would also mean half the interrupt processing
> overhead as in this series)

I'm anticipating the case where the RPS worker may not run until the next
jiffy or two, by which point the iir is stale. But yes, there's only a
subtle difference between comparing the last 320us every 160us and
comparing the last 320us every 320us.
-Chris

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 21:35         ` Chris Wilson
@ 2020-04-14 22:27           ` Francisco Jerez
  -1 siblings, 0 replies; 34+ messages in thread
From: Francisco Jerez @ 2020-04-14 22:27 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: stable


Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Francisco Jerez (2020-04-14 20:39:48)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Quoting Chris Wilson (2020-04-14 17:14:23)
>> >> Try to make RPS dramatically more responsive by shrinking the evaluation
>> >> intervals by a factor of 100! The issue is that, as we now park the GPU
>> >> rapidly upon idling, a short or bursty workload such as the composited
>> >> desktop never sustains enough work to fill and complete an evaluation
>> >> window. As such, the frequency we program remains stuck. This was first
>> >> reported as once boosted, we never relinquished the boost [see commit
>> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
>> >> it equally applies in the other direction for bursty workloads that
>> >> *need* low latency, like desktop animations.
>> >> 
>> >> What we could try is preserve the incomplete EI history across idling,
>> >> it is not clear whether that would be effective, nor whether the
>> >> presumption of continuous workloads is accurate. A clearer path seems to
>> >> treat it as symptomatic that we fail to handle bursty workloads with the
>> >> current EI, and seek to address that by shrinking the EI so the
>> >> evaluations are run much more often.
>> >> 
>> >> This will likely entail more frequent interrupts, and by the time we
>> >> process the interrupt in the bottom half [from inside a worker], the
>> >> workload on the GPU has changed. To address the changeable nature, in
>> >> the previous patch we compared the previous complete EI with the
>> >> interrupt request and only up/down clock if both agree. The impact of
>> >> asking for, and presumably, receiving more interrupts is still to be
>> >> determined and mitigations sought. The first idea is to differentiate
>> >> between up/down responsiveness and make upclocking more responsive than
>> >> downclocking. This should both help thwart jitter on bursty workloads by
>> >> making it easier to increase than it is to decrease frequencies, and
>> >> reduce the number of interrupts we would need to process.
>> >
>> > Another worry I'd like to raise, is that by reducing the EI we risk
>> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
>> > about borderline workloads (if that is possible) but mainly the worry is
>> > how the HW is sampling.
>> >
>> > The other unmentioned unknown is the latency in reprogramming the
>> > frequency. At what point does it start to become a significant factor?
>> > I'm presuming the RPS evaluation itself is free, until it has to talk
>> > across the chip to send an interrupt.
>> > -Chris
>> 
>> At least on ICL the problem which this patch and 21abf0bf168d were
>> working around seems to have to do with RPS interrupt delivery being
>> inadvertently blocked for extended periods of time.  Looking at the GPU
>> utilization and RPS events on a graph I could see the GPU being stuck at
>> low frequency without any RPS interrupts firing, for a time interval
>> orders of magnitude greater than the EI we're theoretically programming
>> today.  IOW it seems like the real problem isn't that our EIs are too
>> long, but that we're missing a bunch of them.
>
> Just stuck a pr_err() into gen11_handle_rps_events(), and moments
> before we were throttled (and so capped at 100% load), interrupts were
> being delivered:
>
> [  887.521727] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.538039] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.538253] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.538555] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.554731] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.554857] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.555604] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.571373] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.571496] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.571646] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.588199] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.588380] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.588692] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.604718] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.604937] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.621591] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.621755] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.637988] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.638166] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.638803] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.654812] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.655029] gen11_rps_irq_handler: { iir:20, events:20 }
> [  887.671423] gen11_rps_irq_handler: { iir:10, events:10 }
> [  887.671649] gen11_rps_irq_handler: { iir:20, events:20 }
>
> That looks within expectations for the short EI settings. So many
> interrupts are a drag, and I would be tempted to remove the bottom-half
> processing.
>
> Oh well, I should check how many of those are translated into frequency
> updates. I just wanted to check first whether, on this first try, I had
> stumbled into the same loss-of-interrupts issue.

The interrupt loss seems to be non-deterministic; it's not like we lose
100% of them, since there is always a chance that the GPU is awake
during the PMINTRMSK write.  It's easily noticeable anyway with most
GPU-bound workloads on ICL, particularly with the current long EIs.
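(For illustration, the kind of change this points at, assuming the usual
i915 uncore/forcewake helpers -- a sketch only, not the proposal in the
branch linked earlier, and set_pm_intrmsk() is a made-up helper name:
hold forcewake across the PMINTRMSK update so the write cannot be
dropped while the GT is powered down.)

static void set_pm_intrmsk(struct intel_rps *rps, u32 mask)
{
	struct intel_uncore *uncore = rps_to_uncore(rps);

	/* Keep the GT powered up so the register write is not lost. */
	intel_uncore_forcewake_get(uncore, FORCEWAKE_GT);
	intel_uncore_write(uncore, GEN6_PMINTRMSK, mask);
	intel_uncore_forcewake_put(uncore, FORCEWAKE_GT);
}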

> -Chris

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evaluation intervals
  2020-04-14 21:52             ` Chris Wilson
@ 2020-04-14 22:28               ` Francisco Jerez
  -1 siblings, 0 replies; 34+ messages in thread
From: Francisco Jerez @ 2020-04-14 22:28 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: stable, Wilson, Chris P


Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Francisco Jerez (2020-04-14 22:00:25)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Quoting Francisco Jerez (2020-04-14 20:39:48)
>> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> >> 
>> >> > Quoting Chris Wilson (2020-04-14 17:14:23)
>> >> >> Try to make RPS dramatically more responsive by shrinking the evaluation
>> >> >> intervals by a factor of 100! The issue is that, as we now park the GPU
>> >> >> rapidly upon idling, a short or bursty workload such as the composited
>> >> >> desktop never sustains enough work to fill and complete an evaluation
>> >> >> window. As such, the frequency we program remains stuck. This was first
>> >> >> reported as once boosted, we never relinquished the boost [see commit
>> >> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
>> >> >> it equally applies in the other direction for bursty workloads that
>> >> >> *need* low latency, like desktop animations.
>> >> >> 
>> >> >> What we could try is preserve the incomplete EI history across idling,
>> >> >> it is not clear whether that would be effective, nor whether the
>> >> >> presumption of continuous workloads is accurate. A clearer path seems to
>> >> >> treat it as symptomatic that we fail to handle bursty workloads with the
>> >> >> current EI, and seek to address that by shrinking the EI so the
>> >> >> evaluations are run much more often.
>> >> >> 
>> >> >> This will likely entail more frequent interrupts, and by the time we
>> >> >> process the interrupt in the bottom half [from inside a worker], the
>> >> >> workload on the GPU has changed. To address the changeable nature, in
>> >> >> the previous patch we compared the previous complete EI with the
>> >> >> interrupt request and only up/down clock if both agree. The impact of
>> >> >> asking for, and presumably, receiving more interrupts is still to be
>> >> >> determined and mitigations sought. The first idea is to differentiate
>> >> >> between up/down responsiveness and make upclocking more responsive than
>> >> >> downclocking. This should both help thwart jitter on bursty workloads by
>> >> >> making it easier to increase than it is to decrease frequencies, and
>> >> >> reduce the number of interrupts we would need to process.
>> >> >
>> >> > Another worry I'd like to raise, is that by reducing the EI we risk
>> >> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
>> >> > about borderline workloads (if that is possible) but mainly the worry is
>> >> > how the HW is sampling.
>> >> >
>> >> > The other unmentioned unknown is the latency in reprogramming the
>> >> > frequency. At what point does it start to become a significant factor?
>> >> > I'm presuming the RPS evaluation itself is free, until it has to talk
>> >> > across the chip to send an interrupt.
>> >> > -Chris
>> >> 
>> >> At least on ICL the problem which this patch and 21abf0bf168d were
>> >> working around seems to have to do with RPS interrupt delivery being
>> >> inadvertently blocked for extended periods of time.  Looking at the GPU
>> >> utilization and RPS events on a graph I could see the GPU being stuck at
>> >> low frequency without any RPS interrupts firing, for a time interval
>> >> orders of magnitude greater than the EI we're theoretically programming
>> >> today.  IOW it seems like the real problem isn't that our EIs are too
>> >> long, but that we're missing a bunch of them.
>> >> 
>> >> The solution I was suggesting for this on IRC during the last couple of
>> >> days wouldn't have any of the drawbacks you mention above, I'll send it
>> >> to this list in a moment if the general approach seems okay to you:
>> >> 
>> >> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86
>> >
>> > We were explicitly told to mask the interrupt generation at source
>> > to conserve power. So I would hope for a statement as to whether that is
>> > still a requirement from the HW architects; but I can't see why we would
>> > not apply the mask and that this is just paper. If the observation about
>> > forcewake tallies, would this not suggest that it is still conserving
>> > power on icl?
>> >
>> 
>> Yeah, it's hard to see how disabling interrupt generation could save any
>> additional power in a unit which is powered off -- At least on ICL where
>> even the interrupt masking register is powered off...
>> 
>> > I haven't looked at whether I see the same phenomenon as you [missing
>> > interrupts on icl] locally, but I was expecting something like the bug
>> > report since the observation that render times are less than EI was
>> > causing the clocks to stay high. And I noticed your problem statement
>> > and was hopeful for a link.
>> >
>> 
>> Right.  In the workloads I was looking at last week the GPU would often
>> be active for periods of time several times greater than the EI, and we
>> would still fail to clock up.
>> 
>> > They sound like two different problems. (Underclocking for animations is
>> > not icl specific.)
>> >
>> 
>> Sure.  But it seems like the underclocking problem has been greatly
>> exacerbated by 21abf0bf168d, which may have been mitigating the same ICL
>> problem I was looking at leading to RPS interrupt loss.  Maybe
>> 21abf0bf168d wouldn't be necessary with working RPS interrupt delivery?
>> And without 21abf0bf168d platforms earlier than ICL wouldn't have as
>> much of an underclocking problem either.
>
> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")
>
> is necessary because we can set a boost frequency and then never run
> the RPS worker due to short activity cycles. See
> igt/i915_pm_rc6_residency for rc6-idle; the bg_load is essentially just
>
> for (;;) {
> 	execbuf(&nop);
> 	sleep(.1);
> }
>
> Without 21abf0bf168d if you trigger a waitboost just before, it never
> recovers and power utilisation is measurably higher.
>

I can believe that, but can you confirm that the problem isn't a symptom
of the PMINTRMSK issue I was talking about earlier?  Assuming it isn't,
couldn't you do something like 21abf0bf168d that simply stores a
timestamp at GPU parking time and defers the actual frequency ramp-down
to GPU unpark, at which point it's performed *only* if the time spent
idle exceeds a down-clock threshold based on the current "power" mode
(e.g. something like the value of GEN6_RP_DOWN_EI minus
GEN6_RP_DOWN_THRESHOLD)?  That way the frequency ramp-down due to
parking more closely matches what the RPS would have done if it had been
active.
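(Something of this shape, say -- a sketch only: the idle_since field and
the rps_idle_threshold()/rps_drop_to_idle_freq() helpers are
hypothetical, not existing i915 interfaces, and the threshold would be
derived from the current GEN6_RP_DOWN_EI / GEN6_RP_DOWN_THRESHOLD
programming.)

static void rps_park_sketch(struct intel_rps *rps)
{
	rps->idle_since = ktime_get();	/* only remember when we parked */
}

static void rps_unpark_sketch(struct intel_rps *rps)
{
	ktime_t idle = ktime_sub(ktime_get(), rps->idle_since);

	/*
	 * Ramp down only if we stayed idle for longer than the RPS itself
	 * would have needed before signalling a down event.
	 */
	if (ktime_to_ns(idle) > rps_idle_threshold(rps))
		rps_drop_to_idle_freq(rps);
}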

>> >> That said it *might* be helpful to reduce the EIs we use right now in
>> >> addition, but a factor of 100 seems over the top since that will cause
>> >> the evaluation interval to be roughly two orders of magnitude shorter
>> >> than the rendering time of a typical frame, which can lead to massive
>> >> oscillations even in workloads that use a small fraction of the GPU time
>> >> to render a single frame.  Maybe we want something in between?
>> >
>> > Probably; as you can guess these were pulled out of nowhere based on the
>> > observation that the frame lengths are much shorter than the current EI
>> > and that ramping up to max clocks within a single frame of animation
>> > would take about 4 samples per frame. Based on the reporter's
>> > observations, we do have to ramp up very quickly for a single frame of
>> > rendering in order to hit the vblank, as we are ramping down afterwards.
>> >
>> > With a target of 4 samples within say 1ms, 160us isn't too far off the
>> > mark. (We have to allow some extra time to catch up rendering.)
>> 
>> 
>> How about we stop ramping down after the rendering of a single frame?
>> It's not like we save any power by doing that, since the GPU seems to be
>> forced to the minimum frequency for as long as it remains parked anyway.
>> If the GPU remains idle long enough for the RPS utilization counters to
>> drop below the threshold and qualify for a ramp-down, the RPS should send
>> us an interrupt, at which point we will ramp down the frequency.
>
> Because it demonstrably and quite dramatically reduces power consumption
> for very light desktop workloads.
>
>> Unconditionally ramping down on parking seems to disturb the accuracy of
>> that RPS feedback loop, which then needs to be worked around by reducing
>> the averaging window of the RPS to a tiny fraction of the oscillation
>> period of any typical GPU workload, which is going to prevent the RPS
>> from seeing a whole oscillation period before it reacts, which is almost
>> guaranteed to have a serious energy-efficiency cost.
>
> There is no feedback loop in these workloads. There are no completed RPS
> workers; they are all cancelled, if they were even scheduled. (Now we
> might be tempted to look at the results if they did get scheduled and
> take that into consideration instead of unconditionally downclocking.)
>

Processing any pending RPS work seems better IMHO than cancelling it and
unconditionally assuming a clock-down.
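(I.e. something like the following at parking time -- a sketch;
apply_parking_policy() is just a placeholder for whatever parking
behaviour remains:)

static void rps_park_flush_sketch(struct intel_rps *rps)
{
	flush_work(&rps->work);		/* let a queued up/down decision land */
	apply_parking_policy(rps);	/* placeholder for the remaining policy */
}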

>> > As for steady state workloads, I'm optimistic the smoothing helps. (It's
>> > harder to find steady state, unthrottled workloads!)
>> 
>> I'm curious, how is that smoothing you do in PATCH 1 better than simply
>> setting 2x the EIs? (Which would also mean half the interrupt processing
>> overhead as in this series)
>
> I'm anticipating the case where the RPS worker may not run until the next
> jiffy or two, by which point the iir is stale. But yes, there's only a
> subtle difference between comparing the last 320us every 160us and
> comparing the last 320us every 320us.

Sounds like the benefit from the smoothing is very slight, considering
its costs in accuracy and interrupt-processing overhead.

> -Chris

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evalution intervals
@ 2020-04-14 22:28               ` Francisco Jerez
  0 siblings, 0 replies; 34+ messages in thread
From: Francisco Jerez @ 2020-04-14 22:28 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Wilson, Chris P, stable


[-- Attachment #1.1.1: Type: text/plain, Size: 9921 bytes --]

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Francisco Jerez (2020-04-14 22:00:25)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Quoting Francisco Jerez (2020-04-14 20:39:48)
>> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> >> 
>> >> > Quoting Chris Wilson (2020-04-14 17:14:23)
>> >> >> Try to make RPS dramatically more responsive by shrinking the evaluation
>> >> >> intervales by a factor of 100! The issue is as we now park the GPU
>> >> >> rapidly upon idling, a short or bursty workload such as the composited
>> >> >> desktop never sustains enough work to fill and complete an evaluation
>> >> >> window. As such, the frequency we program remains stuck. This was first
>> >> >> reported as once boosted, we never relinquished the boost [see commit
>> >> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
>> >> >> it equally applies in the order direction for bursty workloads that
>> >> >> *need* low latency, like desktop animations.
>> >> >> 
>> >> >> What we could try is preserve the incomplete EI history across idling,
>> >> >> it is not clear whether that would be effective, nor whether the
>> >> >> presumption of continuous workloads is accurate. A clearer path seems to
>> >> >> treat it as symptomatic that we fail to handle bursty workload with the
>> >> >> current EI, and seek to address that by shrinking the EI so the
>> >> >> evaluations are run much more often.
>> >> >> 
>> >> >> This will likely entail more frequent interrupts, and by the time we
>> >> >> process the interrupt in the bottom half [from inside a worker], the
>> >> >> workload on the GPU has changed. To address the changeable nature, in
>> >> >> the previous patch we compared the previous complete EI with the
>> >> >> interrupt request and only up/down clock if both agree. The impact of
>> >> >> asking for, and presumably, receiving more interrupts is still to be
>> >> >> determined and mitigations sought. The first idea is to differentiate
>> >> >> between up/down responsivity and make upclocking more responsive than
>> >> >> downlocking. This should both help thwart jitter on bursty workloads by
>> >> >> making it easier to increase than it is to decrease frequencies, and
>> >> >> reduce the number of interrupts we would need to process.
>> >> >
>> >> > Another worry I'd like to raise, is that by reducing the EI we risk
>> >> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
>> >> > about borderline workloads (if that is possible) but mainly the worry is
>> >> > how the HW is sampling.
>> >> >
>> >> > The other unmentioned unknown is the latency in reprogramming the
>> >> > frequency. At what point does it start to become a significant factor?
>> >> > I'm presuming the RPS evaluation itself is free, until it has to talk
>> >> > across the chip to send an interrupt.
>> >> > -Chris
>> >> 
>> >> At least on ICL the problem which this patch and 21abf0bf168d were
>> >> working around seems to have to do with RPS interrupt delivery being
>> >> inadvertently blocked for extended periods of time.  Looking at the GPU
>> >> utilization and RPS events on a graph I could see the GPU being stuck at
>> >> low frequency without any RPS interrupts firing, for a time interval
>> >> orders of magnitude greater than the EI we're theoretically programming
>> >> today.  IOW it seems like the real problem isn't that our EIs are too
>> >> long, but that we're missing a bunch of them.
>> >> 
>> >> The solution I was suggesting for this on IRC during the last couple of
>> >> days wouldn't have any of the drawbacks you mention above, I'll send it
>> >> to this list in a moment if the general approach seems okay to you:
>> >> 
>> >> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86
>> >
>> > We were explicitly told to mask the interrupt generation at source
>> > to conserve power. So I would hope for a statement as to whether that is
>> > still a requirement from the HW architects; but I can't see why we would
>> > not apply the mask and that this is just paper. If the observation about
>> > forcewake tallies, would this not suggest that it is still conserving
>> > power on icl?
>> >
>> 
>> Yeah, it's hard to see how disabling interrupt generation could save any
>> additional power in a unit which is powered off -- At least on ICL where
>> even the interrupt masking register is powered off...
>> 
>> > I haven't looked at whether I see the same phenomenon as you [missing
>> > interrupts on icl] locally, but I was expecting something like the bug
>> > report since the observation that render times are less than EI was
>> > causing the clocks to stay high. And I noticed your problem statement
>> > and was hopeful for a link.
>> >
>> 
>> Right.  In the workloads I was looking at last week the GPU would often
>> be active for periods of time several times greater than the EI, and we
>> would still fail to clock up.
>> 
>> > They sound like two different problems. (Underclocking for animations is
>> > not icl specific.)
>> >
>> 
>> Sure.  But it seems like the underclocking problem has been greatly
>> exacerbated by 21abf0bf168d, which may have been mitigating the same ICL
>> problem I was looking at leading to RPS interrupt loss.  Maybe
>> 21abf0bf168d wouldn't be necessary with working RPS interrupt delivery?
>> And without 21abf0bf168d platforms earlier than ICL wouldn't have as
>> much of an underclocking problem either.
>
> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")
>
> is necessary due to that we can set a boost frequency and then never run
> the RPS worker due to short activity cycles. See
> igt/i915_pm_rc6_residency for rc6-idle, the bg_load is essentially just
>
> for (;;) {
> 	execbuf(&nop);
> 	sleep(.1);
> }
>
> Without 21abf0bf168d if you trigger a waitboost just before, it never
> recovers and power utilisation is measurably higher.
>

I can believe that, but can you confirm that the problem isn't a symptom
of the PMINTRMSK issue I was talking about earlier?  Assuming it isn't,
couldn't you do something like 21abf0bf168d which simply stores a
timestamp at GPU parking time and delays the actual frequency ramp-down
to GPU unpark, at which point it's performed *only* if the time spent
idle goes over a down-clock threshold based on the current "power" mode
(e.g. something like the value of GEN6_RP_DOWN_EI minus
GEN6_RP_DOWN_THRESHOLD) -- So the behavior of frequency rampdown due to
parking more closely matches what the RPS would have done if it had been
active.
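
As a rough sketch of that idea (all names and structure below are
invented for illustration, this is not the actual i915 park/unpark
path):

/* Illustrative only -- struct and field names are made up. */
struct parked_rps {
	unsigned long park_jiffies;		/* when the GPU last parked */
	unsigned int cur_freq, min_freq;	/* RPS frequency steps */
};

static void sketch_rps_park(struct parked_rps *rps)
{
	/* Defer the downclock decision; just remember when we parked. */
	rps->park_jiffies = jiffies;
}

static void sketch_rps_unpark(struct parked_rps *rps,
			      unsigned long downclock_delay) /* in jiffies */
{
	/*
	 * Drop the frequency only if we stayed parked for longer than the
	 * idle time after which an active RPS would have signalled a DOWN
	 * event anyway.
	 */
	if (time_after(jiffies, rps->park_jiffies + downclock_delay) &&
	    rps->cur_freq > rps->min_freq)
		rps->cur_freq--;	/* real code would reprogram the HW here */
}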

>> >> That said it *might* be helpful to reduce the EIs we use right now in
>> >> addition, but a factor of 100 seems over the top since that will cause
>> >> the evaluation interval to be roughly two orders of magnitude shorter
>> >> than the rendering time of a typical frame, which can lead to massive
>> >> oscillations even in workloads that use a small fraction of the GPU time
>> >> to render a single frame.  Maybe we want something in between?
>> >
>> > Probably; as you can guess these were pulled out of nowhere based on the
>> > observation that the frame lengths are much shorter than the current EI
>> > and that in order for us to ramp up to maxclocks in a single frame of
>> > animation would take about 4 samples per frame. Based on the reporter's
>> > observations, we do have to ramp up very quickly for single frame of
>> > rendering in order to hit the vblank, as we are ramping down afterwards.
>> >
>> > With a target of 4 samples within say 1ms, 160us isn't too far of the
>> > mark. (We have to allow some extra time to catch up rendering.)
>> 
>> 
>> How about we stop ramping down after the rendering of a single frame?
>> It's not like we save any power by doing that, since the GPU seems to be
>> forced to the minimum frequency for as long as it remains parked anyway.
>> If the GPU remains idle long enough for the RPS utilization counters to
>> drop below the threshold and qualify for a ramp-down the RPS should send
>> us an interrupt, at which point we will ramp down the frequency.
>
> Because it demonstrably and quite dramatically reduces power consumption
> for very light desktop workloads.
>
>> Unconditionally ramping down on parking seems to disturb the accuracy of
>> that RPS feedback loop, which then needs to be worked around by reducing
>> the averaging window of the RPS to a tiny fraction of the oscillation
>> period of any typical GPU workload, which is going to prevent the RPS
>> from seeing a whole oscillation period before it reacts, which is almost
>> guaranteed to have a serious energy-efficiency cost.
>
> There is no feedback loop in these workloads. There are no completed RPS
> workers, they are all cancelled if they were even scheduled. (Now we
> might be tempted to look at the results if they scheduled and take that
> into consideration instead of unconditionally downclocking.)
>

Processing any pending RPS work seems better IMHO than unconditionally
assuming it indicates a clock-down.
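
As a sketch of what that might look like at park time (work_pending()
and flush_work() are ordinary kernel APIs, but park_downclock() and the
surrounding flow are invented here purely for illustration):

static void sketch_park_with_flush(struct intel_rps *rps)
{
	if (work_pending(&rps->work)) {
		/* Let the queued evaluation run and pick the frequency. */
		flush_work(&rps->work);
	} else {
		/* No pending evidence either way; keep today's behaviour. */
		park_downclock(rps);
	}
}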

>> > As for steady state workloads, I'm optimistic the smoothing helps. (It's
>> > harder to find steady state, unthrottled workloads!)
>> 
>> I'm curious, how is that smoothing you do in PATCH 1 better than simply
>> setting 2x the EIs? (Which would also mean half the interrupt processing
>> overhead as in this series)
>
> I'm anticipating where the RPS worker may not run until the next jiffie
> or two, by which point the iir is stale. But yes, there's only a subtle
> difference between comparing the last 320us every 160us and comparing
> the last 320us every 320us.

Sounds like the benefit from smoothing is very slight, considering its
accuracy and interrupt processing overhead costs.

> -Chris

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evalution intervals
  2020-04-14 22:28               ` Francisco Jerez
@ 2020-04-14 22:38                 ` Francisco Jerez
  -1 siblings, 0 replies; 34+ messages in thread
From: Francisco Jerez @ 2020-04-14 22:38 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Wilson, Chris P, stable


Francisco Jerez <currojerez@riseup.net> writes:

> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
>> Quoting Francisco Jerez (2020-04-14 22:00:25)
>>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>>> 
>>> > Quoting Francisco Jerez (2020-04-14 20:39:48)
>>> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
>>> >> 
>>> >> > Quoting Chris Wilson (2020-04-14 17:14:23)
>>> >> >> Try to make RPS dramatically more responsive by shrinking the evaluation
>>> >> >> intervales by a factor of 100! The issue is as we now park the GPU
>>> >> >> rapidly upon idling, a short or bursty workload such as the composited
>>> >> >> desktop never sustains enough work to fill and complete an evaluation
>>> >> >> window. As such, the frequency we program remains stuck. This was first
>>> >> >> reported as once boosted, we never relinquished the boost [see commit
>>> >> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
>>> >> >> it equally applies in the order direction for bursty workloads that
>>> >> >> *need* low latency, like desktop animations.
>>> >> >> 
>>> >> >> What we could try is preserve the incomplete EI history across idling,
>>> >> >> it is not clear whether that would be effective, nor whether the
>>> >> >> presumption of continuous workloads is accurate. A clearer path seems to
>>> >> >> treat it as symptomatic that we fail to handle bursty workload with the
>>> >> >> current EI, and seek to address that by shrinking the EI so the
>>> >> >> evaluations are run much more often.
>>> >> >> 
>>> >> >> This will likely entail more frequent interrupts, and by the time we
>>> >> >> process the interrupt in the bottom half [from inside a worker], the
>>> >> >> workload on the GPU has changed. To address the changeable nature, in
>>> >> >> the previous patch we compared the previous complete EI with the
>>> >> >> interrupt request and only up/down clock if both agree. The impact of
>>> >> >> asking for, and presumably, receiving more interrupts is still to be
>>> >> >> determined and mitigations sought. The first idea is to differentiate
>>> >> >> between up/down responsivity and make upclocking more responsive than
>>> >> >> downlocking. This should both help thwart jitter on bursty workloads by
>>> >> >> making it easier to increase than it is to decrease frequencies, and
>>> >> >> reduce the number of interrupts we would need to process.
>>> >> >
>>> >> > Another worry I'd like to raise, is that by reducing the EI we risk
>>> >> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
>>> >> > about borderline workloads (if that is possible) but mainly the worry is
>>> >> > how the HW is sampling.
>>> >> >
>>> >> > The other unmentioned unknown is the latency in reprogramming the
>>> >> > frequency. At what point does it start to become a significant factor?
>>> >> > I'm presuming the RPS evaluation itself is free, until it has to talk
>>> >> > across the chip to send an interrupt.
>>> >> > -Chris
>>> >> 
>>> >> At least on ICL the problem which this patch and 21abf0bf168d were
>>> >> working around seems to have to do with RPS interrupt delivery being
>>> >> inadvertently blocked for extended periods of time.  Looking at the GPU
>>> >> utilization and RPS events on a graph I could see the GPU being stuck at
>>> >> low frequency without any RPS interrupts firing, for a time interval
>>> >> orders of magnitude greater than the EI we're theoretically programming
>>> >> today.  IOW it seems like the real problem isn't that our EIs are too
>>> >> long, but that we're missing a bunch of them.
>>> >> 
>>> >> The solution I was suggesting for this on IRC during the last couple of
>>> >> days wouldn't have any of the drawbacks you mention above, I'll send it
>>> >> to this list in a moment if the general approach seems okay to you:
>>> >> 
>>> >> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86
>>> >
>>> > We were explicitly told to mask the interrupt generation at source
>>> > to conserve power. So I would hope for a statement as to whether that is
>>> > still a requirement from the HW architects; but I can't see why we would
>>> > not apply the mask and that this is just paper. If the observation about
>>> > forcewake tallies, would this not suggest that it is still conserving
>>> > power on icl?
>>> >
>>> 
>>> Yeah, it's hard to see how disabling interrupt generation could save any
>>> additional power in a unit which is powered off -- At least on ICL where
>>> even the interrupt masking register is powered off...
>>> 
>>> > I haven't looked at whether I see the same phenomenon as you [missing
>>> > interrupts on icl] locally, but I was expecting something like the bug
>>> > report since the observation that render times are less than EI was
>>> > causing the clocks to stay high. And I noticed your problem statement
>>> > and was hopeful for a link.
>>> >
>>> 
>>> Right.  In the workloads I was looking at last week the GPU would often
>>> be active for periods of time several times greater than the EI, and we
>>> would still fail to clock up.
>>> 
>>> > They sound like two different problems. (Underclocking for animations is
>>> > not icl specific.)
>>> >
>>> 
>>> Sure.  But it seems like the underclocking problem has been greatly
>>> exacerbated by 21abf0bf168d, which may have been mitigating the same ICL
>>> problem I was looking at leading to RPS interrupt loss.  Maybe
>>> 21abf0bf168d wouldn't be necessary with working RPS interrupt delivery?
>>> And without 21abf0bf168d platforms earlier than ICL wouldn't have as
>>> much of an underclocking problem either.
>>
>> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")
>>
>> is necessary due to that we can set a boost frequency and then never run
>> the RPS worker due to short activity cycles. See
>> igt/i915_pm_rc6_residency for rc6-idle, the bg_load is essentially just
>>
>> for (;;) {
>> 	execbuf(&nop);
>> 	sleep(.1);
>> }
>>
>> Without 21abf0bf168d if you trigger a waitboost just before, it never
>> recovers and power utilisation is measurably higher.
>>
>
> I can believe that, but can you confirm that the problem isn't a symptom
> of the PMINTRMSK issue I was talking about earlier?  Assuming it isn't,
> couldn't you do something like 21abf0bf168d which simply stores a
> timestamp at GPU parking time and delays the actual frequency ramp-down
> to GPU unpark, at which point it's performed *only* if the time spent
> idle goes over a down-clock threshold based on the current "power" mode
> (e.g. something like the value of GEN6_RP_DOWN_EI minus
> GEN6_RP_DOWN_THRESHOLD) -- So the behavior of frequency rampdown due to
> parking more closely matches what the RPS would have done if it had been
> active.
>

If you want to match the behavior of the RPS even more closely, you
could use a variable delay, kind of like the one I'm using for overload
tracking in:

https://github.com/curro/linux/commit/0669ce23148eafa00e15fc44b9d07f7919802af0

However, using a fixed delay based on GEN6_RP_DOWN_THRESHOLD as I was
describing above might be sufficient to avoid the waitboost
energy-efficiency problem you were talking about.
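
For reference, one way to derive such a fixed delay from the
ei_down/threshold_down pairs programmed in rps_set_power() (a sketch
only; the helper below is not part of i915 and it ignores the 1280ns
register units):

/*
 * Shortest idle period after which an active RPS would have signalled
 * a DOWN event, assuming the GPU was ~100% busy right up to parking.
 */
static unsigned int parked_downclock_threshold_us(unsigned int ei_down_us,
						  unsigned int threshold_down_pct)
{
	return ei_down_us * (100 - threshold_down_pct) / 100;
}

With the current HIGH_POWER parameters (downclock if less than 60% busy
over 32ms) that works out to roughly 12.8ms of idleness; with the 1600us
EI proposed in this series it shrinks to about 640us.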

>>> >> That said it *might* be helpful to reduce the EIs we use right now in
>>> >> addition, but a factor of 100 seems over the top since that will cause
>>> >> the evaluation interval to be roughly two orders of magnitude shorter
>>> >> than the rendering time of a typical frame, which can lead to massive
>>> >> oscillations even in workloads that use a small fraction of the GPU time
>>> >> to render a single frame.  Maybe we want something in between?
>>> >
>>> > Probably; as you can guess these were pulled out of nowhere based on the
>>> > observation that the frame lengths are much shorter than the current EI
>>> > and that in order for us to ramp up to maxclocks in a single frame of
>>> > animation would take about 4 samples per frame. Based on the reporter's
>>> > observations, we do have to ramp up very quickly for single frame of
>>> > rendering in order to hit the vblank, as we are ramping down afterwards.
>>> >
>>> > With a target of 4 samples within say 1ms, 160us isn't too far of the
>>> > mark. (We have to allow some extra time to catch up rendering.)
>>> 
>>> 
>>> How about we stop ramping down after the rendering of a single frame?
>>> It's not like we save any power by doing that, since the GPU seems to be
>>> forced to the minimum frequency for as long as it remains parked anyway.
>>> If the GPU remains idle long enough for the RPS utilization counters to
>>> drop below the threshold and qualify for a ramp-down the RPS should send
>>> us an interrupt, at which point we will ramp down the frequency.
>>
>> Because it demonstrably and quite dramatically reduces power consumption
>> for very light desktop workloads.
>>
>>> Unconditionally ramping down on parking seems to disturb the accuracy of
>>> that RPS feedback loop, which then needs to be worked around by reducing
>>> the averaging window of the RPS to a tiny fraction of the oscillation
>>> period of any typical GPU workload, which is going to prevent the RPS
>>> from seeing a whole oscillation period before it reacts, which is almost
>>> guaranteed to have a serious energy-efficiency cost.
>>
>> There is no feedback loop in these workloads. There are no completed RPS
>> workers, they are all cancelled if they were even scheduled. (Now we
>> might be tempted to look at the results if they scheduled and take that
>> into consideration instead of unconditionally downclocking.)
>>
>
> Processing any pending RPS work seems better IMHO than unconditionally
> assuming it indicates a clock-down.
>
>>> > As for steady state workloads, I'm optimistic the smoothing helps. (It's
>>> > harder to find steady state, unthrottled workloads!)
>>> 
>>> I'm curious, how is that smoothing you do in PATCH 1 better than simply
>>> setting 2x the EIs? (Which would also mean half the interrupt processing
>>> overhead as in this series)
>>
>> I'm anticipating where the RPS worker may not run until the next jiffie
>> or two, by which point the iir is stale. But yes, there's only a subtle
>> difference between comparing the last 320us every 160us and comparing
>> the last 320us every 320us.
>
> Sounds like the benefit from smoothing is very slight, considering its
> accuracy and interrupt processing overhead costs.
>
>> -Chris
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
  2020-04-14 16:14   ` [Intel-gfx] " Chris Wilson
@ 2020-04-15  0:56 ` Patchwork
  2020-04-15  1:13 ` [Intel-gfx] ✗ Fi.CI.DOCS: " Patchwork
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 34+ messages in thread
From: Patchwork @ 2020-04-15  0:56 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
URL   : https://patchwork.freedesktop.org/series/75927/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
fa2b7bf5aa68 drm/i915/gt: Try to smooth RPS spikes
-:13: WARNING:TYPO_SPELLING: 'similary' may be misspelled - perhaps 'similarly'?
#13: 
similary 2 down EI to decrease. However, if the worker runs fast enough,

total: 0 errors, 1 warnings, 0 checks, 114 lines checked
c320d0f386a4 drm/i915/gt: Shrink the RPS evalution intervals
-:12: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")'
#12: 
21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but

total: 1 errors, 0 warnings, 0 checks, 51 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [Intel-gfx] ✗ Fi.CI.DOCS: warning for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
  2020-04-14 16:14   ` [Intel-gfx] " Chris Wilson
  2020-04-15  0:56 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes Patchwork
@ 2020-04-15  1:13 ` Patchwork
  2020-04-15  1:18 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 34+ messages in thread
From: Patchwork @ 2020-04-15  1:13 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
URL   : https://patchwork.freedesktop.org/series/75927/
State : warning

== Summary ==

$ make htmldocs 2>&1 > /dev/null | grep i915
/home/cidrm/kernel/Documentation/gpu/i915.rst:610: WARNING: duplicate label gpu/i915:layout, other instance in /home/cidrm/kernel/Documentation/gpu/i915.rst

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
                   ` (2 preceding siblings ...)
  2020-04-15  1:13 ` [Intel-gfx] ✗ Fi.CI.DOCS: " Patchwork
@ 2020-04-15  1:18 ` Patchwork
  2020-04-15 10:56 ` [Intel-gfx] [PATCH 1/2] " Andi Shyti
  2020-04-15 15:22 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for series starting with [1/2] " Patchwork
  5 siblings, 0 replies; 34+ messages in thread
From: Patchwork @ 2020-04-15  1:18 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
URL   : https://patchwork.freedesktop.org/series/75927/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_8298 -> Patchwork_17298
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/index.html


Changes
-------

  No changes found


Participating hosts (48 -> 38)
------------------------------

  Missing    (10): fi-cml-s fi-hsw-4200u fi-glk-dsi fi-byt-squawks fi-bsw-cyan fi-apl-guc fi-cfl-8109u fi-icl-y fi-byt-clapper fi-skl-6700k2 


Build changes
-------------

  * CI: CI-20190529 -> None
  * Linux: CI_DRM_8298 -> Patchwork_17298

  CI-20190529: 20190529
  CI_DRM_8298: 17f82f0c2857d0b442adbdb62eb44b61d0f5b775 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5589: 31962324ac86f029e2841e56e97c42cf9d572956 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17298: c320d0f386a455ccc184a5959634fdadeab6b5e6 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

c320d0f386a4 drm/i915/gt: Shrink the RPS evalution intervals
fa2b7bf5aa68 drm/i915/gt: Try to smooth RPS spikes

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evalution intervals
  2020-04-14 19:39       ` [Intel-gfx] " Francisco Jerez
@ 2020-04-15  7:37         ` Chris Wilson
  -1 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-15  7:37 UTC (permalink / raw)
  To: Francisco Jerez, intel-gfx; +Cc: stable

Quoting Francisco Jerez (2020-04-14 20:39:48)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Chris Wilson (2020-04-14 17:14:23)
> >> Try to make RPS dramatically more responsive by shrinking the evaluation
> >> intervales by a factor of 100! The issue is as we now park the GPU
> >> rapidly upon idling, a short or bursty workload such as the composited
> >> desktop never sustains enough work to fill and complete an evaluation
> >> window. As such, the frequency we program remains stuck. This was first
> >> reported as once boosted, we never relinquished the boost [see commit
> >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> >> it equally applies in the order direction for bursty workloads that
> >> *need* low latency, like desktop animations.
> >> 
> >> What we could try is preserve the incomplete EI history across idling,
> >> it is not clear whether that would be effective, nor whether the
> >> presumption of continuous workloads is accurate. A clearer path seems to
> >> treat it as symptomatic that we fail to handle bursty workload with the
> >> current EI, and seek to address that by shrinking the EI so the
> >> evaluations are run much more often.
> >> 
> >> This will likely entail more frequent interrupts, and by the time we
> >> process the interrupt in the bottom half [from inside a worker], the
> >> workload on the GPU has changed. To address the changeable nature, in
> >> the previous patch we compared the previous complete EI with the
> >> interrupt request and only up/down clock if both agree. The impact of
> >> asking for, and presumably, receiving more interrupts is still to be
> >> determined and mitigations sought. The first idea is to differentiate
> >> between up/down responsivity and make upclocking more responsive than
> >> downlocking. This should both help thwart jitter on bursty workloads by
> >> making it easier to increase than it is to decrease frequencies, and
> >> reduce the number of interrupts we would need to process.
> >
> > Another worry I'd like to raise, is that by reducing the EI we risk
> > unstable evaluations. I'm not sure how accurate the HW is, and I worry
> > about borderline workloads (if that is possible) but mainly the worry is
> > how the HW is sampling.
> >
> > The other unmentioned unknown is the latency in reprogramming the
> > frequency. At what point does it start to become a significant factor?
> > I'm presuming the RPS evaluation itself is free, until it has to talk
> > across the chip to send an interrupt.
> > -Chris
> 
> At least on ICL the problem which this patch and 21abf0bf168d were
> working around seems to have to do with RPS interrupt delivery being
> inadvertently blocked for extended periods of time.  Looking at the GPU
> utilization and RPS events on a graph I could see the GPU being stuck at
> low frequency without any RPS interrupts firing, for a time interval
> orders of magnitude greater than the EI we're theoretically programming
> today.  IOW it seems like the real problem isn't that our EIs are too
> long, but that we're missing a bunch of them.
> 
> The solution I was suggesting for this on IRC during the last couple of
> days wouldn't have any of the drawbacks you mention above, I'll send it
> to this list in a moment if the general approach seems okay to you:
> 
> https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86

Confirmed that the PMINTRMSK is sufficiently delayed to cause problems.
 
> That said it *might* be helpful to reduce the EIs we use right now in
> addition, but a factor of 100 seems over the top since that will cause
> the evaluation interval to be roughly two orders of magnitude shorter
> than the rendering time of a typical frame, which can lead to massive
> oscillations even in workloads that use a small fraction of the GPU time
> to render a single frame.  Maybe we want something in between?

And confirmed that both are problems :) The EIs are just too large to
handle the bursty workload. That is, with the 10+ms EI, we do not see
any interrupts for the simple animations as we park within an EI.
-Chris
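
To put rough numbers on that last point (the ~2ms burst length is an
assumption for illustration, not a measurement from this thread): with
the existing LOW_POWER up EI of 16ms, a 2ms desktop-animation burst
parks the GPU long before a single window has elapsed, so no up event
can ever be generated; with the 160us EI from this series the same
burst spans 2000/160 = ~12 complete windows at ~100% busy, giving the
hardware several chances to signal an up event before we park.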

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
                   ` (3 preceding siblings ...)
  2020-04-15  1:18 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
@ 2020-04-15 10:56 ` Andi Shyti
  2020-04-15 11:24   ` Chris Wilson
  2020-04-15 15:22 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for series starting with [1/2] " Patchwork
  5 siblings, 1 reply; 34+ messages in thread
From: Andi Shyti @ 2020-04-15 10:56 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Hi Chris,

On Tue, Apr 14, 2020 at 05:14:22PM +0100, Chris Wilson wrote:
> By the time we respond to the RPS interrupt [inside a worker], the GPU
> may be running a different workload. As we look to make the evalution
> intervals shorter, these spikes are more likely to okay. Let's try to
> smooth over the spikes in the workload by comparing the EI interrupt
> [up/down events] with the most recently completed EI; if both say up,
> then increase the clocks, if they disagree stay the same. In principle,
> this means we now take 2 up EI to go increase into the next bin, and
> similary 2 down EI to decrease. However, if the worker runs fast enough,
> the previous EI in the registers will be the same as triggered the
> interrupt, so responsiveness remains unaffect. [Under the current scheme
> where EI are on the order of 10ms, it is likely that this is true and we
> compare the interrupt with the EI that caused it.]

Looks reasonable to me. Wouldn't it also make sense to evaluate
the difference between the current and the previous pm_iir?

Reviewed-by: Andi Shyti <andi.shyti@inte.com>

Andi
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [PATCH 2/2] drm/i915/gt: Shrink the RPS evalution intervals
  2020-04-14 16:14   ` [Intel-gfx] " Chris Wilson
@ 2020-04-15 11:11     ` Andi Shyti
  -1 siblings, 0 replies; 34+ messages in thread
From: Andi Shyti @ 2020-04-15 11:11 UTC (permalink / raw)
  To: Chris Wilson
  Cc: intel-gfx, Mika Kuoppala, Lyude Paul, Francisco Jerez, stable

Hi Chris,

On Tue, Apr 14, 2020 at 05:14:23PM +0100, Chris Wilson wrote:
> Try to make RPS dramatically more responsive by shrinking the evaluation
> intervales by a factor of 100! The issue is as we now park the GPU
> rapidly upon idling, a short or bursty workload such as the composited
> desktop never sustains enough work to fill and complete an evaluation
> window. As such, the frequency we program remains stuck. This was first
> reported as once boosted, we never relinquished the boost [see commit
> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> it equally applies in the order direction for bursty workloads that
> *need* low latency, like desktop animations.
> 
> What we could try is preserve the incomplete EI history across idling,
> it is not clear whether that would be effective, nor whether the
> presumption of continuous workloads is accurate. A clearer path seems to
> treat it as symptomatic that we fail to handle bursty workload with the
> current EI, and seek to address that by shrinking the EI so the
> evaluations are run much more often.
> 
> This will likely entail more frequent interrupts, and by the time we
> process the interrupt in the bottom half [from inside a worker], the
> workload on the GPU has changed. To address the changeable nature, in
> the previous patch we compared the previous complete EI with the
> interrupt request and only up/down clock if both agree. The impact of
> asking for, and presumably, receiving more interrupts is still to be
> determined and mitigations sought. The first idea is to differentiate
> between up/down responsivity and make upclocking more responsive than
> downlocking. This should both help thwart jitter on bursty workloads by
> making it easier to increase than it is to decrease frequencies, and
> reduce the number of interrupts we would need to process.
> 
> Fixes: 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1698
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Andi Shyti <andi.shyti@intel.com>
> Cc: Lyude Paul <lyude@redhat.com>
> Cc: Francisco Jerez <currojerez@riseup.net>
> Cc: <stable@vger.kernel.org> # v5.5+
> ---
>  drivers/gpu/drm/i915/gt/intel_rps.c | 27 ++++++++++++++-------------
>  1 file changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
> index 367132092bed..47ddb25edc97 100644
> --- a/drivers/gpu/drm/i915/gt/intel_rps.c
> +++ b/drivers/gpu/drm/i915/gt/intel_rps.c
> @@ -542,37 +542,38 @@ static void rps_set_power(struct intel_rps *rps, int new_power)
>  	/* Note the units here are not exactly 1us, but 1280ns. */
>  	switch (new_power) {
>  	case LOW_POWER:
> -		/* Upclock if more than 95% busy over 16ms */
> -		ei_up = 16000;
> +		/* Upclock if more than 95% busy over 160us */
> +		ei_up = 160;
>  		threshold_up = 95;
>  
> -		/* Downclock if less than 85% busy over 32ms */
> -		ei_down = 32000;
> +		/* Downclock if less than 85% busy over 1600us */
> +		ei_down = 1600;
>  		threshold_down = 85;
>  		break;
>  
>  	case BETWEEN:
> -		/* Upclock if more than 90% busy over 13ms */
> -		ei_up = 13000;
> +		/* Upclock if more than 90% busy over 160us */
> +		ei_up = 160;
>  		threshold_up = 90;
>  
> -		/* Downclock if less than 75% busy over 32ms */
> -		ei_down = 32000;
> +		/* Downclock if less than 75% busy over 1600us */
> +		ei_down = 1600;
>  		threshold_down = 75;
>  		break;
>  
>  	case HIGH_POWER:
> -		/* Upclock if more than 85% busy over 10ms */
> -		ei_up = 10000;
> +		/* Upclock if more than 85% busy over 160us */
> +		ei_up = 160;
>  		threshold_up = 85;
>  
> -		/* Downclock if less than 60% busy over 32ms */
> -		ei_down = 32000;
> +		/* Downclock if less than 60% busy over 1600us */
> +		ei_down = 1600;

This is quite a dramatic change.

Can we have a more dynamic selection of the interval depending on
the frequency we are running, i.e. reduce the interval at low
frequencies and increase it at high frequencies?

Andi
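
A rough sketch of the suggestion above, scaling the up EI linearly with
the current position in the frequency range (the helper name, the
linear curve and the clamping are all illustrative, not i915 code; only
the 160us/16000us endpoints come from the values discussed in this
thread):

static unsigned int sketch_ei_up_us(unsigned int cur_freq,
				    unsigned int min_freq,
				    unsigned int max_freq)
{
	unsigned int pct;

	if (max_freq <= min_freq)
		return 160;

	/* Clamp so the interpolation below stays within [0, 100]. */
	if (cur_freq < min_freq)
		cur_freq = min_freq;
	if (cur_freq > max_freq)
		cur_freq = max_freq;

	/* 0..100: how far up the frequency range we are running. */
	pct = 100 * (cur_freq - min_freq) / (max_freq - min_freq);

	/*
	 * Evaluate quickly while the clocks are low, so bursty work can
	 * still ramp up before the GPU parks, and stretch the window as
	 * we approach max frequency to cut down on interrupts.
	 */
	return 160 + (16000 - 160) * pct / 100;
}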

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-15 10:56 ` [Intel-gfx] [PATCH 1/2] " Andi Shyti
@ 2020-04-15 11:24   ` Chris Wilson
  2020-04-15 11:45     ` Chris Wilson
  0 siblings, 1 reply; 34+ messages in thread
From: Chris Wilson @ 2020-04-15 11:24 UTC (permalink / raw)
  To: Andi Shyti; +Cc: intel-gfx

Quoting Andi Shyti (2020-04-15 11:56:59)
> Hi Chris,
> 
> On Tue, Apr 14, 2020 at 05:14:22PM +0100, Chris Wilson wrote:
> > By the time we respond to the RPS interrupt [inside a worker], the GPU
> > may be running a different workload. As we look to make the evalution
> > intervals shorter, these spikes are more likely to okay. Let's try to
> > smooth over the spikes in the workload by comparing the EI interrupt
> > [up/down events] with the most recently completed EI; if both say up,
> > then increase the clocks, if they disagree stay the same. In principle,
> > this means we now take 2 up EI to go increase into the next bin, and
> > similary 2 down EI to decrease. However, if the worker runs fast enough,
> > the previous EI in the registers will be the same as triggered the
> > interrupt, so responsiveness remains unaffect. [Under the current scheme
> > where EI are on the order of 10ms, it is likely that this is true and we
> > compare the interrupt with the EI that caused it.]
> 
> Looks reasonable to me. Wouldn't it also make sense to evaluate
> the difference between the current and the previous pm_iir?

I was considering it... We can't look at IIR since we apply the mask,
but ISR should be valid. Worth a looksee.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915/gt: Shrink the RPS evalution intervals
  2020-04-15  7:37         ` Chris Wilson
@ 2020-04-15 11:36           ` Chris Wilson
  -1 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-15 11:36 UTC (permalink / raw)
  To: Francisco Jerez, intel-gfx; +Cc: stable

Quoting Chris Wilson (2020-04-15 08:37:07)
> Quoting Francisco Jerez (2020-04-14 20:39:48)
> > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > 
> > > Quoting Chris Wilson (2020-04-14 17:14:23)
> > >> Try to make RPS dramatically more responsive by shrinking the evaluation
> > >> intervales by a factor of 100! The issue is as we now park the GPU
> > >> rapidly upon idling, a short or bursty workload such as the composited
> > >> desktop never sustains enough work to fill and complete an evaluation
> > >> window. As such, the frequency we program remains stuck. This was first
> > >> reported as once boosted, we never relinquished the boost [see commit
> > >> 21abf0bf168d ("drm/i915/gt: Treat idling as a RPS downclock event")] but
> > >> it equally applies in the order direction for bursty workloads that
> > >> *need* low latency, like desktop animations.
> > >> 
> > >> What we could try is preserve the incomplete EI history across idling,
> > >> it is not clear whether that would be effective, nor whether the
> > >> presumption of continuous workloads is accurate. A clearer path seems to
> > >> treat it as symptomatic that we fail to handle bursty workload with the
> > >> current EI, and seek to address that by shrinking the EI so the
> > >> evaluations are run much more often.
> > >> 
> > >> This will likely entail more frequent interrupts, and by the time we
> > >> process the interrupt in the bottom half [from inside a worker], the
> > >> workload on the GPU has changed. To address the changeable nature, in
> > >> the previous patch we compared the previous complete EI with the
> > >> interrupt request and only up/down clock if both agree. The impact of
> > >> asking for, and presumably, receiving more interrupts is still to be
> > >> determined and mitigations sought. The first idea is to differentiate
> > >> between up/down responsivity and make upclocking more responsive than
> > >> downlocking. This should both help thwart jitter on bursty workloads by
> > >> making it easier to increase than it is to decrease frequencies, and
> > >> reduce the number of interrupts we would need to process.
> > >
> > > Another worry I'd like to raise is that by reducing the EI we risk
> > > unstable evaluations. I'm not sure how accurate the HW is, and I worry
> > > about borderline workloads (if that is possible), but mainly the worry is
> > > how the HW is sampling.
> > >
> > > The other unmentioned unknown is the latency in reprogramming the
> > > frequency. At what point does it start to become a significant factor?
> > > I'm presuming the RPS evaluation itself is free, until it has to talk
> > > across the chip to send an interrupt.
> > > -Chris
> > 
> > At least on ICL the problem which this patch and 21abf0bf168d were
> > working around seems to have to do with RPS interrupt delivery being
> > inadvertently blocked for extended periods of time.  Looking at the GPU
> > utilization and RPS events on a graph I could see the GPU being stuck at
> > low frequency without any RPS interrupts firing, for a time interval
> > orders of magnitude greater than the EI we're theoretically programming
> > today.  IOW it seems like the real problem isn't that our EIs are too
> > long, but that we're missing a bunch of them.
> > 
> > The solution I was suggesting for this on IRC during the last couple of
> > days wouldn't have any of the drawbacks you mention above, I'll send it
> > to this list in a moment if the general approach seems okay to you:
> > 
> > https://github.com/curro/linux/commit/f7bc31402aa727a52d957e62d985c6dae6be4b86
> 
> Confirmed that the PMINTRMSK write is sufficiently delayed to cause problems.

[   68.462428] rcs0: UP interrupt not recorded for spinner, pm_iir:0, prev_up:2ee0, up_threshold:2c88, up_ei:2ee0
Have selftest \o/
FW (forcewake) fixes the selftest.
-Chris
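
The debug line above suggests a check of roughly this shape; the following is
only a sketch (check_up_interrupt() is a hypothetical name, not the actual
selftest): compare the previous-EI busyness sample against the programmed
threshold and flag the case where no UP event was latched in pm_iir.

static int check_up_interrupt(struct intel_rps *rps, u32 pm_iir)
{
	struct intel_uncore *uncore = rps_to_uncore(rps);
	u32 sample, threshold;

	sample = intel_uncore_read(uncore, GEN6_RP_PREV_UP) & GEN6_CURBSYTAVG_MASK;
	threshold = intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD);

	/* A sample above the threshold should have latched an UP event. */
	if (sample > threshold && !(pm_iir & GEN6_PM_RP_UP_THRESHOLD)) {
		pr_err("UP interrupt not recorded, pm_iir:%x, prev_up:%x, up_threshold:%x\n",
		       pm_iir, sample, threshold);
		return -EINVAL;
	}

	return 0;
}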

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-15 11:24   ` Chris Wilson
@ 2020-04-15 11:45     ` Chris Wilson
  0 siblings, 0 replies; 34+ messages in thread
From: Chris Wilson @ 2020-04-15 11:45 UTC (permalink / raw)
  To: Andi Shyti; +Cc: intel-gfx

Quoting Chris Wilson (2020-04-15 12:24:59)
> Quoting Andi Shyti (2020-04-15 11:56:59)
> > Hi Chris,
> > 
> > On Tue, Apr 14, 2020 at 05:14:22PM +0100, Chris Wilson wrote:
> > > By the time we respond to the RPS interrupt [inside a worker], the GPU
> > > may be running a different workload. As we look to make the evaluation
> > > intervals shorter, these spikes are more likely to occur. Let's try to
> > > smooth over the spikes in the workload by comparing the EI interrupt
> > > [up/down events] with the most recently completed EI; if both say up,
> > > then increase the clocks; if they disagree, stay the same. In principle,
> > > this means we now take 2 up EIs to increase into the next bin, and
> > > similarly 2 down EIs to decrease. However, if the worker runs fast enough,
> > > the previous EI in the registers will be the same as the one that
> > > triggered the interrupt, so responsiveness remains unaffected. [Under the
> > > current scheme, where EIs are on the order of 10ms, it is likely that this
> > > is true and we compare the interrupt with the EI that caused it.]
> > 
> > Looks reasonable to me. Wouldn't it also make sense to evaluate
> > the difference between the current and the previous pm_iir?
> 
> I was considering it... We can't look at IIR since we apply the mask,
> but ISR should be valid. Worth a looksee.

[   98.882025] bcs0: DOWN interrupt not recorded for idle, pm_iir:10, pm_isr:0, prev_down:0, down_threshold:3840, down_ei:5dc0

ISR doesn't seem helpful on first try.
-Chris
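
For reference, a sketch of how such a diagnostic line might be assembled,
sampling the raw status register next to the masked IIR value. The helper name
rps_dump_missing_event() is hypothetical, and the assumption is that
GEN6_PMISR reflects the event status before PMINTRMSK masking.

static void rps_dump_missing_event(struct intel_rps *rps, u32 pm_iir)
{
	struct intel_uncore *uncore = rps_to_uncore(rps);

	/* IIR is filtered by the interrupt mask, so also sample the raw ISR. */
	pr_info("pm_iir:%x, pm_isr:%x, prev_down:%x, down_threshold:%x\n",
		pm_iir,
		intel_uncore_read(uncore, GEN6_PMISR),
		intel_uncore_read(uncore, GEN6_RP_PREV_DOWN) & GEN6_CURBSYTAVG_MASK,
		intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD));
}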

^ permalink raw reply	[flat|nested] 34+ messages in thread

* [Intel-gfx] ✗ Fi.CI.IGT: failure for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
  2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
                   ` (4 preceding siblings ...)
  2020-04-15 10:56 ` [Intel-gfx] [PATCH 1/2] " Andi Shyti
@ 2020-04-15 15:22 ` Patchwork
  5 siblings, 0 replies; 34+ messages in thread
From: Patchwork @ 2020-04-15 15:22 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes
URL   : https://patchwork.freedesktop.org/series/75927/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_8298_full -> Patchwork_17298_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_17298_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_17298_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_17298_full:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_pm_rps@min-max-config-loaded:
    - shard-snb:          [PASS][1] -> [FAIL][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-snb4/igt@i915_pm_rps@min-max-config-loaded.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-snb2/igt@i915_pm_rps@min-max-config-loaded.html
    - shard-hsw:          [PASS][3] -> [FAIL][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-hsw6/igt@i915_pm_rps@min-max-config-loaded.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-hsw4/igt@i915_pm_rps@min-max-config-loaded.html

  
New tests
---------

  New tests have been introduced between CI_DRM_8298_full and Patchwork_17298_full:

### New IGT tests (27) ###

  * igt@kms_plane_cursor@pipe-a-overlay-size-128:
    - Statuses : 7 pass(s)
    - Exec time: [1.67, 3.71] s

  * igt@kms_plane_cursor@pipe-a-overlay-size-256:
    - Statuses : 8 pass(s)
    - Exec time: [1.64, 3.65] s

  * igt@kms_plane_cursor@pipe-a-overlay-size-64:
    - Statuses : 7 pass(s)
    - Exec time: [1.65, 3.68] s

  * igt@kms_plane_cursor@pipe-a-primary-size-128:
    - Statuses : 8 pass(s)
    - Exec time: [1.66, 3.46] s

  * igt@kms_plane_cursor@pipe-a-primary-size-256:
    - Statuses : 8 pass(s)
    - Exec time: [1.64, 3.45] s

  * igt@kms_plane_cursor@pipe-a-primary-size-64:
    - Statuses : 8 pass(s)
    - Exec time: [1.63, 3.41] s

  * igt@kms_plane_cursor@pipe-a-viewport-size-128:
    - Statuses : 8 pass(s)
    - Exec time: [1.65, 3.64] s

  * igt@kms_plane_cursor@pipe-a-viewport-size-256:
    - Statuses : 8 pass(s)
    - Exec time: [1.64, 3.67] s

  * igt@kms_plane_cursor@pipe-a-viewport-size-64:
    - Statuses : 8 pass(s)
    - Exec time: [1.64, 3.69] s

  * igt@kms_plane_cursor@pipe-b-overlay-size-128:
    - Statuses : 8 pass(s)
    - Exec time: [2.22, 4.77] s

  * igt@kms_plane_cursor@pipe-b-overlay-size-256:
    - Statuses : 8 pass(s)
    - Exec time: [2.22, 4.81] s

  * igt@kms_plane_cursor@pipe-b-overlay-size-64:
    - Statuses : 8 pass(s)
    - Exec time: [2.21, 4.84] s

  * igt@kms_plane_cursor@pipe-b-primary-size-128:
    - Statuses : 8 pass(s)
    - Exec time: [2.20, 4.68] s

  * igt@kms_plane_cursor@pipe-b-primary-size-256:
    - Statuses : 8 pass(s)
    - Exec time: [2.21, 4.54] s

  * igt@kms_plane_cursor@pipe-b-primary-size-64:
    - Statuses : 8 pass(s)
    - Exec time: [2.20, 4.67] s

  * igt@kms_plane_cursor@pipe-b-viewport-size-128:
    - Statuses :
    - Exec time: [None] s

  * igt@kms_plane_cursor@pipe-b-viewport-size-256:
    - Statuses : 8 pass(s)
    - Exec time: [2.21, 4.86] s

  * igt@kms_plane_cursor@pipe-b-viewport-size-64:
    - Statuses : 8 pass(s)
    - Exec time: [2.27, 4.85] s

  * igt@kms_plane_cursor@pipe-c-overlay-size-128:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.86] s

  * igt@kms_plane_cursor@pipe-c-overlay-size-256:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.88] s

  * igt@kms_plane_cursor@pipe-c-overlay-size-64:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.84] s

  * igt@kms_plane_cursor@pipe-c-primary-size-128:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.67] s

  * igt@kms_plane_cursor@pipe-c-primary-size-256:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.58] s

  * igt@kms_plane_cursor@pipe-c-primary-size-64:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.71] s

  * igt@kms_plane_cursor@pipe-c-viewport-size-128:
    - Statuses : 6 pass(s) 1 skip(s)
    - Exec time: [0.0, 5.06] s

  * igt@kms_plane_cursor@pipe-c-viewport-size-256:
    - Statuses : 6 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.89] s

  * igt@kms_plane_cursor@pipe-c-viewport-size-64:
    - Statuses : 7 pass(s) 1 skip(s)
    - Exec time: [0.0, 4.91] s

  

Known issues
------------

  Here are the changes found in Patchwork_17298_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_softpin@noreloc-s3:
    - shard-apl:          [PASS][5] -> [DMESG-WARN][6] ([i915#180]) +1 similar issue
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-apl4/igt@gem_softpin@noreloc-s3.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-apl2/igt@gem_softpin@noreloc-s3.html

  * igt@gem_workarounds@suspend-resume-fd:
    - shard-kbl:          [PASS][7] -> [DMESG-WARN][8] ([i915#180]) +5 similar issues
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-kbl1/igt@gem_workarounds@suspend-resume-fd.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-kbl6/igt@gem_workarounds@suspend-resume-fd.html

  * igt@kms_plane_alpha_blend@pipe-b-coverage-7efc:
    - shard-skl:          [PASS][9] -> [FAIL][10] ([fdo#108145] / [i915#265])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-skl3/igt@kms_plane_alpha_blend@pipe-b-coverage-7efc.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-skl2/igt@kms_plane_alpha_blend@pipe-b-coverage-7efc.html

  * igt@kms_psr@psr2_primary_page_flip:
    - shard-iclb:         [PASS][11] -> [SKIP][12] ([fdo#109441])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-iclb2/igt@kms_psr@psr2_primary_page_flip.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-iclb8/igt@kms_psr@psr2_primary_page_flip.html

  * igt@kms_setmode@basic:
    - shard-skl:          [PASS][13] -> [FAIL][14] ([i915#31])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-skl7/igt@kms_setmode@basic.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-skl10/igt@kms_setmode@basic.html

  * igt@kms_vblank@pipe-a-ts-continuation-dpms-suspend:
    - shard-iclb:         [PASS][15] -> [INCOMPLETE][16] ([i915#1078] / [i915#1185])
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-iclb1/igt@kms_vblank@pipe-a-ts-continuation-dpms-suspend.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-iclb3/igt@kms_vblank@pipe-a-ts-continuation-dpms-suspend.html

  
#### Possible fixes ####

  * {igt@gem_wait@write-wait@all}:
    - shard-skl:          [FAIL][17] ([i915#1676]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-skl2/igt@gem_wait@write-wait@all.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-skl10/igt@gem_wait@write-wait@all.html

  * igt@i915_suspend@fence-restore-tiled2untiled:
    - shard-kbl:          [DMESG-WARN][19] ([i915#180]) -> [PASS][20]
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-kbl4/igt@i915_suspend@fence-restore-tiled2untiled.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-kbl1/igt@i915_suspend@fence-restore-tiled2untiled.html

  * igt@kms_cursor_legacy@2x-long-flip-vs-cursor-legacy:
    - shard-glk:          [FAIL][21] ([i915#72]) -> [PASS][22]
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-glk6/igt@kms_cursor_legacy@2x-long-flip-vs-cursor-legacy.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-glk8/igt@kms_cursor_legacy@2x-long-flip-vs-cursor-legacy.html

  * igt@kms_draw_crc@draw-method-rgb565-mmap-cpu-untiled:
    - shard-glk:          [FAIL][23] ([i915#52] / [i915#54]) -> [PASS][24]
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-glk2/igt@kms_draw_crc@draw-method-rgb565-mmap-cpu-untiled.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-glk5/igt@kms_draw_crc@draw-method-rgb565-mmap-cpu-untiled.html

  * igt@kms_flip@flip-vs-expired-vblank:
    - shard-skl:          [FAIL][25] ([i915#79]) -> [PASS][26]
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-skl8/igt@kms_flip@flip-vs-expired-vblank.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-skl4/igt@kms_flip@flip-vs-expired-vblank.html

  * igt@kms_hdr@bpc-switch-dpms:
    - shard-skl:          [FAIL][27] ([i915#1188]) -> [PASS][28]
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-skl10/igt@kms_hdr@bpc-switch-dpms.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-skl1/igt@kms_hdr@bpc-switch-dpms.html

  * igt@kms_psr@psr2_sprite_mmap_gtt:
    - shard-iclb:         [SKIP][29] ([fdo#109441]) -> [PASS][30] +3 similar issues
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-iclb3/igt@kms_psr@psr2_sprite_mmap_gtt.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-iclb2/igt@kms_psr@psr2_sprite_mmap_gtt.html

  * igt@kms_vblank@pipe-b-ts-continuation-suspend:
    - shard-apl:          [DMESG-WARN][31] ([i915#180]) -> [PASS][32] +3 similar issues
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-apl4/igt@kms_vblank@pipe-b-ts-continuation-suspend.html
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-apl6/igt@kms_vblank@pipe-b-ts-continuation-suspend.html

  * {igt@perf@blocking-parameterized}:
    - shard-hsw:          [FAIL][33] ([i915#1542]) -> [PASS][34]
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-hsw6/igt@perf@blocking-parameterized.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-hsw4/igt@perf@blocking-parameterized.html

  * {igt@sysfs_heartbeat_interval@mixed@vcs0}:
    - shard-skl:          [INCOMPLETE][35] ([i915#198]) -> [PASS][36]
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-skl7/igt@sysfs_heartbeat_interval@mixed@vcs0.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-skl4/igt@sysfs_heartbeat_interval@mixed@vcs0.html

  
#### Warnings ####

  * igt@i915_pm_dc@dc3co-vpb-simulation:
    - shard-iclb:         [SKIP][37] ([i915#658]) -> [SKIP][38] ([i915#588])
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-iclb4/igt@i915_pm_dc@dc3co-vpb-simulation.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-iclb2/igt@i915_pm_dc@dc3co-vpb-simulation.html

  * igt@i915_pm_rpm@sysfs-read:
    - shard-hsw:          [SKIP][39] ([fdo#109271]) -> [INCOMPLETE][40] ([i915#151] / [i915#61])
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-hsw7/igt@i915_pm_rpm@sysfs-read.html
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-hsw8/igt@i915_pm_rpm@sysfs-read.html

  * igt@kms_fbcon_fbt@fbc-suspend:
    - shard-kbl:          [DMESG-FAIL][41] ([i915#180] / [i915#95]) -> [FAIL][42] ([i915#93] / [i915#95])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-kbl6/igt@kms_fbcon_fbt@fbc-suspend.html
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-kbl2/igt@kms_fbcon_fbt@fbc-suspend.html

  * igt@kms_plane_alpha_blend@pipe-c-alpha-7efc:
    - shard-kbl:          [FAIL][43] ([fdo#108145] / [i915#265] / [i915#93] / [i915#95]) -> [FAIL][44] ([fdo#108145] / [i915#265])
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_8298/shard-kbl2/igt@kms_plane_alpha_blend@pipe-c-alpha-7efc.html
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/shard-kbl6/igt@kms_plane_alpha_blend@pipe-c-alpha-7efc.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109441]: https://bugs.freedesktop.org/show_bug.cgi?id=109441
  [i915#1078]: https://gitlab.freedesktop.org/drm/intel/issues/1078
  [i915#1185]: https://gitlab.freedesktop.org/drm/intel/issues/1185
  [i915#1188]: https://gitlab.freedesktop.org/drm/intel/issues/1188
  [i915#151]: https://gitlab.freedesktop.org/drm/intel/issues/151
  [i915#1542]: https://gitlab.freedesktop.org/drm/intel/issues/1542
  [i915#1676]: https://gitlab.freedesktop.org/drm/intel/issues/1676
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#198]: https://gitlab.freedesktop.org/drm/intel/issues/198
  [i915#265]: https://gitlab.freedesktop.org/drm/intel/issues/265
  [i915#31]: https://gitlab.freedesktop.org/drm/intel/issues/31
  [i915#52]: https://gitlab.freedesktop.org/drm/intel/issues/52
  [i915#54]: https://gitlab.freedesktop.org/drm/intel/issues/54
  [i915#588]: https://gitlab.freedesktop.org/drm/intel/issues/588
  [i915#61]: https://gitlab.freedesktop.org/drm/intel/issues/61
  [i915#658]: https://gitlab.freedesktop.org/drm/intel/issues/658
  [i915#72]: https://gitlab.freedesktop.org/drm/intel/issues/72
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79
  [i915#93]: https://gitlab.freedesktop.org/drm/intel/issues/93
  [i915#95]: https://gitlab.freedesktop.org/drm/intel/issues/95


Participating hosts (10 -> 10)
------------------------------

  No changes in participating hosts


Build changes
-------------

  * CI: CI-20190529 -> None
  * Linux: CI_DRM_8298 -> Patchwork_17298

  CI-20190529: 20190529
  CI_DRM_8298: 17f82f0c2857d0b442adbdb62eb44b61d0f5b775 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5589: 31962324ac86f029e2841e56e97c42cf9d572956 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_17298: c320d0f386a455ccc184a5959634fdadeab6b5e6 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_17298/index.html

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2020-04-15 15:22 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-04-14 16:14 [Intel-gfx] [PATCH 1/2] drm/i915/gt: Try to smooth RPS spikes Chris Wilson
2020-04-14 16:14 ` [PATCH 2/2] drm/i915/gt: Shrink the RPS evalution intervals Chris Wilson
2020-04-14 16:14   ` [Intel-gfx] " Chris Wilson
2020-04-14 16:35   ` Chris Wilson
2020-04-14 16:35     ` [Intel-gfx] " Chris Wilson
2020-04-14 19:39     ` Francisco Jerez
2020-04-14 19:39       ` [Intel-gfx] " Francisco Jerez
2020-04-14 20:13       ` Chris Wilson
2020-04-14 20:13         ` Chris Wilson
2020-04-14 21:00         ` Francisco Jerez
2020-04-14 21:00           ` Francisco Jerez
2020-04-14 21:52           ` Chris Wilson
2020-04-14 21:52             ` Chris Wilson
2020-04-14 22:28             ` Francisco Jerez
2020-04-14 22:28               ` Francisco Jerez
2020-04-14 22:38               ` Francisco Jerez
2020-04-14 22:38                 ` Francisco Jerez
2020-04-14 21:35       ` Chris Wilson
2020-04-14 21:35         ` Chris Wilson
2020-04-14 22:27         ` Francisco Jerez
2020-04-14 22:27           ` Francisco Jerez
2020-04-15  7:37       ` Chris Wilson
2020-04-15  7:37         ` Chris Wilson
2020-04-15 11:36         ` Chris Wilson
2020-04-15 11:36           ` Chris Wilson
2020-04-15 11:11   ` Andi Shyti
2020-04-15 11:11     ` [Intel-gfx] " Andi Shyti
2020-04-15  0:56 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/2] drm/i915/gt: Try to smooth RPS spikes Patchwork
2020-04-15  1:13 ` [Intel-gfx] ✗ Fi.CI.DOCS: " Patchwork
2020-04-15  1:18 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2020-04-15 10:56 ` [Intel-gfx] [PATCH 1/2] " Andi Shyti
2020-04-15 11:24   ` Chris Wilson
2020-04-15 11:45     ` Chris Wilson
2020-04-15 15:22 ` [Intel-gfx] ✗ Fi.CI.IGT: failure for series starting with [1/2] " Patchwork
