* [PATCH 1/3] drm/i915: Fix __wait_seqno to use true infinite timeouts
@ 2013-09-25 16:34 Chris Wilson
  2013-09-25 16:34 ` [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls Chris Wilson
  2013-09-25 16:34 ` [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock Chris Wilson
  0 siblings, 2 replies; 15+ messages in thread
From: Chris Wilson @ 2013-09-25 16:34 UTC (permalink / raw)
  To: intel-gfx

When we switched to always using a timeout in conjunction with
wait_seqno, we lost the ability to detect missed interrupts. Since we
have had issues with interrupts on a number of generations, and they
are required to be delivered in a timely fashion for a smooth UX, it is
important that we both log errors found in the wild and prevent the
display from stalling for upwards of 1s every time the seqno interrupt
is missed.

Rather than continue to fix up the timeouts to work around the
impedance mismatch of the wait_event_*() interface, open code the
combination of wait_event[_interruptible][_timeout], and use the
exposed timer to poll for the seqno should we detect a lost interrupt.
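
Roughly, the open-coded wait in this patch has the following shape (a
condensed sketch of the loop below, with the reset-counter/-EAGAIN
handling, tracing and the remaining-timeout bookkeeping trimmed):

    DEFINE_WAIT(wait);

    for (;;) {
            struct timer_list timer;

            prepare_to_wait(&ring->irq_queue, &wait,
                            interruptible ? TASK_INTERRUPTIBLE :
                                            TASK_UNINTERRUPTIBLE);

            if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
                    ret = 0;                /* seqno completed */
                    break;
            }
            if (interruptible && signal_pending(current)) {
                    ret = -ERESTARTSYS;     /* interrupted by a signal */
                    break;
            }
            if (timeout_jiffies <= 0) {
                    ret = -ETIME;           /* caller's timeout expired */
                    break;
            }

            /* Arm a backup timer so that a missed interrupt costs us
             * roughly a jiffy of polling rather than an unbounded stall.
             */
            timer.function = NULL;
            if (timeout || missed_irq(dev_priv, ring)) {
                    setup_timer_on_stack(&timer, fake_irq,
                                         (unsigned long)current);
                    mod_timer(&timer, jiffies +
                              (missed_irq(dev_priv, ring) ?
                               1 : timeout_jiffies));
            }

            schedule();     /* woken by the irq, a signal or the timer */

            if (timer.function) {
                    del_singleshot_timer_sync(&timer);
                    destroy_timer_on_stack(&timer);
            }
    }
    finish_wait(&ring->irq_queue, &wait);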

v2: In order to reconcile the debug requirement of logging missed
interrupts with the real world requirement of making machines work even
if interrupts are hosed, we revert to polling after detecting a missed
interrupt.

v3: Throw in a debugfs interface to simulate broken hw not reporting
interrupts.

v4: s/EGAIN/EAGAIN/ (Imre)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Imre Deak <imre.deak@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   |  68 ++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h       |   6 ++
 drivers/gpu/drm/i915/i915_gem.c       | 114 ++++++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_gpu_error.c |   1 +
 drivers/gpu/drm/i915/i915_irq.c       |  11 ++--
 5 files changed, 149 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fcfa9884..0c339f0 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1897,6 +1897,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
 			i915_ring_stop_get, i915_ring_stop_set,
 			"0x%08llx\n");
 
+static int
+i915_ring_missed_irq_get(void *data, u64 *val)
+{
+	struct drm_device *dev = data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+
+	*val = dev_priv->gpu_error.missed_irq_rings;
+	return 0;
+}
+
+static int
+i915_ring_missed_irq_set(void *data, u64 val)
+{
+	struct drm_device *dev = data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	/* Lock against concurrent debugfs callers */
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret)
+		return ret;
+	dev_priv->gpu_error.missed_irq_rings = val;
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
+			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
+			"0x%08llx\n");
+
+static int
+i915_ring_test_irq_get(void *data, u64 *val)
+{
+	struct drm_device *dev = data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+
+	*val = dev_priv->gpu_error.test_irq_rings;
+
+	return 0;
+}
+
+static int
+i915_ring_test_irq_set(void *data, u64 val)
+{
+	struct drm_device *dev = data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
+
+	/* Lock against concurrent debugfs callers */
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret)
+		return ret;
+
+	dev_priv->gpu_error.test_irq_rings = val;
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
+			i915_ring_test_irq_get, i915_ring_test_irq_set,
+			"0x%08llx\n");
+
 #define DROP_UNBOUND 0x1
 #define DROP_BOUND 0x2
 #define DROP_RETIRE 0x4
@@ -2290,6 +2356,8 @@ static struct i915_debugfs_files {
 	{"i915_min_freq", &i915_min_freq_fops},
 	{"i915_cache_sharing", &i915_cache_sharing_fops},
 	{"i915_ring_stop", &i915_ring_stop_fops},
+	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
+	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
 	{"i915_error_state", &i915_error_state_fops},
 	{"i915_next_seqno", &i915_next_seqno_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e8ffd57..906f4fb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1022,6 +1022,9 @@ struct i915_gpu_error {
 	struct drm_i915_error_state *first_error;
 	struct work_struct work;
 
+
+	unsigned long missed_irq_rings;
+
 	/**
 	 * State variable and reset counter controlling the reset flow
 	 *
@@ -1060,6 +1063,9 @@ struct i915_gpu_error {
 
 	/* For gpu hang simulation. */
 	unsigned int stop_rings;
+
+	/* For missed irq/seqno simulation. */
+	unsigned int test_irq_rings;
 };
 
 enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fa3b373..40e1293 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -971,6 +971,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
 	return ret;
 }
 
+static void fake_irq(unsigned long data)
+{
+	wake_up_process((struct task_struct *)data);
+}
+
+static bool missed_irq(struct drm_i915_private *dev_priv,
+		       struct intel_ring_buffer *ring)
+{
+	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
+}
+
 /**
  * __wait_seqno - wait until execution of seqno has finished
  * @ring: the ring expected to report seqno
@@ -994,10 +1005,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 			bool interruptible, struct timespec *timeout)
 {
 	drm_i915_private_t *dev_priv = ring->dev->dev_private;
-	struct timespec before, now, wait_time={1,0};
-	unsigned long timeout_jiffies;
-	long end;
-	bool wait_forever = true;
+	struct timespec before, now;
+	DEFINE_WAIT(wait);
+	long timeout_jiffies;
 	int ret;
 
 	WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
@@ -1005,51 +1015,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
 		return 0;
 
-	trace_i915_gem_request_wait_begin(ring, seqno);
-
-	if (timeout != NULL) {
-		wait_time = *timeout;
-		wait_forever = false;
-	}
-
-	timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
+	timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
 
-	if (WARN_ON(!ring->irq_get(ring)))
+	if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
+	    WARN_ON(!ring->irq_get(ring)))
 		return -ENODEV;
 
-	/* Record current time in case interrupted by signal, or wedged * */
+	/* Record current time in case interrupted by signal, or wedged */
+	trace_i915_gem_request_wait_begin(ring, seqno);
 	getrawmonotonic(&before);
+	for (;;) {
+		struct timer_list timer;
+		unsigned long expire;
 
-#define EXIT_COND \
-	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
-	 i915_reset_in_progress(&dev_priv->gpu_error) || \
-	 reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
-	do {
-		if (interruptible)
-			end = wait_event_interruptible_timeout(ring->irq_queue,
-							       EXIT_COND,
-							       timeout_jiffies);
-		else
-			end = wait_event_timeout(ring->irq_queue, EXIT_COND,
-						 timeout_jiffies);
+		prepare_to_wait(&ring->irq_queue, &wait,
+				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
-		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
-			end = -EAGAIN;
+		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
+			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
+			 * is truely gone. */
+			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+			if (ret == 0)
+				ret = -EAGAIN;
+			break;
+		}
 
-		/* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
-		 * gone. */
-		ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-		if (ret)
-			end = ret;
-	} while (end == 0 && wait_forever);
+		if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
+			ret = 0;
+			break;
+		}
+
+		if (interruptible && signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (timeout_jiffies <= 0) {
+			ret = -ETIME;
+			break;
+		}
 
+		timer.function = NULL;
+		if (timeout || missed_irq(dev_priv, ring)) {
+			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
+			expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
+			mod_timer(&timer, expire);
+		}
+
+		schedule();
+
+		if (timeout)
+			timeout_jiffies = expire - jiffies;
+
+		if (timer.function) {
+			del_singleshot_timer_sync(&timer);
+			destroy_timer_on_stack(&timer);
+		}
+	}
 	getrawmonotonic(&now);
+	trace_i915_gem_request_wait_end(ring, seqno);
 
 	ring->irq_put(ring);
-	trace_i915_gem_request_wait_end(ring, seqno);
-#undef EXIT_COND
+
+	finish_wait(&ring->irq_queue, &wait);
 
 	if (timeout) {
 		struct timespec sleep_time = timespec_sub(now, before);
@@ -1058,17 +1088,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 			set_normalized_timespec(timeout, 0, 0);
 	}
 
-	switch (end) {
-	case -EIO:
-	case -EAGAIN: /* Wedged */
-	case -ERESTARTSYS: /* Signal */
-		return (int)end;
-	case 0: /* Timeout */
-		return -ETIME;
-	default: /* Completed */
-		WARN_ON(end < 0); /* We're not aware of other errors */
-		return 0;
-	}
+	return ret;
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 0a49b65..da1022a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -311,6 +311,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
 	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
 	err_printf(m, "CCID: 0x%08x\n", error->ccid);
+	err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
 
 	for (i = 0; i < dev_priv->num_fence_regs; i++)
 		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 84b7efc..05c05a6 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2039,10 +2039,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
 
 				if (waitqueue_active(&ring->irq_queue)) {
 					/* Issue a wake-up to catch stuck h/w. */
-					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-						  ring->name);
-					wake_up_all(&ring->irq_queue);
-					ring->hangcheck.score += HUNG;
+					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
+						DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+							  ring->name);
+						wake_up_all(&ring->irq_queue);
+					}
+					/* Safeguard against driver failure */
+					ring->hangcheck.score += BUSY;
 				} else
 					busy = false;
 			} else {
-- 
1.8.4.rc3


* [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-09-25 16:34 [PATCH 1/3] drm/i915: Fix __wait_seqno to use true infinite timeouts Chris Wilson
@ 2013-09-25 16:34 ` Chris Wilson
  2013-10-01 21:54   ` Jesse Barnes
  2013-09-25 16:34 ` [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock Chris Wilson
  1 sibling, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2013-09-25 16:34 UTC (permalink / raw)
  To: intel-gfx; +Cc: Owen Taylor, Stéphane Marchesin, Zhuang, Lena

If we encounter a situation where the CPU blocks waiting for results
from the GPU, give the GPU a kick to boost its frequency.

This should work to reduce user interface stalls and to quickly promote
mesa to high frequencies - but the cost is that our requested frequency
stalls high (as we do not idle for long enough before rc6 to start
reducing frequencies, nor are we aggressive at downclocking an
underused GPU). However, this should be mitigated by rc6 itself
powering off the GPU when idle, and by the fact that energy use depends
upon the workload of the GPU in addition to its frequency (e.g. the
math or sampler functions only consume power when used). Still, this is
likely to adversely affect light workloads.

In particular, this nearly eliminates the highly noticeable wake-up lag
in animations from idle, for example expose or workspace transitions.
(However, given the situation where we fail to downclock, our requested
frequency is almost always the maximum, except for Baytrail where we
manually downclock upon idling. This often masks the latency of
upclocking after being idle, so animations are typically smooth - at the
cost of increased power consumption.)

Stéphane raised the concern that this will punish good applications and
reward bad applications - but due to the nature of how mesa performs
its client throttling, I believe all mesa applications will be roughly
equally affected. To address this concern, and to prevent applications
like compositors from permanently boosting the RPS state, we ratelimit
the frequency of the wait-boosts each client receives.
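
The ratelimit in this version is a per-file flag that is re-armed by a
delayed work item roughly 100ms after the client's last wait; the
sketch below is condensed from the patch (error paths omitted):

    static bool can_wait_boost(struct drm_i915_file_private *file_priv)
    {
            if (file_priv == NULL)
                    return true;    /* kernel-internal waits may always boost */

            /* one boost per client until the idle work re-arms the flag */
            return !atomic_xchg(&file_priv->rps_wait_boost, true);
    }

    /* in __wait_seqno(), before going to sleep: */
    if (dev_priv->info->gen >= 6 && can_wait_boost(file_priv)) {
            gen6_rps_boost(dev_priv);
            if (file_priv)
                    mod_delayed_work(dev_priv->wq, &file_priv->mm.idle_work,
                                     msecs_to_jiffies(100));
    }

    /* and in the file's idle_work handler, once the client has been
     * quiescent for ~100ms: */
    atomic_set(&file_priv->rps_wait_boost, false);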

Unfortunately, this technique is ineffective with Ironlake - which also
has dynamic render power states and suffers just as dramatically. For
Ironlake, the thermal/power headroom is shared with the CPU through
Intelligent Power Sharing and the intel-ips module. This leaves us with
no GPU boost frequencies available when coming out of idle, and due to
hardware limitations we cannot change the arbitration between the CPU and
GPU quickly enough to be effective.

v2: Limit each client to receiving a single boost for each active period.
    Tested by QA to only marginally increase power, and to demonstrably
    increase throughput in games. No latency measurements yet.

v3: Cater for front-buffer rendering with manual throttling.

v4: Tidy up.

v5: Sadly the compositor needs frequent boosts as it may never idle,
but, due to its picking mechanism (using ReadPixels), it may require
frequent waits. Those waits, along with the waits for the vrefresh
swap, conspire
to keep the GPU at low frequencies despite the interactive latency. To
overcome this we ditch the one-boost-per-active-period and just ratelimit
the number of wait-boosts each client can receive.

Reported-and-tested-by: Paul Neumann <paul104x@yahoo.de>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=68716
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Stéphane Marchesin <stephane.marchesin@gmail.com>
Cc: Owen Taylor <otaylor@redhat.com>
Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com>
Cc: "Zhuang, Lena" <lena.zhuang@intel.com>
---
 drivers/gpu/drm/i915/i915_dma.c      |  16 ++---
 drivers/gpu/drm/i915/i915_drv.h      |  19 +++--
 drivers/gpu/drm/i915/i915_gem.c      | 135 ++++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_irq.c      |  11 ---
 drivers/gpu/drm/i915/intel_display.c |   3 +
 drivers/gpu/drm/i915/intel_drv.h     |   3 +
 drivers/gpu/drm/i915/intel_pm.c      |  42 ++++++-----
 7 files changed, 138 insertions(+), 91 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index d35de1b..57ca5af 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1816,19 +1816,11 @@ int i915_driver_unload(struct drm_device *dev)
 
 int i915_driver_open(struct drm_device *dev, struct drm_file *file)
 {
-	struct drm_i915_file_private *file_priv;
-
-	DRM_DEBUG_DRIVER("\n");
-	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
-	if (!file_priv)
-		return -ENOMEM;
-
-	file->driver_priv = file_priv;
-
-	spin_lock_init(&file_priv->mm.lock);
-	INIT_LIST_HEAD(&file_priv->mm.request_list);
+	int ret;
 
-	idr_init(&file_priv->context_idr);
+	ret = i915_gem_open(dev, file);
+	if (ret)
+		return ret;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 906f4fb..ddc528b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -853,9 +853,6 @@ struct intel_gen6_power_mgmt {
 	struct work_struct work;
 	u32 pm_iir;
 
-	/* On vlv we need to manually drop to Vmin with a delayed work. */
-	struct delayed_work vlv_work;
-
 	/* The below variables an all the rps hw state are protected by
 	 * dev->struct mutext. */
 	u8 cur_delay;
@@ -974,6 +971,15 @@ struct i915_gem_mm {
 	struct delayed_work retire_work;
 
 	/**
+	 * When we detect an idle GPU, we want to turn on
+	 * powersaving features. So once we see that there
+	 * are no more requests outstanding and no more
+	 * arrive within a small period of time, we fire
+	 * off the idle_work.
+	 */
+	struct delayed_work idle_work;
+
+	/**
 	 * Are we in a non-interruptible section of code like
 	 * modesetting?
 	 */
@@ -1606,13 +1612,17 @@ struct drm_i915_gem_request {
 };
 
 struct drm_i915_file_private {
+	struct drm_i915_private *dev_priv;
+
 	struct {
 		spinlock_t lock;
 		struct list_head request_list;
+		struct delayed_work idle_work;
 	} mm;
 	struct idr context_idr;
 
 	struct i915_ctx_hang_stats hang_stats;
+	atomic_t rps_wait_boost;
 };
 
 #define INTEL_INFO(dev)	(to_i915(dev)->info)
@@ -1962,7 +1972,7 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
 	}
 }
 
-void i915_gem_retire_requests(struct drm_device *dev);
+bool i915_gem_retire_requests(struct drm_device *dev);
 void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
 int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
 				      bool interruptible);
@@ -2013,6 +2023,7 @@ int i915_gem_attach_phys_object(struct drm_device *dev,
 void i915_gem_detach_phys_object(struct drm_device *dev,
 				 struct drm_i915_gem_object *obj);
 void i915_gem_free_all_phys_object(struct drm_device *dev);
+int i915_gem_open(struct drm_device *dev, struct drm_file *file);
 void i915_gem_release(struct drm_device *dev, struct drm_file *file);
 
 uint32_t
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 40e1293..a4b1ccb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -982,6 +982,14 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
 	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
 }
 
+static bool can_wait_boost(struct drm_i915_file_private *file_priv)
+{
+	if (file_priv == NULL)
+		return true;
+
+	return !atomic_xchg(&file_priv->rps_wait_boost, true);
+}
+
 /**
  * __wait_seqno - wait until execution of seqno has finished
  * @ring: the ring expected to report seqno
@@ -1002,7 +1010,9 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
  */
 static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 			unsigned reset_counter,
-			bool interruptible, struct timespec *timeout)
+			bool interruptible,
+			struct timespec *timeout,
+			struct drm_i915_file_private *file_priv)
 {
 	drm_i915_private_t *dev_priv = ring->dev->dev_private;
 	struct timespec before, now;
@@ -1017,6 +1027,14 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 
 	timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
 
+	if (dev_priv->info->gen >= 6 && can_wait_boost(file_priv)) {
+		gen6_rps_boost(dev_priv);
+		if (file_priv)
+			mod_delayed_work(dev_priv->wq,
+					 &file_priv->mm.idle_work,
+					 msecs_to_jiffies(100));
+	}
+
 	if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
 	    WARN_ON(!ring->irq_get(ring)))
 		return -ENODEV;
@@ -1116,7 +1134,7 @@ i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno)
 
 	return __wait_seqno(ring, seqno,
 			    atomic_read(&dev_priv->gpu_error.reset_counter),
-			    interruptible, NULL);
+			    interruptible, NULL, NULL);
 }
 
 static int
@@ -1166,6 +1184,7 @@ i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj,
  */
 static __must_check int
 i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
+					    struct drm_file *file,
 					    bool readonly)
 {
 	struct drm_device *dev = obj->base.dev;
@@ -1192,7 +1211,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 
 	reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
 	mutex_unlock(&dev->struct_mutex);
-	ret = __wait_seqno(ring, seqno, reset_counter, true, NULL);
+	ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, file->driver_priv);
 	mutex_lock(&dev->struct_mutex);
 	if (ret)
 		return ret;
@@ -1241,7 +1260,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	 * We will repeat the flush holding the lock in the normal manner
 	 * to catch cases where we are gazumped.
 	 */
-	ret = i915_gem_object_wait_rendering__nonblocking(obj, !write_domain);
+	ret = i915_gem_object_wait_rendering__nonblocking(obj, file, !write_domain);
 	if (ret)
 		goto unref;
 
@@ -2162,6 +2181,7 @@ int __i915_add_request(struct intel_ring_buffer *ring,
 		i915_queue_hangcheck(ring->dev);
 
 		if (was_empty) {
+			cancel_delayed_work_sync(&dev_priv->mm.idle_work);
 			queue_delayed_work(dev_priv->wq,
 					   &dev_priv->mm.retire_work,
 					   round_jiffies_up_relative(HZ));
@@ -2183,10 +2203,8 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 		return;
 
 	spin_lock(&file_priv->mm.lock);
-	if (request->file_priv) {
-		list_del(&request->client_list);
-		request->file_priv = NULL;
-	}
+	list_del(&request->client_list);
+	request->file_priv = NULL;
 	spin_unlock(&file_priv->mm.lock);
 }
 
@@ -2450,57 +2468,53 @@ i915_gem_retire_requests_ring(struct intel_ring_buffer *ring)
 	WARN_ON(i915_verify_lists(ring->dev));
 }
 
-void
+bool
 i915_gem_retire_requests(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct intel_ring_buffer *ring;
+	bool idle = true;
 	int i;
 
-	for_each_ring(ring, dev_priv, i)
+	for_each_ring(ring, dev_priv, i) {
 		i915_gem_retire_requests_ring(ring);
+		idle &= list_empty(&ring->request_list);
+	}
+
+	if (idle)
+		mod_delayed_work(dev_priv->wq,
+				   &dev_priv->mm.idle_work,
+				   msecs_to_jiffies(100));
+
+	return idle;
 }
 
 static void
 i915_gem_retire_work_handler(struct work_struct *work)
 {
-	drm_i915_private_t *dev_priv;
-	struct drm_device *dev;
-	struct intel_ring_buffer *ring;
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv), mm.retire_work.work);
+	struct drm_device *dev = dev_priv->dev;
 	bool idle;
-	int i;
-
-	dev_priv = container_of(work, drm_i915_private_t,
-				mm.retire_work.work);
-	dev = dev_priv->dev;
 
 	/* Come back later if the device is busy... */
-	if (!mutex_trylock(&dev->struct_mutex)) {
-		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work,
-				   round_jiffies_up_relative(HZ));
-		return;
-	}
-
-	i915_gem_retire_requests(dev);
-
-	/* Send a periodic flush down the ring so we don't hold onto GEM
-	 * objects indefinitely.
-	 */
-	idle = true;
-	for_each_ring(ring, dev_priv, i) {
-		if (ring->gpu_caches_dirty)
-			i915_add_request(ring, NULL);
-
-		idle &= list_empty(&ring->request_list);
+	idle = false;
+	if (mutex_trylock(&dev->struct_mutex)) {
+		idle = i915_gem_retire_requests(dev);
+		mutex_unlock(&dev->struct_mutex);
 	}
-
-	if (!dev_priv->ums.mm_suspended && !idle)
+	if (!idle)
 		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work,
 				   round_jiffies_up_relative(HZ));
-	if (idle)
-		intel_mark_idle(dev);
+}
 
-	mutex_unlock(&dev->struct_mutex);
+static void
+i915_gem_idle_work_handler(struct work_struct *work)
+{
+	struct drm_i915_private *dev_priv =
+		container_of(work, typeof(*dev_priv), mm.idle_work.work);
+
+	intel_mark_idle(dev_priv->dev);
 }
 
 /**
@@ -2598,7 +2612,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	reset_counter = atomic_read(&dev_priv->gpu_error.reset_counter);
 	mutex_unlock(&dev->struct_mutex);
 
-	ret = __wait_seqno(ring, seqno, reset_counter, true, timeout);
+	ret = __wait_seqno(ring, seqno, reset_counter, true, timeout, file->driver_priv);
 	if (timeout)
 		args->timeout_ns = timespec_to_ns(timeout);
 	return ret;
@@ -3801,7 +3815,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	if (seqno == 0)
 		return 0;
 
-	ret = __wait_seqno(ring, seqno, reset_counter, true, NULL);
+	ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, NULL);
 	if (ret == 0)
 		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
 
@@ -4274,6 +4288,7 @@ i915_gem_idle(struct drm_device *dev)
 
 	/* Cancel the retire work handler, which should be idle now. */
 	cancel_delayed_work_sync(&dev_priv->mm.retire_work);
+	cancel_delayed_work_sync(&dev_priv->mm.idle_work);
 
 	return 0;
 }
@@ -4607,6 +4622,8 @@ i915_gem_load(struct drm_device *dev)
 		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
 	INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
 			  i915_gem_retire_work_handler);
+	INIT_DELAYED_WORK(&dev_priv->mm.idle_work,
+			  i915_gem_idle_work_handler);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 
 	/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
@@ -4831,6 +4848,8 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 
+	cancel_delayed_work_sync(&file_priv->mm.idle_work);
+
 	/* Clean up our request list when the client is going away, so that
 	 * later retire_requests won't dereference our soon-to-be-gone
 	 * file_priv.
@@ -4848,6 +4867,38 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static void
+i915_gem_file_idle_work_handler(struct work_struct *work)
+{
+	struct drm_i915_file_private *file_priv =
+		container_of(work, typeof(*file_priv), mm.idle_work.work);
+
+	atomic_set(&file_priv->rps_wait_boost, false);
+}
+
+int i915_gem_open(struct drm_device *dev, struct drm_file *file)
+{
+	struct drm_i915_file_private *file_priv;
+
+	DRM_DEBUG_DRIVER("\n");
+
+	file_priv = kzalloc(sizeof(*file_priv), GFP_KERNEL);
+	if (!file_priv)
+		return -ENOMEM;
+
+	file->driver_priv = file_priv;
+	file_priv->dev_priv = dev->dev_private;
+
+	spin_lock_init(&file_priv->mm.lock);
+	INIT_LIST_HEAD(&file_priv->mm.request_list);
+	INIT_DELAYED_WORK(&file_priv->mm.idle_work,
+			  i915_gem_file_idle_work_handler);
+
+	idr_init(&file_priv->context_idr);
+
+	return 0;
+}
+
 static bool mutex_is_locked_by(struct mutex *mutex, struct task_struct *task)
 {
 	if (!mutex_is_locked(mutex))
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 05c05a6..6ee5572 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -859,17 +859,6 @@ static void gen6_pm_rps_work(struct work_struct *work)
 			gen6_set_rps(dev_priv->dev, new_delay);
 	}
 
-	if (IS_VALLEYVIEW(dev_priv->dev)) {
-		/*
-		 * On VLV, when we enter RC6 we may not be at the minimum
-		 * voltage level, so arm a timer to check.  It should only
-		 * fire when there's activity or once after we've entered
-		 * RC6, and then won't be re-armed until the next RPS interrupt.
-		 */
-		mod_delayed_work(dev_priv->wq, &dev_priv->rps.vlv_work,
-				 msecs_to_jiffies(100));
-	}
-
 	mutex_unlock(&dev_priv->rps.hw_lock);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 5fa58ad..9b810a7 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7627,6 +7627,9 @@ void intel_mark_idle(struct drm_device *dev)
 
 		intel_decrease_pllclock(crtc);
 	}
+
+	if (dev_priv->info->gen >= 6)
+		gen6_rps_idle(dev->dev_private);
 }
 
 void intel_mark_fb_busy(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 8c3cb3e..993e844 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -826,4 +826,7 @@ int intel_sprite_get_colorkey(struct drm_device *dev, void *data,
 /* intel_tv.c */
 void intel_tv_init(struct drm_device *dev);
 
+extern void gen6_rps_idle(struct drm_i915_private *dev_priv);
+extern void gen6_rps_boost(struct drm_i915_private *dev_priv);
+
 #endif /* __INTEL_DRV_H__ */
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index d27eda6..6c59420 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3335,6 +3335,26 @@ void gen6_set_rps(struct drm_device *dev, u8 val)
 	trace_intel_gpu_freq_change(val * 50);
 }
 
+void gen6_rps_idle(struct drm_i915_private *dev_priv)
+{
+	mutex_lock(&dev_priv->rps.hw_lock);
+	if (dev_priv->info->is_valleyview)
+		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
+	else
+		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
+	mutex_unlock(&dev_priv->rps.hw_lock);
+}
+
+void gen6_rps_boost(struct drm_i915_private *dev_priv)
+{
+	mutex_lock(&dev_priv->rps.hw_lock);
+	if (dev_priv->info->is_valleyview)
+		valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_delay);
+	else
+		gen6_set_rps(dev_priv->dev, dev_priv->rps.max_delay);
+	mutex_unlock(&dev_priv->rps.hw_lock);
+}
+
 /*
  * Wait until the previous freq change has completed,
  * or the timeout elapsed, and then update our notion
@@ -3724,24 +3744,6 @@ int valleyview_rps_min_freq(struct drm_i915_private *dev_priv)
 	return vlv_punit_read(dev_priv, PUNIT_REG_GPU_LFM) & 0xff;
 }
 
-static void vlv_rps_timer_work(struct work_struct *work)
-{
-	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
-						    rps.vlv_work.work);
-
-	/*
-	 * Timer fired, we must be idle.  Drop to min voltage state.
-	 * Note: we use RPe here since it should match the
-	 * Vmin we were shooting for.  That should give us better
-	 * perf when we come back out of RC6 than if we used the
-	 * min freq available.
-	 */
-	mutex_lock(&dev_priv->rps.hw_lock);
-	if (dev_priv->rps.cur_delay > dev_priv->rps.rpe_delay)
-		valleyview_set_rps(dev_priv->dev, dev_priv->rps.rpe_delay);
-	mutex_unlock(&dev_priv->rps.hw_lock);
-}
-
 static void valleyview_setup_pctx(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -3880,8 +3882,6 @@ static void valleyview_enable_rps(struct drm_device *dev)
 				      dev_priv->rps.rpe_delay),
 			 dev_priv->rps.rpe_delay);
 
-	INIT_DELAYED_WORK(&dev_priv->rps.vlv_work, vlv_rps_timer_work);
-
 	valleyview_set_rps(dev_priv->dev, dev_priv->rps.rpe_delay);
 
 	gen6_enable_rps_interrupts(dev);
@@ -4621,8 +4621,6 @@ void intel_disable_gt_powersave(struct drm_device *dev)
 	} else if (INTEL_INFO(dev)->gen >= 6) {
 		cancel_delayed_work_sync(&dev_priv->rps.delayed_resume_work);
 		cancel_work_sync(&dev_priv->rps.work);
-		if (IS_VALLEYVIEW(dev))
-			cancel_delayed_work_sync(&dev_priv->rps.vlv_work);
 		mutex_lock(&dev_priv->rps.hw_lock);
 		if (IS_VALLEYVIEW(dev))
 			valleyview_disable_rps(dev);
-- 
1.8.4.rc3


* [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock
  2013-09-25 16:34 [PATCH 1/3] drm/i915: Fix __wait_seqno to use true infinite timeouts Chris Wilson
  2013-09-25 16:34 ` [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls Chris Wilson
@ 2013-09-25 16:34 ` Chris Wilson
  2013-10-01 22:04   ` Jesse Barnes
  1 sibling, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2013-09-25 16:34 UTC (permalink / raw)
  To: intel-gfx; +Cc: Owen Taylor, Stéphane Marchesin, Zhuang, Lena

After applying wait-boost we often find ourselves stuck at higher clocks
than required. The current threshold value requires the GPU to be
continuously and completely idle for 313ms before it is dropped by one
bin. Conversely, we require the GPU to be busy for an average of 90% over
an 84ms period before we upclock. So the current thresholds almost never
downclock the GPU, and respond very slowly to sudden demands for more
power. It is easy to observe that we currently lock into the wrong bin
and both underperform in benchmarks and consume more power than optimal
(just by repeating the task and measuring the different results).
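
(For reference, those figures appear to follow from the threshold
programming being replaced below, using the ~1280ns evaluation-interval
units noted in the patch: the old GEN6_RP_DOWN_THRESHOLD of 245000 with
GEN6_RP_DOWN_IDLE_CONT amounts to 245000 * 1.28us ~= 313ms of
continuous idleness before a downclock, while the old
GEN6_RP_UP_THRESHOLD / GEN6_RP_UP_EI pair of 59400 / 66000 means 90%
busy averaged over 66000 * 1.28us ~= 84ms before an upclock.)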

An alternative approach, as discussed in the bspec, is to use a
continuous threshold for upclocking, and an average value for downclocking.
This is good for quickly detecting and reacting to state changes within a
frame, however it fails with the common throttling method of waiting
upon the outstanding frame - at least it is difficult to choose a
threshold that works well at 15,000fps and at 60fps. So continue to use
average busy/idle loads to determine frequency change.

v2: Use 3 power zones to keep frequencies low in steady-state mostly
idle (e.g. scrolling, interactive 2D drawing), and frequencies high
for demanding games. In between those end-states, we use a
fast-reclocking algorithm to converge more quickly on the desired bin.
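
For reference, the thresholds programmed for the three zones in this
version (again in ~1.28us units, see the patch below) work out to
roughly:

    Zone         Upclock when busier than    Downclock when less busy than
    LOW_POWER    95% over ~16ms              85% over ~32ms
    BETWEEN      90% over ~13ms              75% over ~32ms
    HIGH_POWER   85% over ~10ms              60% over ~32ms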

v3: Bug fixes - make sure we reset adj after switching power zones.

v4: Tune - drop the continuous busy thresholds as it prevents us from
choosing the right frequency for glxgears style swap benchmarks. Instead
the goal is to be able to find the right clocks irrespective of the
wait-boost.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Stéphane Marchesin <stephane.marchesin@gmail.com>
Cc: Owen Taylor <otaylor@redhat.com>
Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com>
Cc: "Zhuang, Lena" <lena.zhuang@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h |   5 ++
 drivers/gpu/drm/i915/i915_irq.c |  46 ++++++++++----
 drivers/gpu/drm/i915/i915_reg.h |   2 +-
 drivers/gpu/drm/i915/intel_pm.c | 137 ++++++++++++++++++++++++++++++----------
 4 files changed, 143 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ddc528b..fd0a28d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -859,8 +859,13 @@ struct intel_gen6_power_mgmt {
 	u8 min_delay;
 	u8 max_delay;
 	u8 rpe_delay;
+	u8 rp1_delay;
+	u8 rp0_delay;
 	u8 hw_max;
 
+	int last_adj;
+	enum { LOW_POWER, BETWEEN, HIGH_POWER } power;
+
 	struct delayed_work delayed_resume_work;
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6ee5572..418ad64 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -818,7 +818,7 @@ static void gen6_pm_rps_work(struct work_struct *work)
 	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
 						    rps.work);
 	u32 pm_iir;
-	u8 new_delay;
+	int new_delay, adj;
 
 	spin_lock_irq(&dev_priv->irq_lock);
 	pm_iir = dev_priv->rps.pm_iir;
@@ -835,29 +835,49 @@ static void gen6_pm_rps_work(struct work_struct *work)
 
 	mutex_lock(&dev_priv->rps.hw_lock);
 
+	adj = dev_priv->rps.last_adj;
 	if (pm_iir & GEN6_PM_RP_UP_THRESHOLD) {
-		new_delay = dev_priv->rps.cur_delay + 1;
+		if (adj > 0)
+			adj *= 2;
+		else
+			adj = 1;
+		new_delay = dev_priv->rps.cur_delay + adj;
 
 		/*
 		 * For better performance, jump directly
 		 * to RPe if we're below it.
 		 */
-		if (IS_VALLEYVIEW(dev_priv->dev) &&
-		    dev_priv->rps.cur_delay < dev_priv->rps.rpe_delay)
+		if (new_delay < dev_priv->rps.rpe_delay)
+			new_delay = dev_priv->rps.rpe_delay;
+	} else if (pm_iir & GEN6_PM_RP_DOWN_TIMEOUT) {
+		if (dev_priv->rps.cur_delay > dev_priv->rps.rpe_delay)
 			new_delay = dev_priv->rps.rpe_delay;
-	} else
-		new_delay = dev_priv->rps.cur_delay - 1;
+		else
+			new_delay = dev_priv->rps.min_delay;
+		adj = 0;
+	} else if (pm_iir & GEN6_PM_RP_DOWN_THRESHOLD) {
+		if (adj < 0)
+			adj *= 2;
+		else
+			adj = -1;
+		new_delay = dev_priv->rps.cur_delay + adj;
+	} else { /* unknown event */
+		new_delay = dev_priv->rps.cur_delay;
+	}
 
 	/* sysfs frequency interfaces may have snuck in while servicing the
 	 * interrupt
 	 */
-	if (new_delay >= dev_priv->rps.min_delay &&
-	    new_delay <= dev_priv->rps.max_delay) {
-		if (IS_VALLEYVIEW(dev_priv->dev))
-			valleyview_set_rps(dev_priv->dev, new_delay);
-		else
-			gen6_set_rps(dev_priv->dev, new_delay);
-	}
+	if (new_delay < (int)dev_priv->rps.min_delay)
+		new_delay = dev_priv->rps.min_delay;
+	if (new_delay > (int)dev_priv->rps.max_delay)
+		new_delay = dev_priv->rps.max_delay;
+	dev_priv->rps.last_adj = new_delay - dev_priv->rps.cur_delay;
+
+	if (IS_VALLEYVIEW(dev_priv->dev))
+		valleyview_set_rps(dev_priv->dev, new_delay);
+	else
+		gen6_set_rps(dev_priv->dev, new_delay);
 
 	mutex_unlock(&dev_priv->rps.hw_lock);
 }
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 00fda45..ce8458c 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -4676,7 +4676,7 @@
 #define   GEN6_RP_UP_IDLE_MIN			(0x1<<3)
 #define   GEN6_RP_UP_BUSY_AVG			(0x2<<3)
 #define   GEN6_RP_UP_BUSY_CONT			(0x4<<3)
-#define   GEN7_RP_DOWN_IDLE_AVG			(0x2<<0)
+#define   GEN6_RP_DOWN_IDLE_AVG			(0x2<<0)
 #define   GEN6_RP_DOWN_IDLE_CONT		(0x1<<0)
 #define GEN6_RP_UP_THRESHOLD			0xA02C
 #define GEN6_RP_DOWN_THRESHOLD			0xA030
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index 6c59420..c269d5a 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
@@ -3302,6 +3302,98 @@ static u32 gen6_rps_limits(struct drm_i915_private *dev_priv, u8 *val)
 	return limits;
 }
 
+static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
+{
+	int new_power;
+
+	new_power = dev_priv->rps.power;
+	switch (dev_priv->rps.power) {
+	case LOW_POWER:
+		if (val > dev_priv->rps.rpe_delay + 1 && val > dev_priv->rps.cur_delay)
+			new_power = BETWEEN;
+		break;
+
+	case BETWEEN:
+		if (val <= dev_priv->rps.rpe_delay && val < dev_priv->rps.cur_delay)
+			new_power = LOW_POWER;
+		else if (val >= dev_priv->rps.rp0_delay && val > dev_priv->rps.cur_delay)
+			new_power = HIGH_POWER;
+		break;
+
+	case HIGH_POWER:
+		if (val < (dev_priv->rps.rp1_delay + dev_priv->rps.rp0_delay) >> 1 && val < dev_priv->rps.cur_delay)
+			new_power = BETWEEN;
+		break;
+	}
+	/* Max/min bins are special */
+	if (val == dev_priv->rps.min_delay)
+		new_power = LOW_POWER;
+	if (val == dev_priv->rps.max_delay)
+		new_power = HIGH_POWER;
+	if (new_power == dev_priv->rps.power)
+		return;
+
+	/* Note the units here are not exactly 1us, but 1280ns. */
+	switch (new_power) {
+	case LOW_POWER:
+		/* Upclock if more than 95% busy over 16ms */
+		I915_WRITE(GEN6_RP_UP_EI, 12500);
+		I915_WRITE(GEN6_RP_UP_THRESHOLD, 11800);
+
+		/* Downclock if less than 85% busy over 32ms */
+		I915_WRITE(GEN6_RP_DOWN_EI, 25000);
+		I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 21250);
+
+		I915_WRITE(GEN6_RP_CONTROL,
+			   GEN6_RP_MEDIA_TURBO |
+			   GEN6_RP_MEDIA_HW_NORMAL_MODE |
+			   GEN6_RP_MEDIA_IS_GFX |
+			   GEN6_RP_ENABLE |
+			   GEN6_RP_UP_BUSY_AVG |
+			   GEN6_RP_DOWN_IDLE_AVG);
+		break;
+
+	case BETWEEN:
+		/* Upclock if more than 90% busy over 13ms */
+		I915_WRITE(GEN6_RP_UP_EI, 10250);
+		I915_WRITE(GEN6_RP_UP_THRESHOLD, 9225);
+
+		/* Downclock if less than 75% busy over 32ms */
+		I915_WRITE(GEN6_RP_DOWN_EI, 25000);
+		I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 18750);
+
+		I915_WRITE(GEN6_RP_CONTROL,
+			   GEN6_RP_MEDIA_TURBO |
+			   GEN6_RP_MEDIA_HW_NORMAL_MODE |
+			   GEN6_RP_MEDIA_IS_GFX |
+			   GEN6_RP_ENABLE |
+			   GEN6_RP_UP_BUSY_AVG |
+			   GEN6_RP_DOWN_IDLE_AVG);
+		break;
+
+	case HIGH_POWER:
+		/* Upclock if more than 85% busy over 10ms */
+		I915_WRITE(GEN6_RP_UP_EI, 8000);
+		I915_WRITE(GEN6_RP_UP_THRESHOLD, 6800);
+
+		/* Downclock if less than 60% busy over 32ms */
+		I915_WRITE(GEN6_RP_DOWN_EI, 25000);
+		I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 15000);
+
+		I915_WRITE(GEN6_RP_CONTROL,
+			   GEN6_RP_MEDIA_TURBO |
+			   GEN6_RP_MEDIA_HW_NORMAL_MODE |
+			   GEN6_RP_MEDIA_IS_GFX |
+			   GEN6_RP_ENABLE |
+			   GEN6_RP_UP_BUSY_AVG |
+			   GEN6_RP_DOWN_IDLE_AVG);
+		break;
+	}
+
+	dev_priv->rps.power = new_power;
+	dev_priv->rps.last_adj = 0;
+}
+
 void gen6_set_rps(struct drm_device *dev, u8 val)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -3314,6 +3406,8 @@ void gen6_set_rps(struct drm_device *dev, u8 val)
 	if (val == dev_priv->rps.cur_delay)
 		return;
 
+	gen6_set_rps_thresholds(dev_priv, val);
+
 	if (IS_HASWELL(dev))
 		I915_WRITE(GEN6_RPNSWREQ,
 			   HSW_FREQUENCY(val));
@@ -3342,6 +3436,7 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
 		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
 	else
 		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
+	dev_priv->rps.last_adj = 0;
 	mutex_unlock(&dev_priv->rps.hw_lock);
 }
 
@@ -3352,6 +3447,7 @@ void gen6_rps_boost(struct drm_i915_private *dev_priv)
 		valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_delay);
 	else
 		gen6_set_rps(dev_priv->dev, dev_priv->rps.max_delay);
+	dev_priv->rps.last_adj = 0;
 	mutex_unlock(&dev_priv->rps.hw_lock);
 }
 
@@ -3536,7 +3632,10 @@ static void gen6_enable_rps(struct drm_device *dev)
 
 	/* In units of 50MHz */
 	dev_priv->rps.hw_max = dev_priv->rps.max_delay = rp_state_cap & 0xff;
-	dev_priv->rps.min_delay = (rp_state_cap & 0xff0000) >> 16;
+	dev_priv->rps.min_delay = (rp_state_cap >> 16) & 0xff;
+	dev_priv->rps.rp1_delay = (rp_state_cap >>  8) & 0xff;
+	dev_priv->rps.rp0_delay = (rp_state_cap >>  0) & 0xff;
+	dev_priv->rps.rpe_delay = dev_priv->rps.rp1_delay;
 	dev_priv->rps.cur_delay = 0;
 
 	/* disable the counters and set deterministic thresholds */
@@ -3584,38 +3683,9 @@ static void gen6_enable_rps(struct drm_device *dev)
 		   GEN6_RC_CTL_EI_MODE(1) |
 		   GEN6_RC_CTL_HW_ENABLE);
 
-	if (IS_HASWELL(dev)) {
-		I915_WRITE(GEN6_RPNSWREQ,
-			   HSW_FREQUENCY(10));
-		I915_WRITE(GEN6_RC_VIDEO_FREQ,
-			   HSW_FREQUENCY(12));
-	} else {
-		I915_WRITE(GEN6_RPNSWREQ,
-			   GEN6_FREQUENCY(10) |
-			   GEN6_OFFSET(0) |
-			   GEN6_AGGRESSIVE_TURBO);
-		I915_WRITE(GEN6_RC_VIDEO_FREQ,
-			   GEN6_FREQUENCY(12));
-	}
-
-	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 1000000);
-	I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
-		   dev_priv->rps.max_delay << 24 |
-		   dev_priv->rps.min_delay << 16);
-
-	I915_WRITE(GEN6_RP_UP_THRESHOLD, 59400);
-	I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 245000);
-	I915_WRITE(GEN6_RP_UP_EI, 66000);
-	I915_WRITE(GEN6_RP_DOWN_EI, 350000);
-
+	/* Power down if completely idle for over 50ms */
+	I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 50000);
 	I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
-	I915_WRITE(GEN6_RP_CONTROL,
-		   GEN6_RP_MEDIA_TURBO |
-		   GEN6_RP_MEDIA_HW_NORMAL_MODE |
-		   GEN6_RP_MEDIA_IS_GFX |
-		   GEN6_RP_ENABLE |
-		   GEN6_RP_UP_BUSY_AVG |
-		   (IS_HASWELL(dev) ? GEN7_RP_DOWN_IDLE_AVG : GEN6_RP_DOWN_IDLE_CONT));
 
 	ret = sandybridge_pcode_write(dev_priv, GEN6_PCODE_WRITE_MIN_FREQ_TABLE, 0);
 	if (!ret) {
@@ -3631,7 +3701,8 @@ static void gen6_enable_rps(struct drm_device *dev)
 		DRM_DEBUG_DRIVER("Failed to set the min frequency\n");
 	}
 
-	gen6_set_rps(dev_priv->dev, (gt_perf_status & 0xff00) >> 8);
+	dev_priv->rps.power = HIGH_POWER; /* force a reset */
+	gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
 
 	gen6_enable_rps_interrupts(dev);
 
-- 
1.8.4.rc3


* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-09-25 16:34 ` [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls Chris Wilson
@ 2013-10-01 21:54   ` Jesse Barnes
  2013-10-01 21:56     ` Jesse Barnes
  2013-10-01 22:23     ` Chris Wilson
  0 siblings, 2 replies; 15+ messages in thread
From: Jesse Barnes @ 2013-10-01 21:54 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, 25 Sep 2013 17:34:56 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> +{
> +	mutex_lock(&dev_priv->rps.hw_lock);
> +	if (dev_priv->info->is_valleyview)
> +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> +	else
> +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> +	mutex_unlock(&dev_priv->rps.hw_lock);
> +}

Looks pretty good, but I think these should be rpe_delay instead.  Not
much point in going down to a less efficient frequency...

Jesse


* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-01 21:54   ` Jesse Barnes
@ 2013-10-01 21:56     ` Jesse Barnes
  2013-10-01 22:23     ` Chris Wilson
  1 sibling, 0 replies; 15+ messages in thread
From: Jesse Barnes @ 2013-10-01 21:56 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Tue, 1 Oct 2013 14:54:26 -0700
Jesse Barnes <jbarnes@virtuousgeek.org> wrote:

> On Wed, 25 Sep 2013 17:34:56 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> > +{
> > +	mutex_lock(&dev_priv->rps.hw_lock);
> > +	if (dev_priv->info->is_valleyview)
> > +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > +	else
> > +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > +	mutex_unlock(&dev_priv->rps.hw_lock);
> > +}
> 
> Looks pretty good, but I think these should be rpe_delay instead.  Not
> much point in going down to a less efficient frequency...

Oh and you can have my
Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>

with that change (or if you can justify the above).

Jesse


* Re: [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock
  2013-09-25 16:34 ` [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock Chris Wilson
@ 2013-10-01 22:04   ` Jesse Barnes
  2013-10-03  7:41     ` Daniel Vetter
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2013-10-01 22:04 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, 25 Sep 2013 17:34:57 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> After applying wait-boost we often find ourselves stuck at higher clocks
> than required. The current threshold value requires the GPU to be
> continuously and completely idle for 313ms before it is dropped by one
> bin. Conversely, we require the GPU to be busy for an average of 90% over
> a 84ms period before we upclock. So the current thresholds almost never
> downclock the GPU, and respond very slowly to sudden demands for more
> power. It is easy to observe that we currently lock into the wrong bin
> and both underperform in benchmarks and consume more power than optimal
> (just by repeating the task and measuring the different results).
> 
> An alternative approach, as discussed in the bspec, is to use a
> continuous threshold for upclocking, and an average value for downclocking.
> This is good for quickly detecting and reacting to state changes within a
> frame, however it fails with the common throttling method of waiting
> upon the outstanding frame - at least it is difficult to choose a
> threshold that works well at 15,000fps and at 60fps. So continue to use
> average busy/idle loads to determine frequency change.
> 
> v2: Use 3 power zones to keep frequencies low in steady-state mostly
> idle (e.g. scrolling, interactive 2D drawing), and frequencies high
> for demanding games. In between those end-states, we use a
> fast-reclocking algorithm to converge more quickly on the desired bin.
> 
> v3: Bug fixes - make sure we reset adj after switching power zones.
> 
> v4: Tune - drop the continuous busy thresholds as it prevents us from
> choosing the right frequency for glxgears style swap benchmarks. Instead
> the goal is to be able to find the right clocks irrespective of the
> wait-boost.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> Cc: Stéphane Marchesin <stephane.marchesin@gmail.com>
> Cc: Owen Taylor <otaylor@redhat.com>
> Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com>
> Cc: "Zhuang, Lena" <lena.zhuang@intel.com>
> ---

It's a little scary to mess with these, but we've gotten some good
numbers so far so I guess it's ok.

As a follow up, it might be nice to expose the power, balanced,
performance profiles to userspace via sysfs.  Since we can't solve this
problem for all users and all needs, we can just punt it out to
userspace. :)

Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>

Jesse

* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-01 21:54   ` Jesse Barnes
  2013-10-01 21:56     ` Jesse Barnes
@ 2013-10-01 22:23     ` Chris Wilson
  2013-10-01 22:39       ` Jesse Barnes
  1 sibling, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2013-10-01 22:23 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Tue, Oct 01, 2013 at 02:54:26PM -0700, Jesse Barnes wrote:
> On Wed, 25 Sep 2013 17:34:56 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> > +{
> > +	mutex_lock(&dev_priv->rps.hw_lock);
> > +	if (dev_priv->info->is_valleyview)
> > +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > +	else
> > +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > +	mutex_unlock(&dev_priv->rps.hw_lock);
> > +}
> 
> Looks pretty good, but I think these should be rpe_delay instead.  Not
> much point in going down to a less efficient frequency...

Less efficient for what? My concern here is only with power draw when
idle. As soon as we start to render again (well very shortly afterwards
with this particular iteration) we bump up to rpe and then beyond.

Correct me if I am wrong but rpe is an inflection point rather than a
minimum?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-01 22:23     ` Chris Wilson
@ 2013-10-01 22:39       ` Jesse Barnes
  2013-10-02  0:33         ` Chris Wilson
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2013-10-01 22:39 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Tue, 1 Oct 2013 23:23:32 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Tue, Oct 01, 2013 at 02:54:26PM -0700, Jesse Barnes wrote:
> > On Wed, 25 Sep 2013 17:34:56 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > 
> > > +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> > > +{
> > > +	mutex_lock(&dev_priv->rps.hw_lock);
> > > +	if (dev_priv->info->is_valleyview)
> > > +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > +	else
> > > +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > +	mutex_unlock(&dev_priv->rps.hw_lock);
> > > +}
> > 
> > Looks pretty good, but I think these should be rpe_delay instead.  Not
> > much point in going down to a less efficient frequency...
> 
> Less efficient for what? My concern here is only with power draw when
> idle. As soon as we start to render again (well very shortly afterwards
> with this particular iteration) we bump up to rpe and then beyond.
> 
> Correct me if I am wrong but rpe is an inflection point rather than a
> minimum?

So yes, running at a lower than RPe freq will use less power, but it'll
also be less efficient (perf/power) than doing the same rendering at
RPe.

But if we're really idle, RC6 will kick in and the freq won't matter
(as long as it gets down to RPe anyway, since on VLV that's the only
way we'll get down to Vmin when we shut down).

Jesse


* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-01 22:39       ` Jesse Barnes
@ 2013-10-02  0:33         ` Chris Wilson
  2013-10-02 18:26           ` Jesse Barnes
  0 siblings, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2013-10-02  0:33 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Tue, Oct 01, 2013 at 03:39:40PM -0700, Jesse Barnes wrote:
> On Tue, 1 Oct 2013 23:23:32 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > On Tue, Oct 01, 2013 at 02:54:26PM -0700, Jesse Barnes wrote:
> > > On Wed, 25 Sep 2013 17:34:56 +0100
> > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > 
> > > > +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> > > > +{
> > > > +	mutex_lock(&dev_priv->rps.hw_lock);
> > > > +	if (dev_priv->info->is_valleyview)
> > > > +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > > +	else
> > > > +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > > +	mutex_unlock(&dev_priv->rps.hw_lock);
> > > > +}
> > > 
> > > Looks pretty good, but I think these should be rpe_delay instead.  Not
> > > much point in going down to a less efficient frequency...
> > 
> > Less efficient for what? My concern here is only with power draw when
> > idle. As soon as we start to render again (well very shortly afterwards
> > with this particular iteration) we bump up to rpe and then beyond.
> > 
> > Correct me if I am wrong but rpe is an inflection point rather than a
> > minimum?
> 
> So yes, running at a lower than RPe freq will use less power, but it'll
> also be less efficient (perf/power) than doing the same rendering at
> RPe.

Right, so I think so long as autotuning works, we can run at low power,
low efficiency for as long as that is capable of sustaining the desired
throughput and latency. And the heuristics we have here are pretty good
at detecting when more power is required.

> But if we're really idle, RC6 will kick in and the freq won't matter
> (as long as it gets down to RPe anyway, since on VLV that's the only
> way we'll get down to Vmin when we shut down).

I guess we do have one issue whereby min_delay can be overridden by the
user. So perhaps that should be rpn, or maybe it would be clearer as
vmin_delay.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-02  0:33         ` Chris Wilson
@ 2013-10-02 18:26           ` Jesse Barnes
  2013-10-02 21:57             ` Chris Wilson
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2013-10-02 18:26 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, 2 Oct 2013 01:33:24 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:

> On Tue, Oct 01, 2013 at 03:39:40PM -0700, Jesse Barnes wrote:
> > On Tue, 1 Oct 2013 23:23:32 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > 
> > > On Tue, Oct 01, 2013 at 02:54:26PM -0700, Jesse Barnes wrote:
> > > > On Wed, 25 Sep 2013 17:34:56 +0100
> > > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > 
> > > > > +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> > > > > +{
> > > > > +	mutex_lock(&dev_priv->rps.hw_lock);
> > > > > +	if (dev_priv->info->is_valleyview)
> > > > > +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > > > +	else
> > > > > +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > > > +	mutex_unlock(&dev_priv->rps.hw_lock);
> > > > > +}
> > > > 
> > > > Looks pretty good, but I think these should be rpe_delay instead.  Not
> > > > much point in going down to a less efficient frequency...
> > > 
> > > Less efficient for what? My concern here is only with power draw when
> > > idle. As soon as we start to render again (well very shortly afterwards
> > > with this particular iteration) we bump up to rpe and then beyond.
> > > 
> > > Correct me if I am wrong but rpe is an inflection point rather than a
> > > minimum?
> > 
> > So yes, running at a lower than RPe freq will use less power, but it'll
> > also be less efficient (perf/power) than doing the same rendering at
> > RPe.
> 
> Right, so I think so long as autotuning works, we can run at low power,
> low efficiency for as long as that is capable of sustaining the desired
> throughput and latency. And the heuristics we have here are pretty good
> at detecting when more power is required.

The downside is that we'll take longer to enter RC6 if we run at the
slower speed. And supposedly, running at RPe so we get to RC6 a
little faster ends up saving more power than running slower, even
though running slower uses less power while active.
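
Schematically, the comparison is between the energy of finishing the work
slowly and the energy of finishing it quickly and then sitting in RC6 for
the rest of the window; a back-of-envelope model with purely assumed
numbers (none of them are measurements):

#include <stdio.h>

int main(void)
{
	/* Placeholder figures, only to show the shape of the comparison. */
	double p_low = 2.0, t_low = 0.010;	/* W and s to finish at the low bin */
	double p_rpe = 2.5, t_rpe = 0.006;	/* W and s to finish at RPe */
	double p_rc6 = 0.1;			/* W while parked in RC6 */

	/* Energy over the same wall-clock window t_low. */
	double e_low = p_low * t_low;
	double e_rpe = p_rpe * t_rpe + p_rc6 * (t_low - t_rpe);

	printf("finish slowly: %.1f mJ, race to RPe then RC6: %.1f mJ\n",
	       e_low * 1e3, e_rpe * 1e3);
	return 0;
}

Whether the RPe line wins obviously depends on the real numbers.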

Or have you taken that into account here?

Jesse

* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-02 18:26           ` Jesse Barnes
@ 2013-10-02 21:57             ` Chris Wilson
  2013-10-02 22:18               ` Jesse Barnes
  0 siblings, 1 reply; 15+ messages in thread
From: Chris Wilson @ 2013-10-02 21:57 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, Oct 02, 2013 at 11:26:47AM -0700, Jesse Barnes wrote:
> On Wed, 2 Oct 2013 01:33:24 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > On Tue, Oct 01, 2013 at 03:39:40PM -0700, Jesse Barnes wrote:
> > > On Tue, 1 Oct 2013 23:23:32 +0100
> > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > 
> > > > On Tue, Oct 01, 2013 at 02:54:26PM -0700, Jesse Barnes wrote:
> > > > > On Wed, 25 Sep 2013 17:34:56 +0100
> > > > > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > > > 
> > > > > > +void gen6_rps_idle(struct drm_i915_private *dev_priv)
> > > > > > +{
> > > > > > +	mutex_lock(&dev_priv->rps.hw_lock);
> > > > > > +	if (dev_priv->info->is_valleyview)
> > > > > > +		valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > > > > +	else
> > > > > > +		gen6_set_rps(dev_priv->dev, dev_priv->rps.min_delay);
> > > > > > +	mutex_unlock(&dev_priv->rps.hw_lock);
> > > > > > +}
> > > > > 
> > > > > Looks pretty good, but I think these should be rpe_delay instead.  Not
> > > > > much point in going down to a less efficient frequency...
> > > > 
> > > > Less efficient for what? My concern here is only with power draw when
> > > > idle. As soon as we start to render again (well very shortly afterwards
> > > > with this particular iteration) we bump up to rpe and then beyond.
> > > > 
> > > > Correct me if I am wrong but rpe is an inflection point rather than a
> > > > > minimum?
> > > 
> > > So yes, running at a lower than RPe freq will use less power, but it'll
> > > also be less efficient (perf/power) than doing the same rendering at
> > > RPe.
> > 
> > Right, so I think so long as autotuning works, we can run at low power,
> > low efficiency for as long as that is capable of sustaining the desired
> > throughput and latency. And the heuristics we have here are pretty good
> > at detecting when more power is required.
> 
> The downside is that we'll take longer to enter RC6 if we run at the
> slower speed. And supposedly, running at RPe so that we get to RC6 a
> little faster ends up saving more power overall than running slower,
> even though running slower uses less power while active.
> 
> Or have you taken that into account here?

No, that is a factor I had not considered. I had been concerned about
monitoring latency whilst minimising frequency and using the power gauge
as the ultimate measure of success. One thing that is apparent with ivb
is that the power gauge (at least) is dependent upon workload. That is,
you can set high frequencies, but if the ring is idle (but not in rc6)
then it consumes relatively little power. (That still may be 1W on the
big GPUs, but it does mean that at low frequencies there is not a lot of
difference between rc6 on/off, a few 10s of mW, and we are concerned
with a window of about 10ms.) I suspect the difference is likely to be
in the noise for an idle desktop workload and would only really show
itself in a synthetic benchmark.

So we want to race-to-idle, except for when we don't want to
race-to-idle. Seems like a good reason to keep improving our measuring
tools.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-02 21:57             ` Chris Wilson
@ 2013-10-02 22:18               ` Jesse Barnes
  2013-10-02 22:20                 ` Jesse Barnes
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2013-10-02 22:18 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, 2 Oct 2013 22:57:41 +0100
Chris Wilson <chris@chris-wilson.co.uk> wrote:
> No, that is a factor I had not considered. I had been concerned about
> monitoring latency whilst minimising frequency and using the power gauge
> as the ultimate measure of success. One thing that is apparent with ivb
> is that the power gauge (at least) is dependent upon workload. That is,
> you can set high frequencies, but if the ring is idle (but not in rc6)
> then it consumes relatively little power. (That still may be 1W on the
> big GPUs, but it does mean that at low frequencies there is not a lot of
> difference between rc6 on/off, a few 10s of mW, and we are concerned
> with a window of about 10ms.) I suspect the difference is likely to be
> in the noise for an idle desktop workload and would only really show
> itself in a synthetic benchmark.

Hm are you sure about that?  It doesn't match the last measurements I
had from a long time ago... until RC6 kicked in, the GPU burned quite a
bit (several Watts).

If you're correct though, then I agree, this is quibbling over crumbs,
so not worth modifying.

> So we want to race-to-idle, except for when we don't want to
> race-to-idle. Seems like a good reason to keep improving our measuring
> tools.

Always.

Jesse

* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-02 22:18               ` Jesse Barnes
@ 2013-10-02 22:20                 ` Jesse Barnes
  2013-10-02 23:19                   ` Chris Wilson
  0 siblings, 1 reply; 15+ messages in thread
From: Jesse Barnes @ 2013-10-02 22:20 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, 2 Oct 2013 15:18:20 -0700
Jesse Barnes <jbarnes@virtuousgeek.org> wrote:

> On Wed, 2 Oct 2013 22:57:41 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > No, that is a factor I had not considered. I had been concerned about
> > monitoring latency whilst minimising frequency and using the power gauge
> > as the ultimate measure of success. One thing that is apparent with ivb
> > is that the power gauge (at least) is dependent upon workload. That is,
> > you can set high frequencies, but if the ring is idle (but not in rc6)
> > then it consumes relatively little power. (That still may be 1W on the
> > big GPUs, but it does mean that at low frequencies there is not a lot of
> > difference between rc6 on/off, a few 10s of mW, and we are concerned
> > with a window of about 10ms.) I suspect the difference is likely to be
> > in the noise for an idle desktop workload and would only really show
> > itself in a synthetic benchmark.
> 
> Hm are you sure about that?  It doesn't match the last measurements I
> had from a long time ago... until RC6 kicked in, the GPU burned quite a
> bit (several Watts).
> 
> If you're correct though, then I agree, this is quibbling over crumbs,
> so not worth modifying.

Oh and the other thing that comes to mind is platform power.  Even if
RC6 vs no RC6 at low freq is minimally different, entering RC6 allows
the platform to go into a deep sleep state if the CPUs are also idle.
So race to idle as efficiently as possible is probably still the right
answer.

Jesse

* Re: [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls
  2013-10-02 22:20                 ` Jesse Barnes
@ 2013-10-02 23:19                   ` Chris Wilson
  0 siblings, 0 replies; 15+ messages in thread
From: Chris Wilson @ 2013-10-02 23:19 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Wed, Oct 02, 2013 at 03:20:23PM -0700, Jesse Barnes wrote:
> On Wed, 2 Oct 2013 15:18:20 -0700
> Jesse Barnes <jbarnes@virtuousgeek.org> wrote:
> 
> > On Wed, 2 Oct 2013 22:57:41 +0100
> > Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > No, that is a factor I had not considered. I had been concerned about
> > > monitoring latency whilst minimising frequency and using the power gauge
> > > as the ultimate measure of success. One thing that is apparent with ivb
> > > is that the power gauge (at least) is dependent upon workload. That is,
> > > you can set high frequencies, but if the ring is idle (but not in rc6)
> > > then it consumes relatively little power. (That still may be 1W on the
> > > big GPUs, but it does mean that at low frequencies there is not a lot of
> > > difference between rc6 on/off, a few 10s of mW, and we are concerned
> > > with a window of about 10ms.) I suspect the difference is likely to be
> > > in the noise for an idle desktop workload and would only really show
> > > itself in a synthetic benchmark.
> > 
> > Hm are you sure about that?  It doesn't match the last measurements I
> > had from a long time ago... until RC6 kicked in, the GPU burned quite a
> > bit (several Watts).
> > 
> > If you're correct though, then I agree, this is quibbling over crumbs,
> > so not worth modifying.
> 
> Oh and the other thing that comes to mind is platform power.  Even if
> RC6 vs no RC6 at low freq is minimally different, entering RC6 allows
> the platform to go into a deep sleep state if the CPUs are also idle.
> So race to idle as efficiently as possible is probably still the right
> answer.

Right, I'm basing my judgement purely on the energy msr; I have no idea
how that actually relates to instantaneous power consumption from the
battery/mains. I only hope the correlation is good (because that msr is
so convenient to measure and record in realtime).
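
For reference, one way to sample that gauge from userspace; a minimal
sketch which assumes root, the msr module loaded, and the SNB/IVB RAPL
layout documented in the SDM (MSR_RAPL_POWER_UNIT at 0x606, graphics
domain MSR_PP1_ENERGY_STATUS at 0x641), with error handling omitted:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t rdmsr(int fd, off_t reg)
{
	uint64_t val = 0;

	pread(fd, &val, sizeof(val), reg);
	return val;
}

int main(void)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	/* bits 12:8 of MSR_RAPL_POWER_UNIT give the energy unit as 1/2^N J */
	double unit = 1.0 / (double)(1ULL << ((rdmsr(fd, 0x606) >> 8) & 0x1f));
	uint32_t before = rdmsr(fd, 0x641);	/* 32bit counter, wraps */

	sleep(1);

	uint32_t after = rdmsr(fd, 0x641);
	printf("gfx energy: %.3f J over ~1s\n", (after - before) * unit);
	close(fd);

	return 0;
}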

I've been deferring to Mengmeng for system power measurements - it's
about time I actually started measuring those for myself.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock
  2013-10-01 22:04   ` Jesse Barnes
@ 2013-10-03  7:41     ` Daniel Vetter
  0 siblings, 0 replies; 15+ messages in thread
From: Daniel Vetter @ 2013-10-03  7:41 UTC (permalink / raw)
  To: Jesse Barnes
  Cc: Owen Taylor, intel-gfx, Zhuang, Lena, Stéphane Marchesin

On Tue, Oct 01, 2013 at 03:04:01PM -0700, Jesse Barnes wrote:
> On Wed, 25 Sep 2013 17:34:57 +0100
> Chris Wilson <chris@chris-wilson.co.uk> wrote:
> 
> > After applying wait-boost we often find ourselves stuck at higher clocks
> > than required. The current threshold value requires the GPU to be
> > continuously and completely idle for 313ms before it is dropped by one
> > bin. Conversely, we require the GPU to be busy for an average of 90% over
> > an 84ms period before we upclock. So the current thresholds almost never
> > downclock the GPU, and respond very slowly to sudden demands for more
> > power. It is easy to observe that we currently lock into the wrong bin
> > and both underperform in benchmarks and consume more power than optimal
> > (just by repeating the task and measuring the different results).
> > 
> > An alternative approach, as discussed in the bspec, is to use a
> > continuous threshold for upclocking, and an average value for downclocking.
> > This is good for quickly detecting and reacting to state changes within a
> > frame; however, it fails with the common throttling method of waiting
> > upon the outstanding frame - at least it is difficult to choose a
> > threshold that works well at 15,000fps and at 60fps. So continue to use
> > average busy/idle loads to determine frequency change.
> > 
> > v2: Use 3 power zones to keep frequencies low in steady-state mostly
> > idle (e.g. scrolling, interactive 2D drawing), and frequencies high
> > for demanding games. In between those end-states, we use a
> > fast-reclocking algorithm to converge more quickly on the desired bin.
> > 
> > v3: Bug fixes - make sure we reset adj after switching power zones.
> > 
> > v4: Tune - drop the continuous busy thresholds as it prevents us from
> > choosing the right frequency for glxgears style swap benchmarks. Instead
> > the goal is to be able to find the right clocks irrespective of the
> > wait-boost.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > Cc: Stéphane Marchesin <stephane.marchesin@gmail.com>
> > Cc: Owen Taylor <otaylor@redhat.com>
> > Cc: "Meng, Mengmeng" <mengmeng.meng@intel.com>
> > Cc: "Zhuang, Lena" <lena.zhuang@intel.com>
> > ---
> 
> It's a little scary to mess with these, but we've gotten some good
> numbers so far, so I guess it's ok.
> 
> As a follow up, it might be nice to expose the power, balanced,
> performance profiles to userspace via sysfs.  Since we can't solve this
> problem for all users and all needs, we can just punt it out to
> userspace. :)
> 
> Reviewed-by: Jesse Barnes <jbarnes@virtuousgeek.org>

All three merged, thanks for patches & review. I've frobbed the first two
with tiny style bikesheds (sketched below):
- Dropped the typedef usage for the plain struct drm_i915_private in new
  code.
- Dropped the extern qualifier for the function prototypes in header
  files.
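
i.e. changes of this shape (illustrative lines only, not the actual hunks):

-	drm_i915_private_t *dev_priv = dev->dev_private;
+	struct drm_i915_private *dev_priv = dev->dev_private;

-extern void gen6_rps_idle(struct drm_i915_private *dev_priv);
+void gen6_rps_idle(struct drm_i915_private *dev_priv);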

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

end of thread, other threads:[~2013-10-03  7:41 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-25 16:34 [PATCH 1/3] drm/i915: Fix __wait_seqno to use true infinite timeouts Chris Wilson
2013-09-25 16:34 ` [PATCH 2/3] drm/i915: Boost RPS frequency for CPU stalls Chris Wilson
2013-10-01 21:54   ` Jesse Barnes
2013-10-01 21:56     ` Jesse Barnes
2013-10-01 22:23     ` Chris Wilson
2013-10-01 22:39       ` Jesse Barnes
2013-10-02  0:33         ` Chris Wilson
2013-10-02 18:26           ` Jesse Barnes
2013-10-02 21:57             ` Chris Wilson
2013-10-02 22:18               ` Jesse Barnes
2013-10-02 22:20                 ` Jesse Barnes
2013-10-02 23:19                   ` Chris Wilson
2013-09-25 16:34 ` [PATCH 3/3] drm/i915: Tweak RPS thresholds to more aggressively downclock Chris Wilson
2013-10-01 22:04   ` Jesse Barnes
2013-10-03  7:41     ` Daniel Vetter
