[PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
@ 2015-11-15 13:32 ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-15 13:32 UTC (permalink / raw)
  To: Jens Axboe, intel-gfx, linux-kernel
  Cc: dri-devel, Chris Wilson, Daniel Vetter, Tvrtko Ursulin,
	Eero Tamminen, Rantala, Valtteri, stable

The busywait in __i915_spin_request() does not respect pending signals
and so may consume the entire timeslice for the task instead of
returning to userspace to handle the signal.

Fixes regression from
commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Tue Apr 7 16:20:41 2015 +0100

     drm/i915: Optimistically spin for the request completion

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
Cc: stable@kernel.vger.org
---
 drivers/gpu/drm/i915/i915_gem.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5cf4a1998273..740530c571d1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1146,7 +1146,7 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
 	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
 }
 
-static int __i915_spin_request(struct drm_i915_gem_request *req)
+static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 {
 	unsigned long timeout;
 
@@ -1158,6 +1158,9 @@ static int __i915_spin_request(struct drm_i915_gem_request *req)
 		if (i915_gem_request_completed(req, true))
 			return 0;
 
+		if (signal_pending_state(state, current))
+			break;
+
 		if (time_after_eq(jiffies, timeout))
 			break;
 
@@ -1197,6 +1200,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	const bool irq_test_in_progress =
 		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
+	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
 	DEFINE_WAIT(wait);
 	unsigned long timeout_expire;
 	s64 before, now;
@@ -1221,7 +1225,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	before = ktime_get_raw_ns();
 
 	/* Optimistic spin for the next jiffie before touching IRQs */
-	ret = __i915_spin_request(req);
+	ret = __i915_spin_request(req, state);
 	if (ret == 0)
 		goto out;
 
@@ -1233,8 +1237,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	for (;;) {
 		struct timer_list timer;
 
-		prepare_to_wait(&ring->irq_queue, &wait,
-				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&ring->irq_queue, &wait, state);
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
@@ -1252,7 +1255,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 			break;
 		}
 
-		if (interruptible && signal_pending(current)) {
+		if (signal_pending_state(state, current)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
@ 2015-11-15 13:32 ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-15 13:32 UTC (permalink / raw)
  To: Jens Axboe, intel-gfx, linux-kernel
  Cc: Tvrtko Ursulin, Daniel Vetter, Eero Tamminen, dri-devel, stable,
	Rantala, Valtteri

The busywait in __i915_spin_request() does not respect pending signals
and so may consume the entire timeslice for the task instead of
returning to userspace to handle the signal.

Fixes regression from
commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Tue Apr 7 16:20:41 2015 +0100

     drm/i915: Optimistically spin for the request completion

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
Cc: stable@kernel.vger.org
---
 drivers/gpu/drm/i915/i915_gem.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5cf4a1998273..740530c571d1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1146,7 +1146,7 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
 	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
 }
 
-static int __i915_spin_request(struct drm_i915_gem_request *req)
+static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 {
 	unsigned long timeout;
 
@@ -1158,6 +1158,9 @@ static int __i915_spin_request(struct drm_i915_gem_request *req)
 		if (i915_gem_request_completed(req, true))
 			return 0;
 
+		if (signal_pending_state(state, current))
+			break;
+
 		if (time_after_eq(jiffies, timeout))
 			break;
 
@@ -1197,6 +1200,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	const bool irq_test_in_progress =
 		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
+	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
 	DEFINE_WAIT(wait);
 	unsigned long timeout_expire;
 	s64 before, now;
@@ -1221,7 +1225,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	before = ktime_get_raw_ns();
 
 	/* Optimistic spin for the next jiffie before touching IRQs */
-	ret = __i915_spin_request(req);
+	ret = __i915_spin_request(req, state);
 	if (ret == 0)
 		goto out;
 
@@ -1233,8 +1237,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	for (;;) {
 		struct timer_list timer;
 
-		prepare_to_wait(&ring->irq_queue, &wait,
-				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
+		prepare_to_wait(&ring->irq_queue, &wait, state);
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
@@ -1252,7 +1255,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 			break;
 		}
 
-		if (interruptible && signal_pending(current)) {
+		if (signal_pending_state(state, current)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
-- 
2.6.2

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-15 13:32 ` Chris Wilson
@ 2015-11-15 13:32   ` Chris Wilson
  -1 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-15 13:32 UTC (permalink / raw)
  To: Jens Axboe, intel-gfx, linux-kernel
  Cc: dri-devel, Chris Wilson, Daniel Vetter, Tvrtko Ursulin,
	Eero Tamminen, Rantala, Valtteri, stable

When waiting for high frequency requests, the finite amount of time
required to set up the irq and wait upon it limits the response rate. By
busywaiting on the request completion for a short while we can service
the high frequency waits as quick as possible. However, if it is a slow
request, we want to sleep as quickly as possible. The tradeoff between
waiting and sleeping is roughly the time it takes to sleep on a request,
on the order of a microsecond. Based on measurements from big core, I
have set the limit for busywaiting as 2 microseconds.

The code currently uses the jiffie clock, but that is far too coarse (on
the order of 10 milliseconds) and results in poor interactivity as the
CPU ends up being hogged by slow requests. To get microsecond resolution
we need to use a high resolution timer. The cheapest of which is polling
local_clock(), but that is only valid on the same CPU. If we switch CPUs
because the task was preempted, we can also use that as an indicator that
 the system is too busy to waste cycles on spinning and we should sleep
instead.

__i915_spin_request was introduced in
commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Tue Apr 7 16:20:41 2015 +0100

     drm/i915: Optimistically spin for the request completion

Reported-by: Jens Axboe <axboe@kernel.dk>
Link: https://lkml.org/lkml/2015/11/12/621
Cc: Jens Axboe <axboe@kernel.dk>
Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
Cc: stable@kernel.vger.org
---
 drivers/gpu/drm/i915/i915_gem.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 740530c571d1..2a88158bd1f7 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1146,14 +1146,36 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
 	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
 }
 
+static u64 local_clock_us(unsigned *cpu)
+{
+	u64 t;
+
+	*cpu = get_cpu();
+	t = local_clock() >> 10;
+	put_cpu();
+
+	return t;
+}
+
+static bool busywait_stop(u64 timeout, unsigned cpu)
+{
+	unsigned this_cpu;
+
+	if (time_after64(local_clock_us(&this_cpu), timeout))
+		return true;
+
+	return this_cpu != cpu;
+}
+
 static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 {
-	unsigned long timeout;
+	u64 timeout;
+	unsigned cpu;
 
 	if (i915_gem_request_get_ring(req)->irq_refcount)
 		return -EBUSY;
 
-	timeout = jiffies + 1;
+	timeout = local_clock_us(&cpu) + 2;
 	while (!need_resched()) {
 		if (i915_gem_request_completed(req, true))
 			return 0;
@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 		if (signal_pending_state(state, current))
 			break;
 
-		if (time_after_eq(jiffies, timeout))
+		if (busywait_stop(timeout, cpu))
 			break;
 
 		cpu_relax_lowlatency();
-- 
2.6.2


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-15 13:32   ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-15 13:32 UTC (permalink / raw)
  To: Jens Axboe, intel-gfx, linux-kernel
  Cc: Daniel Vetter, Eero Tamminen, dri-devel, stable, Rantala, Valtteri

When waiting for high frequency requests, the finite amount of time
required to set up the irq and wait upon it limits the response rate. By
busywaiting on the request completion for a short while we can service
the high frequency waits as quick as possible. However, if it is a slow
request, we want to sleep as quickly as possible. The tradeoff between
waiting and sleeping is roughly the time it takes to sleep on a request,
on the order of a microsecond. Based on measurements from big core, I
have set the limit for busywaiting as 2 microseconds.

The code currently uses the jiffie clock, but that is far too coarse (on
the order of 10 milliseconds) and results in poor interactivity as the
CPU ends up being hogged by slow requests. To get microsecond resolution
we need to use a high resolution timer. The cheapest of which is polling
local_clock(), but that is only valid on the same CPU. If we switch CPUs
because the task was preempted, we can also use that as an indicator that
 the system is too busy to waste cycles on spinning and we should sleep
instead.

__i915_spin_request was introduced in
commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
Author: Chris Wilson <chris@chris-wilson.co.uk>
Date:   Tue Apr 7 16:20:41 2015 +0100

     drm/i915: Optimistically spin for the request completion

Reported-by: Jens Axboe <axboe@kernel.dk>
Link: https://lkml.org/lkml/2015/11/12/621
Cc: Jens Axboe <axboe@kernel.dk>
Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
Cc: stable@kernel.vger.org
---
 drivers/gpu/drm/i915/i915_gem.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 740530c571d1..2a88158bd1f7 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1146,14 +1146,36 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
 	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
 }

+static u64 local_clock_us(unsigned *cpu)
+{
+	u64 t;
+
+	*cpu = get_cpu();
+	t = local_clock() >> 10;
+	put_cpu();
+
+	return t;
+}
+
+static bool busywait_stop(u64 timeout, unsigned cpu)
+{
+	unsigned this_cpu;
+
+	if (time_after64(local_clock_us(&this_cpu), timeout))
+		return true;
+
+	return this_cpu != cpu;
+}
+
 static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 {
-	unsigned long timeout;
+	u64 timeout;
+	unsigned cpu;

 	if (i915_gem_request_get_ring(req)->irq_refcount)
 		return -EBUSY;

-	timeout = jiffies + 1;
+	timeout = local_clock_us(&cpu) + 2;
 	while (!need_resched()) {
 		if (i915_gem_request_completed(req, true))
 			return 0;
@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 		if (signal_pending_state(state, current))
 			break;

-		if (time_after_eq(jiffies, timeout))
+		if (busywait_stop(timeout, cpu))
 			break;

 		cpu_relax_lowlatency();
-- 
2.6.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-15 13:32   ` Chris Wilson
@ 2015-11-15 17:48     ` Chris Wilson
  -1 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-15 17:48 UTC (permalink / raw)
  To: Jens Axboe, intel-gfx, linux-kernel
  Cc: dri-devel, Daniel Vetter, Tvrtko Ursulin, Eero Tamminen, Rantala,
	Valtteri, stable

On Sun, Nov 15, 2015 at 01:32:44PM +0000, Chris Wilson wrote:
> +static bool busywait_stop(u64 timeout, unsigned cpu)
> +{
> +	unsigned this_cpu;
> +
> +	if (time_after64(local_clock_us(&this_cpu), timeout))
> +		return true;

Just a note to say that we can use the unsigned long version rather than
pass around u64 as this test will wraparound correctly (if we discard
the high bits on x86-32).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-15 17:48     ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-15 17:48 UTC (permalink / raw)
  To: Jens Axboe, intel-gfx, linux-kernel
  Cc: Daniel Vetter, Eero Tamminen, dri-devel, stable, Rantala, Valtteri

On Sun, Nov 15, 2015 at 01:32:44PM +0000, Chris Wilson wrote:
> +static bool busywait_stop(u64 timeout, unsigned cpu)
> +{
> +	unsigned this_cpu;
> +
> +	if (time_after64(local_clock_us(&this_cpu), timeout))
> +		return true;

Just a note to say that we can use the unsigned long version rather than
pass around u64 as this test will wraparound correctly (if we discard
the high bits on x86-32).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-15 13:32   ` Chris Wilson
@ 2015-11-16 10:24     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 10:24 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel
  Cc: dri-devel, Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


Hi,

On 15/11/15 13:32, Chris Wilson wrote:
> When waiting for high frequency requests, the finite amount of time
> required to set up the irq and wait upon it limits the response rate. By
> busywaiting on the request completion for a short while we can service
> the high frequency waits as quick as possible. However, if it is a slow
> request, we want to sleep as quickly as possible. The tradeoff between
> waiting and sleeping is roughly the time it takes to sleep on a request,
> on the order of a microsecond. Based on measurements from big core, I
> have set the limit for busywaiting as 2 microseconds.

Sounds like solid reasoning. Would it also be worth finding the trade 
off limit for small core?

> The code currently uses the jiffie clock, but that is far too coarse (on
> the order of 10 milliseconds) and results in poor interactivity as the
> CPU ends up being hogged by slow requests. To get microsecond resolution
> we need to use a high resolution timer. The cheapest of which is polling
> local_clock(), but that is only valid on the same CPU. If we switch CPUs
> because the task was preempted, we can also use that as an indicator that
>   the system is too busy to waste cycles on spinning and we should sleep
> instead.

Hm, need_resched would not cover the CPU switch anyway? Or maybe 
need_resched means something other than I thought which is "there are 
other runnable tasks"?

This would also have impact on the patch subject line.I thought we would 
burn a jiffie of CPU cycles only if there are no other runnable tasks - 
so how come an impact on interactivity?

Also again I think the commit message needs some data on how this was 
found and what is the impact.

Btw as it happens, just last week as I was playing with perf, I did 
notice busy spinning is the top cycle waster in some benchmarks. I was 
in the process of trying to quantize the difference with it on or off 
but did not complete it.

> __i915_spin_request was introduced in
> commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
> Author: Chris Wilson <chris@chris-wilson.co.uk>
> Date:   Tue Apr 7 16:20:41 2015 +0100
>
>       drm/i915: Optimistically spin for the request completion
>
> Reported-by: Jens Axboe <axboe@kernel.dk>
> Link: https://lkml.org/lkml/2015/11/12/621
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 28 +++++++++++++++++++++++++---
>   1 file changed, 25 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 740530c571d1..2a88158bd1f7 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1146,14 +1146,36 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>   	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>   }
>
> +static u64 local_clock_us(unsigned *cpu)
> +{
> +	u64 t;
> +
> +	*cpu = get_cpu();
> +	t = local_clock() >> 10;

Needs comment I think to explicitly mention the approximation, or maybe 
drop the _us suffix?

> +	put_cpu();
> +
> +	return t;
> +}
> +
> +static bool busywait_stop(u64 timeout, unsigned cpu)
> +{
> +	unsigned this_cpu;
> +
> +	if (time_after64(local_clock_us(&this_cpu), timeout))
> +		return true;
> +
> +	return this_cpu != cpu;
> +}
> +
>   static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   {
> -	unsigned long timeout;
> +	u64 timeout;
> +	unsigned cpu;
>
>   	if (i915_gem_request_get_ring(req)->irq_refcount)
>   		return -EBUSY;
>
> -	timeout = jiffies + 1;
> +	timeout = local_clock_us(&cpu) + 2;
>   	while (!need_resched()) {
>   		if (i915_gem_request_completed(req, true))
>   			return 0;
> @@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   		if (signal_pending_state(state, current))
>   			break;
>
> -		if (time_after_eq(jiffies, timeout))
> +		if (busywait_stop(timeout, cpu))
>   			break;
>
>   		cpu_relax_lowlatency();
>

Otherwise looks good. Not sure what would you convert to 32-bit from 
your follow up reply since you need us resolution?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-16 10:24     ` Tvrtko Ursulin
  0 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 10:24 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel
  Cc: Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable, dri-devel


Hi,

On 15/11/15 13:32, Chris Wilson wrote:
> When waiting for high frequency requests, the finite amount of time
> required to set up the irq and wait upon it limits the response rate. By
> busywaiting on the request completion for a short while we can service
> the high frequency waits as quick as possible. However, if it is a slow
> request, we want to sleep as quickly as possible. The tradeoff between
> waiting and sleeping is roughly the time it takes to sleep on a request,
> on the order of a microsecond. Based on measurements from big core, I
> have set the limit for busywaiting as 2 microseconds.

Sounds like solid reasoning. Would it also be worth finding the trade 
off limit for small core?

> The code currently uses the jiffie clock, but that is far too coarse (on
> the order of 10 milliseconds) and results in poor interactivity as the
> CPU ends up being hogged by slow requests. To get microsecond resolution
> we need to use a high resolution timer. The cheapest of which is polling
> local_clock(), but that is only valid on the same CPU. If we switch CPUs
> because the task was preempted, we can also use that as an indicator that
>   the system is too busy to waste cycles on spinning and we should sleep
> instead.

Hm, need_resched would not cover the CPU switch anyway? Or maybe 
need_resched means something other than I thought which is "there are 
other runnable tasks"?

This would also have impact on the patch subject line.I thought we would 
burn a jiffie of CPU cycles only if there are no other runnable tasks - 
so how come an impact on interactivity?

Also again I think the commit message needs some data on how this was 
found and what is the impact.

Btw as it happens, just last week as I was playing with perf, I did 
notice busy spinning is the top cycle waster in some benchmarks. I was 
in the process of trying to quantize the difference with it on or off 
but did not complete it.

> __i915_spin_request was introduced in
> commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
> Author: Chris Wilson <chris@chris-wilson.co.uk>
> Date:   Tue Apr 7 16:20:41 2015 +0100
>
>       drm/i915: Optimistically spin for the request completion
>
> Reported-by: Jens Axboe <axboe@kernel.dk>
> Link: https://lkml.org/lkml/2015/11/12/621
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 28 +++++++++++++++++++++++++---
>   1 file changed, 25 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 740530c571d1..2a88158bd1f7 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1146,14 +1146,36 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>   	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>   }
>
> +static u64 local_clock_us(unsigned *cpu)
> +{
> +	u64 t;
> +
> +	*cpu = get_cpu();
> +	t = local_clock() >> 10;

Needs comment I think to explicitly mention the approximation, or maybe 
drop the _us suffix?

> +	put_cpu();
> +
> +	return t;
> +}
> +
> +static bool busywait_stop(u64 timeout, unsigned cpu)
> +{
> +	unsigned this_cpu;
> +
> +	if (time_after64(local_clock_us(&this_cpu), timeout))
> +		return true;
> +
> +	return this_cpu != cpu;
> +}
> +
>   static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   {
> -	unsigned long timeout;
> +	u64 timeout;
> +	unsigned cpu;
>
>   	if (i915_gem_request_get_ring(req)->irq_refcount)
>   		return -EBUSY;
>
> -	timeout = jiffies + 1;
> +	timeout = local_clock_us(&cpu) + 2;
>   	while (!need_resched()) {
>   		if (i915_gem_request_completed(req, true))
>   			return 0;
> @@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   		if (signal_pending_state(state, current))
>   			break;
>
> -		if (time_after_eq(jiffies, timeout))
> +		if (busywait_stop(timeout, cpu))
>   			break;
>
>   		cpu_relax_lowlatency();
>

Otherwise looks good. Not sure what would you convert to 32-bit from 
your follow up reply since you need us resolution?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-16 10:24     ` Tvrtko Ursulin
@ 2015-11-16 11:12       ` Chris Wilson
  -1 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-16 11:12 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Jens Axboe, intel-gfx, linux-kernel, dri-devel, Daniel Vetter,
	Eero Tamminen, Rantala, Valtteri, stable

On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 15/11/15 13:32, Chris Wilson wrote:
> >When waiting for high frequency requests, the finite amount of time
> >required to set up the irq and wait upon it limits the response rate. By
> >busywaiting on the request completion for a short while we can service
> >the high frequency waits as quick as possible. However, if it is a slow
> >request, we want to sleep as quickly as possible. The tradeoff between
> >waiting and sleeping is roughly the time it takes to sleep on a request,
> >on the order of a microsecond. Based on measurements from big core, I
> >have set the limit for busywaiting as 2 microseconds.
> 
> Sounds like solid reasoning. Would it also be worth finding the
> trade off limit for small core?

Takes a bit longer, but 2us seems "ok" on PineView (as in it doesn't
lose the boost from spinning rather than sleeping). Have some more
testing to do on vlv/byt.
 
> >The code currently uses the jiffie clock, but that is far too coarse (on
> >the order of 10 milliseconds) and results in poor interactivity as the
> >CPU ends up being hogged by slow requests. To get microsecond resolution
> >we need to use a high resolution timer. The cheapest of which is polling
> >local_clock(), but that is only valid on the same CPU. If we switch CPUs
> >because the task was preempted, we can also use that as an indicator that
> >  the system is too busy to waste cycles on spinning and we should sleep
> >instead.
> 
> Hm, need_resched would not cover the CPU switch anyway? Or maybe
> need_resched means something other than I thought which is "there
> are other runnable tasks"?

As I understand it, it means that the scheduler tick fired (or something
else yielded). I haven't spotted if it gets set as the runqueue changes.
As it pertains to us, it means that we need to get to schedule() as
quick as possible which along this path means going to sleep.

I'm not sure if need_resched() would catch the cpu switch, if we were
preempted as the flag would be set on the idle process not us.
 
> This would also have impact on the patch subject line.I thought we
> would burn a jiffie of CPU cycles only if there are no other
> runnable tasks - so how come an impact on interactivity?

I have a couple of ideas for the effect on interactivty:

1. Burning through the time slice is acting as a penalty against running
that process (typically the compositor) in the near future, perhaps
enough to miss some deadlines.

2. Processor power balancing.

> Also again I think the commit message needs some data on how this
> was found and what is the impact.

The system felt unresponsive. It would be interesting for me to know a
few more details about the tick on that system (HZ, tickless?,
preemption?) to see if changing the config on my xps13 also produces the
lag/jitter/poor interactivty.
 
> Btw as it happens, just last week as I was playing with perf, I did
> notice busy spinning is the top cycle waster in some benchmarks. I
> was in the process of trying to quantize the difference with it on
> or off but did not complete it.

I found that spin-request appearing in profiles makes tracking down the
culprit higer in the stack much easier. Otherwise you have to remember to
enable a pass with the tracepoint to find the stalls (or use
intel-gpu-overlay which does it for you).
 
> >+static u64 local_clock_us(unsigned *cpu)
> >+{
> >+	u64 t;
> >+
> >+	*cpu = get_cpu();
> >+	t = local_clock() >> 10;
> 
> Needs comment I think to explicitly mention the approximation, or
> maybe drop the _us suffix?

I did consider _approx_us but thought that was overkill. A comment along
the lines of
/* Approximately convert ns to us - the error is less than the
 * truncation!
 */

> >@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> >  		if (signal_pending_state(state, current))
> >  			break;
> >
> >-		if (time_after_eq(jiffies, timeout))
> >+		if (busywait_stop(timeout, cpu))
> >  			break;
> >
> >  		cpu_relax_lowlatency();
> >
> 
> Otherwise looks good. Not sure what would you convert to 32-bit from
> your follow up reply since you need us resolution?

s/u64/unsigned long/ s/time_after64/time_after/

32bits of us resolution gives us 1000s before wraparound between the two
samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
the GPU managed to complete its task.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-16 11:12       ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-16 11:12 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Jens Axboe, Daniel Vetter, intel-gfx, linux-kernel, dri-devel,
	stable, Rantala, Valtteri, Eero Tamminen

On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 15/11/15 13:32, Chris Wilson wrote:
> >When waiting for high frequency requests, the finite amount of time
> >required to set up the irq and wait upon it limits the response rate. By
> >busywaiting on the request completion for a short while we can service
> >the high frequency waits as quick as possible. However, if it is a slow
> >request, we want to sleep as quickly as possible. The tradeoff between
> >waiting and sleeping is roughly the time it takes to sleep on a request,
> >on the order of a microsecond. Based on measurements from big core, I
> >have set the limit for busywaiting as 2 microseconds.
> 
> Sounds like solid reasoning. Would it also be worth finding the
> trade off limit for small core?

Takes a bit longer, but 2us seems "ok" on PineView (as in it doesn't
lose the boost from spinning rather than sleeping). Have some more
testing to do on vlv/byt.
 
> >The code currently uses the jiffie clock, but that is far too coarse (on
> >the order of 10 milliseconds) and results in poor interactivity as the
> >CPU ends up being hogged by slow requests. To get microsecond resolution
> >we need to use a high resolution timer. The cheapest of which is polling
> >local_clock(), but that is only valid on the same CPU. If we switch CPUs
> >because the task was preempted, we can also use that as an indicator that
> >  the system is too busy to waste cycles on spinning and we should sleep
> >instead.
> 
> Hm, need_resched would not cover the CPU switch anyway? Or maybe
> need_resched means something other than I thought which is "there
> are other runnable tasks"?

As I understand it, it means that the scheduler tick fired (or something
else yielded). I haven't spotted if it gets set as the runqueue changes.
As it pertains to us, it means that we need to get to schedule() as
quick as possible which along this path means going to sleep.

I'm not sure if need_resched() would catch the cpu switch, if we were
preempted as the flag would be set on the idle process not us.
 
> This would also have impact on the patch subject line.I thought we
> would burn a jiffie of CPU cycles only if there are no other
> runnable tasks - so how come an impact on interactivity?

I have a couple of ideas for the effect on interactivty:

1. Burning through the time slice is acting as a penalty against running
that process (typically the compositor) in the near future, perhaps
enough to miss some deadlines.

2. Processor power balancing.

> Also again I think the commit message needs some data on how this
> was found and what is the impact.

The system felt unresponsive. It would be interesting for me to know a
few more details about the tick on that system (HZ, tickless?,
preemption?) to see if changing the config on my xps13 also produces the
lag/jitter/poor interactivty.
 
> Btw as it happens, just last week as I was playing with perf, I did
> notice busy spinning is the top cycle waster in some benchmarks. I
> was in the process of trying to quantize the difference with it on
> or off but did not complete it.

I found that spin-request appearing in profiles makes tracking down the
culprit higer in the stack much easier. Otherwise you have to remember to
enable a pass with the tracepoint to find the stalls (or use
intel-gpu-overlay which does it for you).
 
> >+static u64 local_clock_us(unsigned *cpu)
> >+{
> >+	u64 t;
> >+
> >+	*cpu = get_cpu();
> >+	t = local_clock() >> 10;
> 
> Needs comment I think to explicitly mention the approximation, or
> maybe drop the _us suffix?

I did consider _approx_us but thought that was overkill. A comment along
the lines of
/* Approximately convert ns to us - the error is less than the
 * truncation!
 */

> >@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> >  		if (signal_pending_state(state, current))
> >  			break;
> >
> >-		if (time_after_eq(jiffies, timeout))
> >+		if (busywait_stop(timeout, cpu))
> >  			break;
> >
> >  		cpu_relax_lowlatency();
> >
> 
> Otherwise looks good. Not sure what would you convert to 32-bit from
> your follow up reply since you need us resolution?

s/u64/unsigned long/ s/time_after64/time_after/

32bits of us resolution gives us 1000s before wraparound between the two
samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
the GPU managed to complete its task.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-16 11:12       ` Chris Wilson
@ 2015-11-16 12:08         ` Tvrtko Ursulin
  -1 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 12:08 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel, dri-devel,
	Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


On 16/11/15 11:12, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 15/11/15 13:32, Chris Wilson wrote:
>>> When waiting for high frequency requests, the finite amount of time
>>> required to set up the irq and wait upon it limits the response rate. By
>>> busywaiting on the request completion for a short while we can service
>>> the high frequency waits as quick as possible. However, if it is a slow
>>> request, we want to sleep as quickly as possible. The tradeoff between
>>> waiting and sleeping is roughly the time it takes to sleep on a request,
>>> on the order of a microsecond. Based on measurements from big core, I
>>> have set the limit for busywaiting as 2 microseconds.
>>
>> Sounds like solid reasoning. Would it also be worth finding the
>> trade off limit for small core?
>
> Takes a bit longer, but 2us seems "ok" on PineView (as in it doesn't
> lose the boost from spinning rather than sleeping). Have some more
> testing to do on vlv/byt.

Cool.

>>> The code currently uses the jiffie clock, but that is far too coarse (on
>>> the order of 10 milliseconds) and results in poor interactivity as the
>>> CPU ends up being hogged by slow requests. To get microsecond resolution
>>> we need to use a high resolution timer. The cheapest of which is polling
>>> local_clock(), but that is only valid on the same CPU. If we switch CPUs
>>> because the task was preempted, we can also use that as an indicator that
>>>   the system is too busy to waste cycles on spinning and we should sleep
>>> instead.
>>
>> Hm, need_resched would not cover the CPU switch anyway? Or maybe
>> need_resched means something other than I thought which is "there
>> are other runnable tasks"?
>
> As I understand it, it means that the scheduler tick fired (or something
> else yielded). I haven't spotted if it gets set as the runqueue changes.
> As it pertains to us, it means that we need to get to schedule() as
> quick as possible which along this path means going to sleep.
>
> I'm not sure if need_resched() would catch the cpu switch, if we were
> preempted as the flag would be set on the idle process not us.

Could be, I wasn't sure at all, just curious and trying to understand it 
fully. Cpu check is so cheap as implemented that it is not under any 
criticism.

>> This would also have impact on the patch subject line.I thought we
>> would burn a jiffie of CPU cycles only if there are no other
>> runnable tasks - so how come an impact on interactivity?
>
> I have a couple of ideas for the effect on interactivty:
>
> 1. Burning through the time slice is acting as a penalty against running
> that process (typically the compositor) in the near future, perhaps
> enough to miss some deadlines.
>
> 2. Processor power balancing.
>
>> Also again I think the commit message needs some data on how this
>> was found and what is the impact.
>
> The system felt unresponsive. It would be interesting for me to know a
> few more details about the tick on that system (HZ, tickless?,
> preemption?) to see if changing the config on my xps13 also produces the
> lag/jitter/poor interactivty.

Yes interesting but not critical I think. Since the new scheme looks as 
efficient as the old one so there should be no downside anyway.

>> Btw as it happens, just last week as I was playing with perf, I did
>> notice busy spinning is the top cycle waster in some benchmarks. I
>> was in the process of trying to quantize the difference with it on
>> or off but did not complete it.
>
> I found that spin-request appearing in profiles makes tracking down the
> culprit higer in the stack much easier. Otherwise you have to remember to
> enable a pass with the tracepoint to find the stalls (or use
> intel-gpu-overlay which does it for you).

I'll put it on my TODO list of things to play with.

>>> +static u64 local_clock_us(unsigned *cpu)
>>> +{
>>> +	u64 t;
>>> +
>>> +	*cpu = get_cpu();
>>> +	t = local_clock() >> 10;
>>
>> Needs comment I think to explicitly mention the approximation, or
>> maybe drop the _us suffix?
>
> I did consider _approx_us but thought that was overkill. A comment along
> the lines of
> /* Approximately convert ns to us - the error is less than the
>   * truncation!
>   */

And the result is not used in subsequent calculations apart from 
comparing against an approximate timeout?

>>> @@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>   		if (signal_pending_state(state, current))
>>>   			break;
>>>
>>> -		if (time_after_eq(jiffies, timeout))
>>> +		if (busywait_stop(timeout, cpu))
>>>   			break;
>>>
>>>   		cpu_relax_lowlatency();
>>>
>>
>> Otherwise looks good. Not sure what would you convert to 32-bit from
>> your follow up reply since you need us resolution?
>
> s/u64/unsigned long/ s/time_after64/time_after/
>
> 32bits of us resolution gives us 1000s before wraparound between the two
> samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
> the GPU managed to complete its task.

Now I see that you did say low bits.. yes that sounds fine.

Btw while you are optimizing things maybe pick up this micro 
optimization: http://patchwork.freedesktop.org/patch/64339/

Not in scope of this thread but under the normal development patch flow.

Btw2, any benchmark result changes with this?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-16 12:08         ` Tvrtko Ursulin
  0 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 12:08 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel, dri-devel,
	Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


On 16/11/15 11:12, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 15/11/15 13:32, Chris Wilson wrote:
>>> When waiting for high frequency requests, the finite amount of time
>>> required to set up the irq and wait upon it limits the response rate. By
>>> busywaiting on the request completion for a short while we can service
>>> the high frequency waits as quick as possible. However, if it is a slow
>>> request, we want to sleep as quickly as possible. The tradeoff between
>>> waiting and sleeping is roughly the time it takes to sleep on a request,
>>> on the order of a microsecond. Based on measurements from big core, I
>>> have set the limit for busywaiting as 2 microseconds.
>>
>> Sounds like solid reasoning. Would it also be worth finding the
>> trade off limit for small core?
>
> Takes a bit longer, but 2us seems "ok" on PineView (as in it doesn't
> lose the boost from spinning rather than sleeping). Have some more
> testing to do on vlv/byt.

Cool.

>>> The code currently uses the jiffie clock, but that is far too coarse (on
>>> the order of 10 milliseconds) and results in poor interactivity as the
>>> CPU ends up being hogged by slow requests. To get microsecond resolution
>>> we need to use a high resolution timer. The cheapest of which is polling
>>> local_clock(), but that is only valid on the same CPU. If we switch CPUs
>>> because the task was preempted, we can also use that as an indicator that
>>>   the system is too busy to waste cycles on spinning and we should sleep
>>> instead.
>>
>> Hm, need_resched would not cover the CPU switch anyway? Or maybe
>> need_resched means something other than I thought which is "there
>> are other runnable tasks"?
>
> As I understand it, it means that the scheduler tick fired (or something
> else yielded). I haven't spotted if it gets set as the runqueue changes.
> As it pertains to us, it means that we need to get to schedule() as
> quick as possible which along this path means going to sleep.
>
> I'm not sure if need_resched() would catch the cpu switch, if we were
> preempted as the flag would be set on the idle process not us.

Could be, I wasn't sure at all, just curious and trying to understand it 
fully. Cpu check is so cheap as implemented that it is not under any 
criticism.

>> This would also have impact on the patch subject line.I thought we
>> would burn a jiffie of CPU cycles only if there are no other
>> runnable tasks - so how come an impact on interactivity?
>
> I have a couple of ideas for the effect on interactivty:
>
> 1. Burning through the time slice is acting as a penalty against running
> that process (typically the compositor) in the near future, perhaps
> enough to miss some deadlines.
>
> 2. Processor power balancing.
>
>> Also again I think the commit message needs some data on how this
>> was found and what is the impact.
>
> The system felt unresponsive. It would be interesting for me to know a
> few more details about the tick on that system (HZ, tickless?,
> preemption?) to see if changing the config on my xps13 also produces the
> lag/jitter/poor interactivty.

Yes interesting but not critical I think. Since the new scheme looks as 
efficient as the old one so there should be no downside anyway.

>> Btw as it happens, just last week as I was playing with perf, I did
>> notice busy spinning is the top cycle waster in some benchmarks. I
>> was in the process of trying to quantize the difference with it on
>> or off but did not complete it.
>
> I found that spin-request appearing in profiles makes tracking down the
> culprit higer in the stack much easier. Otherwise you have to remember to
> enable a pass with the tracepoint to find the stalls (or use
> intel-gpu-overlay which does it for you).

I'll put it on my TODO list of things to play with.

>>> +static u64 local_clock_us(unsigned *cpu)
>>> +{
>>> +	u64 t;
>>> +
>>> +	*cpu = get_cpu();
>>> +	t = local_clock() >> 10;
>>
>> Needs comment I think to explicitly mention the approximation, or
>> maybe drop the _us suffix?
>
> I did consider _approx_us but thought that was overkill. A comment along
> the lines of
> /* Approximately convert ns to us - the error is less than the
>   * truncation!
>   */

And the result is not used in subsequent calculations apart from 
comparing against an approximate timeout?

>>> @@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>>>   		if (signal_pending_state(state, current))
>>>   			break;
>>>
>>> -		if (time_after_eq(jiffies, timeout))
>>> +		if (busywait_stop(timeout, cpu))
>>>   			break;
>>>
>>>   		cpu_relax_lowlatency();
>>>
>>
>> Otherwise looks good. Not sure what would you convert to 32-bit from
>> your follow up reply since you need us resolution?
>
> s/u64/unsigned long/ s/time_after64/time_after/
>
> 32bits of us resolution gives us 1000s before wraparound between the two
> samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
> the GPU managed to complete its task.

Now I see that you did say low bits.. yes that sounds fine.

Btw while you are optimizing things maybe pick up this micro 
optimization: http://patchwork.freedesktop.org/patch/64339/

Not in scope of this thread but under the normal development patch flow.

Btw2, any benchmark result changes with this?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-16 12:08         ` Tvrtko Ursulin
@ 2015-11-16 12:55           ` Chris Wilson
  -1 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-16 12:55 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Jens Axboe, intel-gfx, linux-kernel, dri-devel, Daniel Vetter,
	Eero Tamminen, Rantala, Valtteri, stable

On Mon, Nov 16, 2015 at 12:08:08PM +0000, Tvrtko Ursulin wrote:
> 
> On 16/11/15 11:12, Chris Wilson wrote:
> >On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
> >>On 15/11/15 13:32, Chris Wilson wrote:
> >>>+static u64 local_clock_us(unsigned *cpu)
> >>>+{
> >>>+	u64 t;
> >>>+
> >>>+	*cpu = get_cpu();
> >>>+	t = local_clock() >> 10;
> >>
> >>Needs comment I think to explicitly mention the approximation, or
> >>maybe drop the _us suffix?
> >
> >I did consider _approx_us but thought that was overkill. A comment along
> >the lines of
> >/* Approximately convert ns to us - the error is less than the
> >  * truncation!
> >  */
> 
> And the result is not used in subsequent calculations apart from
> comparing against an approximate timeout?

Exactly, the timeout is fairly arbitrary and defined in the same units.
That we truncate is a much bigger cause for concern in terms of spinning
accurately for a definite length of time.
 
> >>>@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> >>>  		if (signal_pending_state(state, current))
> >>>  			break;
> >>>
> >>>-		if (time_after_eq(jiffies, timeout))
> >>>+		if (busywait_stop(timeout, cpu))
> >>>  			break;
> >>>
> >>>  		cpu_relax_lowlatency();
> >>>
> >>
> >>Otherwise looks good. Not sure what would you convert to 32-bit from
> >>your follow up reply since you need us resolution?
> >
> >s/u64/unsigned long/ s/time_after64/time_after/
> >
> >32bits of us resolution gives us 1000s before wraparound between the two
> >samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
> >the GPU managed to complete its task.
> 
> Now I see that you did say low bits.. yes that sounds fine.
> 
> Btw while you are optimizing things maybe pick up this micro
> optimization: http://patchwork.freedesktop.org/patch/64339/
> 
> Not in scope of this thread but under the normal development patch flow.

There's a different series which looks at tackling the scalabiltiy issue
with dozens of concurrent waiters. I have an equivalent patch there and
one to tidy up the seqno query.
 
> Btw2, any benchmark result changes with this?

Spinning still gives the dramatic (2x) improvement in the microbenchmarks
(over pure interrupt driven waits), so that improvement is preserved.
There are a couple of interesting swings in the macro tests (comparedt to
the previous jiffie patch) just above the noise level which could well be
a change in the throttling/scheduling. (And those tests are also the
ones that correspond to the greatest gains (10-40%) using spinning.)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-16 12:55           ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-16 12:55 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Jens Axboe, Daniel Vetter, intel-gfx, linux-kernel, dri-devel,
	stable, Rantala, Valtteri, Eero Tamminen

On Mon, Nov 16, 2015 at 12:08:08PM +0000, Tvrtko Ursulin wrote:
> 
> On 16/11/15 11:12, Chris Wilson wrote:
> >On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
> >>On 15/11/15 13:32, Chris Wilson wrote:
> >>>+static u64 local_clock_us(unsigned *cpu)
> >>>+{
> >>>+	u64 t;
> >>>+
> >>>+	*cpu = get_cpu();
> >>>+	t = local_clock() >> 10;
> >>
> >>Needs comment I think to explicitly mention the approximation, or
> >>maybe drop the _us suffix?
> >
> >I did consider _approx_us but thought that was overkill. A comment along
> >the lines of
> >/* Approximately convert ns to us - the error is less than the
> >  * truncation!
> >  */
> 
> And the result is not used in subsequent calculations apart from
> comparing against an approximate timeout?

Exactly, the timeout is fairly arbitrary and defined in the same units.
That we truncate is a much bigger cause for concern in terms of spinning
accurately for a definite length of time.
 
> >>>@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> >>>  		if (signal_pending_state(state, current))
> >>>  			break;
> >>>
> >>>-		if (time_after_eq(jiffies, timeout))
> >>>+		if (busywait_stop(timeout, cpu))
> >>>  			break;
> >>>
> >>>  		cpu_relax_lowlatency();
> >>>
> >>
> >>Otherwise looks good. Not sure what would you convert to 32-bit from
> >>your follow up reply since you need us resolution?
> >
> >s/u64/unsigned long/ s/time_after64/time_after/
> >
> >32bits of us resolution gives us 1000s before wraparound between the two
> >samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
> >the GPU managed to complete its task.
> 
> Now I see that you did say low bits.. yes that sounds fine.
> 
> Btw while you are optimizing things maybe pick up this micro
> optimization: http://patchwork.freedesktop.org/patch/64339/
> 
> Not in scope of this thread but under the normal development patch flow.

There's a different series which looks at tackling the scalabiltiy issue
with dozens of concurrent waiters. I have an equivalent patch there and
one to tidy up the seqno query.
 
> Btw2, any benchmark result changes with this?

Spinning still gives the dramatic (2x) improvement in the microbenchmarks
(over pure interrupt driven waits), so that improvement is preserved.
There are a couple of interesting swings in the macro tests (comparedt to
the previous jiffie patch) just above the noise level which could well be
a change in the throttling/scheduling. (And those tests are also the
ones that correspond to the greatest gains (10-40%) using spinning.)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-16 12:55           ` Chris Wilson
@ 2015-11-16 13:09             ` Tvrtko Ursulin
  -1 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 13:09 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel, dri-devel,
	Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


On 16/11/15 12:55, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 12:08:08PM +0000, Tvrtko Ursulin wrote:
>>
>> On 16/11/15 11:12, Chris Wilson wrote:
>>> On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
>>>> On 15/11/15 13:32, Chris Wilson wrote:
>>>>> +static u64 local_clock_us(unsigned *cpu)
>>>>> +{
>>>>> +	u64 t;
>>>>> +
>>>>> +	*cpu = get_cpu();
>>>>> +	t = local_clock() >> 10;
>>>>
>>>> Needs comment I think to explicitly mention the approximation, or
>>>> maybe drop the _us suffix?
>>>
>>> I did consider _approx_us but thought that was overkill. A comment along
>>> the lines of
>>> /* Approximately convert ns to us - the error is less than the
>>>   * truncation!
>>>   */
>>
>> And the result is not used in subsequent calculations apart from
>> comparing against an approximate timeout?
>
> Exactly, the timeout is fairly arbitrary and defined in the same units.
> That we truncate is a much bigger cause for concern in terms of spinning
> accurately for a definite length of time.

Bah sorry that was not supposed to be a question but a suggestion to add 
to the comment. Must had mistyped the question mark. :)

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-16 13:09             ` Tvrtko Ursulin
  0 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 13:09 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel, dri-devel,
	Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


On 16/11/15 12:55, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 12:08:08PM +0000, Tvrtko Ursulin wrote:
>>
>> On 16/11/15 11:12, Chris Wilson wrote:
>>> On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
>>>> On 15/11/15 13:32, Chris Wilson wrote:
>>>>> +static u64 local_clock_us(unsigned *cpu)
>>>>> +{
>>>>> +	u64 t;
>>>>> +
>>>>> +	*cpu = get_cpu();
>>>>> +	t = local_clock() >> 10;
>>>>
>>>> Needs comment I think to explicitly mention the approximation, or
>>>> maybe drop the _us suffix?
>>>
>>> I did consider _approx_us but thought that was overkill. A comment along
>>> the lines of
>>> /* Approximately convert ns to us - the error is less than the
>>>   * truncation!
>>>   */
>>
>> And the result is not used in subsequent calculations apart from
>> comparing against an approximate timeout?
>
> Exactly, the timeout is fairly arbitrary and defined in the same units.
> That we truncate is a much bigger cause for concern in terms of spinning
> accurately for a definite length of time.

Bah sorry that was not supposed to be a question but a suggestion to add 
to the comment. Must had mistyped the question mark. :)

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-16 12:55           ` Chris Wilson
@ 2015-11-16 13:30             ` Ville Syrjälä
  -1 siblings, 0 replies; 35+ messages in thread
From: Ville Syrjälä @ 2015-11-16 13:30 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Jens Axboe, intel-gfx,
	linux-kernel, dri-devel, Daniel Vetter, Eero Tamminen, Rantala,
	Valtteri, stable

On Mon, Nov 16, 2015 at 12:55:37PM +0000, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 12:08:08PM +0000, Tvrtko Ursulin wrote:
> > 
> > On 16/11/15 11:12, Chris Wilson wrote:
> > >On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
> > >>On 15/11/15 13:32, Chris Wilson wrote:
> > >>>+static u64 local_clock_us(unsigned *cpu)
> > >>>+{
> > >>>+	u64 t;
> > >>>+
> > >>>+	*cpu = get_cpu();
> > >>>+	t = local_clock() >> 10;
> > >>
> > >>Needs comment I think to explicitly mention the approximation, or
> > >>maybe drop the _us suffix?
> > >
> > >I did consider _approx_us but thought that was overkill. A comment along
> > >the lines of
> > >/* Approximately convert ns to us - the error is less than the
> > >  * truncation!
> > >  */
> > 
> > And the result is not used in subsequent calculations apart from
> > comparing against an approximate timeout?
> 
> Exactly, the timeout is fairly arbitrary and defined in the same units.
> That we truncate is a much bigger cause for concern in terms of spinning
> accurately for a definite length of time.
>  
> > >>>@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> > >>>  		if (signal_pending_state(state, current))
> > >>>  			break;
> > >>>
> > >>>-		if (time_after_eq(jiffies, timeout))
> > >>>+		if (busywait_stop(timeout, cpu))
> > >>>  			break;
> > >>>
> > >>>  		cpu_relax_lowlatency();
> > >>>
> > >>
> > >>Otherwise looks good. Not sure what would you convert to 32-bit from
> > >>your follow up reply since you need us resolution?
> > >
> > >s/u64/unsigned long/ s/time_after64/time_after/
> > >
> > >32bits of us resolution gives us 1000s before wraparound between the two
> > >samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
> > >the GPU managed to complete its task.
> > 
> > Now I see that you did say low bits.. yes that sounds fine.
> > 
> > Btw while you are optimizing things maybe pick up this micro
> > optimization: http://patchwork.freedesktop.org/patch/64339/
> > 
> > Not in scope of this thread but under the normal development patch flow.
> 
> There's a different series which looks at tackling the scalabiltiy issue
> with dozens of concurrent waiters. I have an equivalent patch there and
> one to tidy up the seqno query.
>  
> > Btw2, any benchmark result changes with this?
> 
> Spinning still gives the dramatic (2x) improvement in the microbenchmarks
> (over pure interrupt driven waits), so that improvement is preserved.

Previously the spinning also increased power consumption without
offering any significant performance difference for some workloads.
IIRC on my BYT the average CPU power consumption was ~100mW higher
(as reported by RAPL) with xonotic the-big-keybench.dem (1920x1200
w/ "High" settings, IIRC) but average fps wasn't improved. Might
be interesting to know how the improved spin code stacks up on
that front.

> There are a couple of interesting swings in the macro tests (comparedt to
> the previous jiffie patch) just above the noise level which could well be
> a change in the throttling/scheduling. (And those tests are also the
> ones that correspond to the greatest gains (10-40%) using spinning.)
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [Intel-gfx] [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-11-16 13:30             ` Ville Syrjälä
  0 siblings, 0 replies; 35+ messages in thread
From: Ville Syrjälä @ 2015-11-16 13:30 UTC (permalink / raw)
  To: Chris Wilson, Tvrtko Ursulin, Jens Axboe, intel-gfx,
	linux-kernel, dri-devel, Daniel Vetter, Eero Tamminen, Rantala,
	Valtteri, stable

On Mon, Nov 16, 2015 at 12:55:37PM +0000, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 12:08:08PM +0000, Tvrtko Ursulin wrote:
> > 
> > On 16/11/15 11:12, Chris Wilson wrote:
> > >On Mon, Nov 16, 2015 at 10:24:45AM +0000, Tvrtko Ursulin wrote:
> > >>On 15/11/15 13:32, Chris Wilson wrote:
> > >>>+static u64 local_clock_us(unsigned *cpu)
> > >>>+{
> > >>>+	u64 t;
> > >>>+
> > >>>+	*cpu = get_cpu();
> > >>>+	t = local_clock() >> 10;
> > >>
> > >>Needs comment I think to explicitly mention the approximation, or
> > >>maybe drop the _us suffix?
> > >
> > >I did consider _approx_us but thought that was overkill. A comment along
> > >the lines of
> > >/* Approximately convert ns to us - the error is less than the
> > >  * truncation!
> > >  */
> > 
> > And the result is not used in subsequent calculations apart from
> > comparing against an approximate timeout?
> 
> Exactly, the timeout is fairly arbitrary and defined in the same units.
> That we truncate is a much bigger cause for concern in terms of spinning
> accurately for a definite length of time.
>  
> > >>>@@ -1161,7 +1183,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
> > >>>  		if (signal_pending_state(state, current))
> > >>>  			break;
> > >>>
> > >>>-		if (time_after_eq(jiffies, timeout))
> > >>>+		if (busywait_stop(timeout, cpu))
> > >>>  			break;
> > >>>
> > >>>  		cpu_relax_lowlatency();
> > >>>
> > >>
> > >>Otherwise looks good. Not sure what would you convert to 32-bit from
> > >>your follow up reply since you need us resolution?
> > >
> > >s/u64/unsigned long/ s/time_after64/time_after/
> > >
> > >32bits of us resolution gives us 1000s before wraparound between the two
> > >samples. And I hope that a 1000s doesn't pass between loops. Or if it does,
> > >the GPU managed to complete its task.
> > 
> > Now I see that you did say low bits.. yes that sounds fine.
> > 
> > Btw while you are optimizing things maybe pick up this micro
> > optimization: http://patchwork.freedesktop.org/patch/64339/
> > 
> > Not in scope of this thread but under the normal development patch flow.
> 
> There's a different series which looks at tackling the scalabiltiy issue
> with dozens of concurrent waiters. I have an equivalent patch there and
> one to tidy up the seqno query.
>  
> > Btw2, any benchmark result changes with this?
> 
> Spinning still gives the dramatic (2x) improvement in the microbenchmarks
> (over pure interrupt driven waits), so that improvement is preserved.

Previously the spinning also increased power consumption without
offering any significant performance difference for some workloads.
IIRC on my BYT the average CPU power consumption was ~100mW higher
(as reported by RAPL) with xonotic the-big-keybench.dem (1920x1200
w/ "High" settings, IIRC) but average fps wasn't improved. Might
be interesting to know how the improved spin code stacks up on
that front.

> There are a couple of interesting swings in the macro tests (comparedt to
> the previous jiffie patch) just above the noise level which could well be
> a change in the throttling/scheduling. (And those tests are also the
> ones that correspond to the greatest gains (10-40%) using spinning.)
> -Chris
> 
> -- 
> Chris Wilson, Intel Open Source Technology Centre
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Ville Syrjälä
Intel OTC
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-15 13:32   ` Chris Wilson
                     ` (2 preceding siblings ...)
  (?)
@ 2015-11-16 16:48   ` Jens Axboe
  2015-11-18  9:56     ` Limit busywaiting Chris Wilson
  -1 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2015-11-16 16:48 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, linux-kernel
  Cc: dri-devel, Daniel Vetter, Tvrtko Ursulin, Eero Tamminen, Rantala,
	Valtteri, stable

On 11/15/2015 06:32 AM, Chris Wilson wrote:
> When waiting for high frequency requests, the finite amount of time
> required to set up the irq and wait upon it limits the response rate. By
> busywaiting on the request completion for a short while we can service
> the high frequency waits as quick as possible. However, if it is a slow
> request, we want to sleep as quickly as possible. The tradeoff between
> waiting and sleeping is roughly the time it takes to sleep on a request,
> on the order of a microsecond. Based on measurements from big core, I
> have set the limit for busywaiting as 2 microseconds.
>
> The code currently uses the jiffie clock, but that is far too coarse (on
> the order of 10 milliseconds) and results in poor interactivity as the
> CPU ends up being hogged by slow requests. To get microsecond resolution
> we need to use a high resolution timer. The cheapest of which is polling
> local_clock(), but that is only valid on the same CPU. If we switch CPUs
> because the task was preempted, we can also use that as an indicator that
>   the system is too busy to waste cycles on spinning and we should sleep
> instead.

I tried this (1+2), and it feels better. However, I added some counters 
just to track how well it's faring:

[  491.077612] i915: invoked=7168, success=50

so out of 6144 invocations, we only avoided going to sleep 49 of those 
times. As a percentage, that's 99.3% of the time we spun 2usec for no 
good reason other than to burn up more of my battery. So the reason 
there's an improvement for me is that we're just not spinning the 10ms 
anymore, however we're still just wasting time for my use case.

I'd recommend putting this behind some option so that people can enable 
it and play with it if they want, but not making it default to on until 
some more clever tracking has been added to dynamically adapt to on when 
to poll and when not to. It should not be a default-on type of thing 
until it's closer to doing the right thing for a normal workload, not 
just some synthetic benchmark.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Limit busywaiting
  2015-11-16 16:48   ` Jens Axboe
@ 2015-11-18  9:56     ` Chris Wilson
  2015-11-18  9:56       ` [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request Chris Wilson
                         ` (3 more replies)
  0 siblings, 4 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-18  9:56 UTC (permalink / raw)
  To: Jens Axboe; +Cc: intel-gfx

This should filter out all explicit wait requests from userspace and
only apply busywaiting to circumstances where we are forced to drain the
GPU of old requests. With the 2 microsecond timeout from before, this
still seems to preserve the speed up in stress tests and cancel the
busywaiting for desktop loads.
-Chris

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request
  2015-11-18  9:56     ` Limit busywaiting Chris Wilson
@ 2015-11-18  9:56       ` Chris Wilson
  2015-11-18 17:03         ` Daniel Vetter
  2015-11-19 10:05         ` Tvrtko Ursulin
  2015-11-18  9:56       ` [PATCH 2/3] drm/i915: Convert __i915_wait_request to receive flags Chris Wilson
                         ` (2 subsequent siblings)
  3 siblings, 2 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-18  9:56 UTC (permalink / raw)
  To: Jens Axboe; +Cc: intel-gfx

Limit busywaiting only to the request currently being processed by the
GPU. If the request is not currently being processed by the GPU, there
is a very low likelihood of it being completed within the 2 microsecond
spin timeout and so we will just be wasting CPU cycles.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h | 2 +-
 drivers/gpu/drm/i915/i915_gem.c | 8 +++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8afda459a26e..16095b95d2df 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2190,7 +2190,7 @@ struct drm_i915_gem_request {
 	struct intel_engine_cs *ring;
 
 	/** GEM sequence number associated with this request. */
-	uint32_t seqno;
+	uint32_t seqno, spin_seqno;
 
 	/** Position in the ringbuffer of the start of the request */
 	u32 head;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 414150a0b8d5..af9ffa11ef44 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1193,9 +1193,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 	 * takes to sleep on a request, on the order of a microsecond.
 	 */
 
-	if (i915_gem_request_get_ring(req)->irq_refcount)
+	if (req->ring->irq_refcount)
 		return -EBUSY;
 
+	/* Only spin if we know the GPU is processing this request */
+	if (i915_seqno_passed(req->ring->get_seqno(req->ring, false),
+			      req->spin_seqno))
+		return -EAGAIN;
+
 	timeout = local_clock_us(&cpu) + 2;
 	while (!need_resched()) {
 		if (i915_gem_request_completed(req, true))
@@ -2592,6 +2597,7 @@ void __i915_add_request(struct drm_i915_gem_request *request,
 	request->batch_obj = obj;
 
 	request->emitted_jiffies = jiffies;
+	request->spin_seqno = ring->last_submitted_seqno;
 	ring->last_submitted_seqno = request->seqno;
 	list_add_tail(&request->list, &ring->request_list);
 
-- 
2.6.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request
  2015-11-18  9:56       ` [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request Chris Wilson
@ 2015-11-18 17:03         ` Daniel Vetter
  2015-11-19 10:05         ` Tvrtko Ursulin
  1 sibling, 0 replies; 35+ messages in thread
From: Daniel Vetter @ 2015-11-18 17:03 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Jens Axboe, intel-gfx

On Wed, Nov 18, 2015 at 09:56:06AM +0000, Chris Wilson wrote:
> Limit busywaiting only to the request currently being processed by the
> GPU. If the request is not currently being processed by the GPU, there
> is a very low likelihood of it being completed within the 2 microsecond
> spin timeout and so we will just be wasting CPU cycles.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

> ---
>  drivers/gpu/drm/i915/i915_drv.h | 2 +-
>  drivers/gpu/drm/i915/i915_gem.c | 8 +++++++-
>  2 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8afda459a26e..16095b95d2df 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2190,7 +2190,7 @@ struct drm_i915_gem_request {
>  	struct intel_engine_cs *ring;
>  
>  	/** GEM sequence number associated with this request. */
> -	uint32_t seqno;
> +	uint32_t seqno, spin_seqno;
>  
>  	/** Position in the ringbuffer of the start of the request */
>  	u32 head;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 414150a0b8d5..af9ffa11ef44 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1193,9 +1193,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>  	 * takes to sleep on a request, on the order of a microsecond.
>  	 */
>  
> -	if (i915_gem_request_get_ring(req)->irq_refcount)
> +	if (req->ring->irq_refcount)
>  		return -EBUSY;
>  
> +	/* Only spin if we know the GPU is processing this request */
> +	if (i915_seqno_passed(req->ring->get_seqno(req->ring, false),
> +			      req->spin_seqno))
> +		return -EAGAIN;
> +
>  	timeout = local_clock_us(&cpu) + 2;
>  	while (!need_resched()) {
>  		if (i915_gem_request_completed(req, true))
> @@ -2592,6 +2597,7 @@ void __i915_add_request(struct drm_i915_gem_request *request,
>  	request->batch_obj = obj;
>  
>  	request->emitted_jiffies = jiffies;
> +	request->spin_seqno = ring->last_submitted_seqno;
>  	ring->last_submitted_seqno = request->seqno;
>  	list_add_tail(&request->list, &ring->request_list);
>  
> -- 
> 2.6.2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request
  2015-11-18  9:56       ` [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request Chris Wilson
  2015-11-18 17:03         ` Daniel Vetter
@ 2015-11-19 10:05         ` Tvrtko Ursulin
  2015-11-19 10:12           ` Chris Wilson
  1 sibling, 1 reply; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-19 10:05 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe; +Cc: intel-gfx


Hi,

On 18/11/15 09:56, Chris Wilson wrote:
> Limit busywaiting only to the request currently being processed by the
> GPU. If the request is not currently being processed by the GPU, there
> is a very low likelihood of it being completed within the 2 microsecond
> spin timeout and so we will just be wasting CPU cycles.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.h | 2 +-
>   drivers/gpu/drm/i915/i915_gem.c | 8 +++++++-
>   2 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8afda459a26e..16095b95d2df 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2190,7 +2190,7 @@ struct drm_i915_gem_request {
>   	struct intel_engine_cs *ring;
>
>   	/** GEM sequence number associated with this request. */
> -	uint32_t seqno;
> +	uint32_t seqno, spin_seqno;

Comment needs splitting out.

And spin_seqno is not the best name, I think previous_ring_seqno would 
be better. So it would immediately tell you what it is, and then at the 
place which uses it it would also be clearer what is the criteria for 
spinning.

>   	/** Position in the ringbuffer of the start of the request */
>   	u32 head;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 414150a0b8d5..af9ffa11ef44 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1193,9 +1193,14 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   	 * takes to sleep on a request, on the order of a microsecond.
>   	 */
>
> -	if (i915_gem_request_get_ring(req)->irq_refcount)
> +	if (req->ring->irq_refcount)
>   		return -EBUSY;
>
> +	/* Only spin if we know the GPU is processing this request */
> +	if (i915_seqno_passed(req->ring->get_seqno(req->ring, false),
> +			      req->spin_seqno))
> +		return -EAGAIN;
> +
>   	timeout = local_clock_us(&cpu) + 2;
>   	while (!need_resched()) {
>   		if (i915_gem_request_completed(req, true))
> @@ -2592,6 +2597,7 @@ void __i915_add_request(struct drm_i915_gem_request *request,
>   	request->batch_obj = obj;
>
>   	request->emitted_jiffies = jiffies;
> +	request->spin_seqno = ring->last_submitted_seqno;
>   	ring->last_submitted_seqno = request->seqno;
>   	list_add_tail(&request->list, &ring->request_list);

Commit message says it will spin only on the request currently being 
processed by the GPU but from here it looks like it will spin for any 
request _queued up_ before the last?

For example if we have submitted 1, 2, 3 and 4 and GPU is currently 
processing 1.

2 has spin_seqno 1.
3 has spin_seqno 2.
4 has spin_seqno 3.

ring->get_seqno is 0.

Wait on 1: seqno_passed(0, 0) = true -> wait
Wait on 2: seqno_passed(0, 1) = false -> spin
Wait on 3: seqno_passed(0, 2) = false -> spin
Wait on 4: seqno_passed(0, 3) = false -> spin

So it looks the opposite.

Or is it too early for me? :)

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request
  2015-11-19 10:05         ` Tvrtko Ursulin
@ 2015-11-19 10:12           ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-19 10:12 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Jens Axboe, intel-gfx

On Thu, Nov 19, 2015 at 10:05:39AM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 18/11/15 09:56, Chris Wilson wrote:
> >Limit busywaiting only to the request currently being processed by the
> >GPU. If the request is not currently being processed by the GPU, there
> >is a very low likelihood of it being completed within the 2 microsecond
> >spin timeout and so we will just be wasting CPU cycles.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> >  drivers/gpu/drm/i915/i915_drv.h | 2 +-
> >  drivers/gpu/drm/i915/i915_gem.c | 8 +++++++-
> >  2 files changed, 8 insertions(+), 2 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 8afda459a26e..16095b95d2df 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -2190,7 +2190,7 @@ struct drm_i915_gem_request {
> >  	struct intel_engine_cs *ring;
> >
> >  	/** GEM sequence number associated with this request. */
> >-	uint32_t seqno;
> >+	uint32_t seqno, spin_seqno;
> 
> Comment needs splitting out.

Is it not the sequence associated with the request? The start of request
activity, end of active marker?
 
> And spin_seqno is not the best name, I think previous_ring_seqno
> would be better. So it would immediately tell you what it is, and
> then at the place which uses it it would also be clearer what is the
> criteria for spinning.

I agree, calling it spin_seqno was a mistake. I didn't like last_seqno
either. Though now I think,

u32 seqno_active, seqno_complete;

and a

i915_gem_request_active() helper to match i915_gem_request_completed().

> Commit message says it will spin only on the request currently being
> processed by the GPU but from here it looks like it will spin for
> any request _queued up_ before the last?

Nope, my mistake. Thanks,
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 2/3] drm/i915: Convert __i915_wait_request to receive flags
  2015-11-18  9:56     ` Limit busywaiting Chris Wilson
  2015-11-18  9:56       ` [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request Chris Wilson
@ 2015-11-18  9:56       ` Chris Wilson
  2015-11-18  9:56       ` [PATCH 3/3] drm/i915: Limit request busywaiting Chris Wilson
  2015-11-19 16:29       ` Limit busywaiting Jens Axboe
  3 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-18  9:56 UTC (permalink / raw)
  To: Jens Axboe; +Cc: intel-gfx

Convert the bool interruptible argument over to a flags bitmask so that
we can add further bool parameters conveniently.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  3 ++-
 drivers/gpu/drm/i915/i915_gem.c         | 42 +++++++++++++++++++++++----------
 drivers/gpu/drm/i915/intel_display.c    |  2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c |  7 +++++-
 4 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 16095b95d2df..f47d701f2ddb 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2975,9 +2975,10 @@ void __i915_add_request(struct drm_i915_gem_request *req,
 	__i915_add_request(req, NULL, false)
 int __i915_wait_request(struct drm_i915_gem_request *req,
 			unsigned reset_counter,
-			bool interruptible,
+			unsigned flags,
 			s64 *timeout,
 			struct intel_rps_client *rps);
+#define I915_WAIT_INTERRUPTIBLE 0x1
 int __must_check i915_wait_request(struct drm_i915_gem_request *req);
 int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 int __must_check
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index af9ffa11ef44..d4d5c6e8c02f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1224,7 +1224,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
  * __i915_wait_request - wait until execution of request has finished
  * @req: duh!
  * @reset_counter: reset sequence associated with the given request
- * @interruptible: do an interruptible wait (normally yes)
+ * @flags: flags
  * @timeout: in - how long to wait (NULL forever); out - how much time remaining
  *
  * Note: It is of utmost importance that the passed in seqno and reset_counter
@@ -1239,7 +1239,7 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
  */
 int __i915_wait_request(struct drm_i915_gem_request *req,
 			unsigned reset_counter,
-			bool interruptible,
+			unsigned flags,
 			s64 *timeout,
 			struct intel_rps_client *rps)
 {
@@ -1248,7 +1248,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	const bool irq_test_in_progress =
 		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
-	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
+	int state = flags & I915_WAIT_INTERRUPTIBLE ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
 	DEFINE_WAIT(wait);
 	unsigned long timeout_expire;
 	s64 before, now;
@@ -1292,7 +1292,8 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
 			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
 			 * is truely gone. */
-			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+			ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+						   flags & I915_WAIT_INTERRUPTIBLE);
 			if (ret == 0)
 				ret = -EAGAIN;
 			break;
@@ -1451,24 +1452,28 @@ i915_wait_request(struct drm_i915_gem_request *req)
 {
 	struct drm_device *dev;
 	struct drm_i915_private *dev_priv;
-	bool interruptible;
+	unsigned flags;
 	int ret;
 
 	BUG_ON(req == NULL);
 
 	dev = req->ring->dev;
 	dev_priv = dev->dev_private;
-	interruptible = dev_priv->mm.interruptible;
 
 	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
 
-	ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
+				   dev_priv->mm.interruptible);
 	if (ret)
 		return ret;
 
+	flags = 0;
+	if (dev_priv->mm.interruptible)
+		flags |= I915_WAIT_INTERRUPTIBLE;
+
 	ret = __i915_wait_request(req,
 				  atomic_read(&dev_priv->gpu_error.reset_counter),
-				  interruptible, NULL, NULL);
+				  flags, NULL, NULL);
 	if (ret)
 		return ret;
 
@@ -1580,7 +1585,9 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
 
 	mutex_unlock(&dev->struct_mutex);
 	for (i = 0; ret == 0 && i < n; i++)
-		ret = __i915_wait_request(requests[i], reset_counter, true,
+		ret = __i915_wait_request(requests[i],
+					  reset_counter,
+					  I915_WAIT_INTERRUPTIBLE,
 					  NULL, rps);
 	mutex_lock(&dev->struct_mutex);
 
@@ -3096,7 +3103,9 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 
 	for (i = 0; i < n; i++) {
 		if (ret == 0)
-			ret = __i915_wait_request(req[i], reset_counter, true,
+			ret = __i915_wait_request(req[i],
+						  reset_counter,
+						  I915_WAIT_INTERRUPTIBLE,
 						  args->timeout_ns > 0 ? &args->timeout_ns : NULL,
 						  file->driver_priv);
 		i915_gem_request_unreference__unlocked(req[i]);
@@ -3127,9 +3136,15 @@ __i915_gem_object_sync(struct drm_i915_gem_object *obj,
 
 	if (!i915_semaphore_is_enabled(obj->base.dev)) {
 		struct drm_i915_private *i915 = to_i915(obj->base.dev);
+		unsigned flags;
+
+		flags = 0;
+		if (i915->mm.interruptible)
+			flags |= I915_WAIT_INTERRUPTIBLE;
+
 		ret = __i915_wait_request(from_req,
 					  atomic_read(&i915->gpu_error.reset_counter),
-					  i915->mm.interruptible,
+					  flags,
 					  NULL,
 					  &i915->rps.semaphores);
 		if (ret)
@@ -4090,7 +4105,10 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 	if (target == NULL)
 		return 0;
 
-	ret = __i915_wait_request(target, reset_counter, true, NULL, NULL);
+	ret = __i915_wait_request(target,
+				  reset_counter,
+				  I915_WAIT_INTERRUPTIBLE,
+				  NULL, NULL);
 	if (ret == 0)
 		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index f62ffc04c21d..edc0d313398d 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -11216,7 +11216,7 @@ static void intel_mmio_flip_work_func(struct work_struct *work)
 	if (mmio_flip->req) {
 		WARN_ON(__i915_wait_request(mmio_flip->req,
 					    mmio_flip->crtc->reset_counter,
-					    false, NULL,
+					    0, NULL,
 					    &mmio_flip->i915->rps.mmioflips));
 		i915_gem_request_unreference__unlocked(mmio_flip->req);
 	}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 9461a238f5d5..664ce0b20b23 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2231,6 +2231,7 @@ static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
 int intel_ring_idle(struct intel_engine_cs *ring)
 {
 	struct drm_i915_gem_request *req;
+	unsigned flags;
 
 	/* Wait upon the last request to be completed */
 	if (list_empty(&ring->request_list))
@@ -2240,10 +2241,14 @@ int intel_ring_idle(struct intel_engine_cs *ring)
 			struct drm_i915_gem_request,
 			list);
 
+	flags = 0;
+	if (to_i915(ring->dev)->mm.interruptible)
+		flags |= I915_WAIT_INTERRUPTIBLE;
+
 	/* Make sure we do not trigger any retires */
 	return __i915_wait_request(req,
 				   atomic_read(&to_i915(ring->dev)->gpu_error.reset_counter),
-				   to_i915(ring->dev)->mm.interruptible,
+				   flags,
 				   NULL, NULL);
 }
 
-- 
2.6.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 3/3] drm/i915: Limit request busywaiting
  2015-11-18  9:56     ` Limit busywaiting Chris Wilson
  2015-11-18  9:56       ` [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request Chris Wilson
  2015-11-18  9:56       ` [PATCH 2/3] drm/i915: Convert __i915_wait_request to receive flags Chris Wilson
@ 2015-11-18  9:56       ` Chris Wilson
  2015-11-19 15:22         ` Daniel Vetter
  2015-11-19 16:29       ` Limit busywaiting Jens Axboe
  3 siblings, 1 reply; 35+ messages in thread
From: Chris Wilson @ 2015-11-18  9:56 UTC (permalink / raw)
  To: Jens Axboe; +Cc: intel-gfx

If we suppose that on average it takes 1ms to do a fullscreen paint
(which is fairly typical across generations, as the GPUs get faster the
displays get larger), then the likelihood of a synchronous wait on the last
request completing within 2 microseconds is miniscule. On the other
hand, not everything attempts a fullscreen repaint (or is ratelimited by
such) and for those a short busywait is much faster than setting up the
interrupt driven waiter. The challenge is identifying such tasks. Here
we opt to never spin for an unlocked wait (such as from a set-domain or
wait ioctl) and instead only spin in circumstances where we are holding
the lock and have either already completed one unlocked wait or we are
in a situation where we are being forced to drain old requests (which we
know must be close to completion).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem.c         | 12 +++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.c |  2 +-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f47d701f2ddb..dc8f2a87dac0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2979,6 +2979,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 			s64 *timeout,
 			struct intel_rps_client *rps);
 #define I915_WAIT_INTERRUPTIBLE 0x1
+#define I915_WAIT_MAY_SPIN 0x2
 int __must_check i915_wait_request(struct drm_i915_gem_request *req);
 int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 int __must_check
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d4d5c6e8c02f..cfa101fad479 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1273,9 +1273,11 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
 	before = ktime_get_raw_ns();
 
 	/* Optimistic spin for the next jiffie before touching IRQs */
-	ret = __i915_spin_request(req, state);
-	if (ret == 0)
-		goto out;
+	if (flags & I915_WAIT_MAY_SPIN) {
+		ret = __i915_spin_request(req, state);
+		if (ret == 0)
+			goto out;
+	}
 
 	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
 		ret = -ENODEV;
@@ -1467,7 +1469,7 @@ i915_wait_request(struct drm_i915_gem_request *req)
 	if (ret)
 		return ret;
 
-	flags = 0;
+	flags = I915_WAIT_MAY_SPIN;
 	if (dev_priv->mm.interruptible)
 		flags |= I915_WAIT_INTERRUPTIBLE;
 
@@ -4107,7 +4109,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 
 	ret = __i915_wait_request(target,
 				  reset_counter,
-				  I915_WAIT_INTERRUPTIBLE,
+				  I915_WAIT_INTERRUPTIBLE | I915_WAIT_MAY_SPIN,
 				  NULL, NULL);
 	if (ret == 0)
 		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 664ce0b20b23..d3142af0f347 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2241,7 +2241,7 @@ int intel_ring_idle(struct intel_engine_cs *ring)
 			struct drm_i915_gem_request,
 			list);
 
-	flags = 0;
+	flags = I915_WAIT_MAY_SPIN;
 	if (to_i915(ring->dev)->mm.interruptible)
 		flags |= I915_WAIT_INTERRUPTIBLE;
 
-- 
2.6.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/3] drm/i915: Limit request busywaiting
  2015-11-18  9:56       ` [PATCH 3/3] drm/i915: Limit request busywaiting Chris Wilson
@ 2015-11-19 15:22         ` Daniel Vetter
  0 siblings, 0 replies; 35+ messages in thread
From: Daniel Vetter @ 2015-11-19 15:22 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Jens Axboe, intel-gfx

On Wed, Nov 18, 2015 at 09:56:08AM +0000, Chris Wilson wrote:
> If we suppose that on average it takes 1ms to do a fullscreen paint
> (which is fairly typical across generations, as the GPUs get faster the
> displays get larger), then the likelihood of a synchronous wait on the last
> request completing within 2 microseconds is miniscule. On the other
> hand, not everything attempts a fullscreen repaint (or is ratelimited by
> such) and for those a short busywait is much faster than setting up the
> interrupt driven waiter. The challenge is identifying such tasks. Here
> we opt to never spin for an unlocked wait (such as from a set-domain or
> wait ioctl) and instead only spin in circumstances where we are holding
> the lock and have either already completed one unlocked wait or we are
> in a situation where we are being forced to drain old requests (which we
> know must be close to completion).
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Do you have some perf datat for this?
-Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem.c         | 12 +++++++-----
>  drivers/gpu/drm/i915/intel_ringbuffer.c |  2 +-
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index f47d701f2ddb..dc8f2a87dac0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2979,6 +2979,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  			s64 *timeout,
>  			struct intel_rps_client *rps);
>  #define I915_WAIT_INTERRUPTIBLE 0x1
> +#define I915_WAIT_MAY_SPIN 0x2
>  int __must_check i915_wait_request(struct drm_i915_gem_request *req);
>  int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
>  int __must_check
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d4d5c6e8c02f..cfa101fad479 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1273,9 +1273,11 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  	before = ktime_get_raw_ns();
>  
>  	/* Optimistic spin for the next jiffie before touching IRQs */
> -	ret = __i915_spin_request(req, state);
> -	if (ret == 0)
> -		goto out;
> +	if (flags & I915_WAIT_MAY_SPIN) {
> +		ret = __i915_spin_request(req, state);
> +		if (ret == 0)
> +			goto out;
> +	}
>  
>  	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
>  		ret = -ENODEV;
> @@ -1467,7 +1469,7 @@ i915_wait_request(struct drm_i915_gem_request *req)
>  	if (ret)
>  		return ret;
>  
> -	flags = 0;
> +	flags = I915_WAIT_MAY_SPIN;
>  	if (dev_priv->mm.interruptible)
>  		flags |= I915_WAIT_INTERRUPTIBLE;
>  
> @@ -4107,7 +4109,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
>  
>  	ret = __i915_wait_request(target,
>  				  reset_counter,
> -				  I915_WAIT_INTERRUPTIBLE,
> +				  I915_WAIT_INTERRUPTIBLE | I915_WAIT_MAY_SPIN,
>  				  NULL, NULL);
>  	if (ret == 0)
>  		queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 664ce0b20b23..d3142af0f347 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2241,7 +2241,7 @@ int intel_ring_idle(struct intel_engine_cs *ring)
>  			struct drm_i915_gem_request,
>  			list);
>  
> -	flags = 0;
> +	flags = I915_WAIT_MAY_SPIN;
>  	if (to_i915(ring->dev)->mm.interruptible)
>  		flags |= I915_WAIT_INTERRUPTIBLE;
>  
> -- 
> 2.6.2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: Limit busywaiting
  2015-11-18  9:56     ` Limit busywaiting Chris Wilson
                         ` (2 preceding siblings ...)
  2015-11-18  9:56       ` [PATCH 3/3] drm/i915: Limit request busywaiting Chris Wilson
@ 2015-11-19 16:29       ` Jens Axboe
  3 siblings, 0 replies; 35+ messages in thread
From: Jens Axboe @ 2015-11-19 16:29 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On 11/18/2015 02:56 AM, Chris Wilson wrote:
> This should filter out all explicit wait requests from userspace and
> only apply busywaiting to circumstances where we are forced to drain the
> GPU of old requests. With the 2 microsecond timeout from before, this
> still seems to preserve the speed up in stress tests and cancel the
> busywaiting for desktop loads.

Chris,

I tested these 3 on top of the previous 2us limit patch, and it seems to 
work fine. You can add:

Tested-by: Jens Axboe <axboe@fb.com>

to them, if you'd like.

-- 
Jens Axboe

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
  2015-11-15 13:32   ` Chris Wilson
@ 2015-12-03 22:03     ` Pavel Machek
  -1 siblings, 0 replies; 35+ messages in thread
From: Pavel Machek @ 2015-12-03 22:03 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Jens Axboe, intel-gfx, linux-kernel, dri-devel, Daniel Vetter,
	Tvrtko Ursulin, Eero Tamminen, Rantala, Valtteri, stable

Hi!

> Reported-by: Jens Axboe <axboe@kernel.dk>
> Link: https://lkml.org/lkml/2015/11/12/621
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 28 +++++++++++++++++++++++++---
>  1 file changed, 25 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 740530c571d1..2a88158bd1f7 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1146,14 +1146,36 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>  	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>  }
>  
> +static u64 local_clock_us(unsigned *cpu)
> +{
> +	u64 t;
> +
> +	*cpu = get_cpu();
> +	t = local_clock() >> 10;
> +	put_cpu();
> +
> +	return t;
> +}
> +
> +static bool busywait_stop(u64 timeout, unsigned cpu)
> +{
> +	unsigned this_cpu;
> +
> +	if (time_after64(local_clock_us(&this_cpu), timeout))
> +		return true;
> +
> +	return this_cpu != cpu;
> +}

Perhaps you want to ask the timekeeping people for the right
primitive? I guess you are not the only one needing this..
									Pavel
									
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms!
@ 2015-12-03 22:03     ` Pavel Machek
  0 siblings, 0 replies; 35+ messages in thread
From: Pavel Machek @ 2015-12-03 22:03 UTC (permalink / raw)
  To: Chris Wilson
  Cc: Jens Axboe, Tvrtko Ursulin, Daniel Vetter, intel-gfx,
	linux-kernel, dri-devel, stable, Rantala, Valtteri,
	Eero Tamminen

Hi!

> Reported-by: Jens Axboe <axboe@kernel.dk>
> Link: https://lkml.org/lkml/2015/11/12/621
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 28 +++++++++++++++++++++++++---
>  1 file changed, 25 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 740530c571d1..2a88158bd1f7 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1146,14 +1146,36 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>  	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>  }
>  
> +static u64 local_clock_us(unsigned *cpu)
> +{
> +	u64 t;
> +
> +	*cpu = get_cpu();
> +	t = local_clock() >> 10;
> +	put_cpu();
> +
> +	return t;
> +}
> +
> +static bool busywait_stop(u64 timeout, unsigned cpu)
> +{
> +	unsigned this_cpu;
> +
> +	if (time_after64(local_clock_us(&this_cpu), timeout))
> +		return true;
> +
> +	return this_cpu != cpu;
> +}

Perhaps you want to ask the timekeeping people for the right
primitive? I guess you are not the only one needing this..
									Pavel
									
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
  2015-11-15 13:32 ` Chris Wilson
@ 2015-11-16  9:54   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16  9:54 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel
  Cc: dri-devel, Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


Hi,

On 15/11/15 13:32, Chris Wilson wrote:
> The busywait in __i915_spin_request() does not respect pending signals
> and so may consume the entire timeslice for the task instead of
> returning to userspace to handle the signal.

Obviously correct to break the spin, but if spending a jiffie to react 
to signals was the only problem then it is not too severe.

Add something to the commit message about how it was found/reported and 
about the severity of impact, etc?

Otherwise,

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

> Fixes regression from
> commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
> Author: Chris Wilson <chris@chris-wilson.co.uk>
> Date:   Tue Apr 7 16:20:41 2015 +0100
>
>       drm/i915: Optimistically spin for the request completion
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 13 ++++++++-----
>   1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5cf4a1998273..740530c571d1 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1146,7 +1146,7 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>   	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>   }
>
> -static int __i915_spin_request(struct drm_i915_gem_request *req)
> +static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   {
>   	unsigned long timeout;
>
> @@ -1158,6 +1158,9 @@ static int __i915_spin_request(struct drm_i915_gem_request *req)
>   		if (i915_gem_request_completed(req, true))
>   			return 0;
>
> +		if (signal_pending_state(state, current))
> +			break;
> +
>   		if (time_after_eq(jiffies, timeout))
>   			break;
>
> @@ -1197,6 +1200,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	struct drm_i915_private *dev_priv = dev->dev_private;
>   	const bool irq_test_in_progress =
>   		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
> +	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
>   	DEFINE_WAIT(wait);
>   	unsigned long timeout_expire;
>   	s64 before, now;
> @@ -1221,7 +1225,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	before = ktime_get_raw_ns();
>
>   	/* Optimistic spin for the next jiffie before touching IRQs */
> -	ret = __i915_spin_request(req);
> +	ret = __i915_spin_request(req, state);
>   	if (ret == 0)
>   		goto out;
>
> @@ -1233,8 +1237,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	for (;;) {
>   		struct timer_list timer;
>
> -		prepare_to_wait(&ring->irq_queue, &wait,
> -				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
> +		prepare_to_wait(&ring->irq_queue, &wait, state);
>
>   		/* We need to check whether any gpu reset happened in between
>   		 * the caller grabbing the seqno and now ... */
> @@ -1252,7 +1255,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			break;
>   		}
>
> -		if (interruptible && signal_pending(current)) {
> +		if (signal_pending_state(state, current)) {
>   			ret = -ERESTARTSYS;
>   			break;
>   		}
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
@ 2015-11-16  9:54   ` Tvrtko Ursulin
  0 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16  9:54 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel
  Cc: Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable, dri-devel


Hi,

On 15/11/15 13:32, Chris Wilson wrote:
> The busywait in __i915_spin_request() does not respect pending signals
> and so may consume the entire timeslice for the task instead of
> returning to userspace to handle the signal.

Obviously correct to break the spin, but if spending a jiffie to react 
to signals was the only problem then it is not too severe.

Add something to the commit message about how it was found/reported and 
about the severity of impact, etc?

Otherwise,

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

> Fixes regression from
> commit 2def4ad99befa25775dd2f714fdd4d92faec6e34 [v4.2]
> Author: Chris Wilson <chris@chris-wilson.co.uk>
> Date:   Tue Apr 7 16:20:41 2015 +0100
>
>       drm/i915: Optimistically spin for the request completion
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc; "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Eero Tamminen <eero.t.tamminen@intel.com>
> Cc: "Rantala, Valtteri" <valtteri.rantala@intel.com>
> Cc: stable@kernel.vger.org
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 13 ++++++++-----
>   1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5cf4a1998273..740530c571d1 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1146,7 +1146,7 @@ static bool missed_irq(struct drm_i915_private *dev_priv,
>   	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
>   }
>
> -static int __i915_spin_request(struct drm_i915_gem_request *req)
> +static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   {
>   	unsigned long timeout;
>
> @@ -1158,6 +1158,9 @@ static int __i915_spin_request(struct drm_i915_gem_request *req)
>   		if (i915_gem_request_completed(req, true))
>   			return 0;
>
> +		if (signal_pending_state(state, current))
> +			break;
> +
>   		if (time_after_eq(jiffies, timeout))
>   			break;
>
> @@ -1197,6 +1200,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	struct drm_i915_private *dev_priv = dev->dev_private;
>   	const bool irq_test_in_progress =
>   		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
> +	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
>   	DEFINE_WAIT(wait);
>   	unsigned long timeout_expire;
>   	s64 before, now;
> @@ -1221,7 +1225,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	before = ktime_get_raw_ns();
>
>   	/* Optimistic spin for the next jiffie before touching IRQs */
> -	ret = __i915_spin_request(req);
> +	ret = __i915_spin_request(req, state);
>   	if (ret == 0)
>   		goto out;
>
> @@ -1233,8 +1237,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   	for (;;) {
>   		struct timer_list timer;
>
> -		prepare_to_wait(&ring->irq_queue, &wait,
> -				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
> +		prepare_to_wait(&ring->irq_queue, &wait, state);
>
>   		/* We need to check whether any gpu reset happened in between
>   		 * the caller grabbing the seqno and now ... */
> @@ -1252,7 +1255,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			break;
>   		}
>
> -		if (interruptible && signal_pending(current)) {
> +		if (signal_pending_state(state, current)) {
>   			ret = -ERESTARTSYS;
>   			break;
>   		}
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
  2015-11-16  9:54   ` Tvrtko Ursulin
@ 2015-11-16 11:22     ` Chris Wilson
  -1 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-16 11:22 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Jens Axboe, intel-gfx, linux-kernel, dri-devel, Daniel Vetter,
	Eero Tamminen, Rantala, Valtteri, stable

On Mon, Nov 16, 2015 at 09:54:10AM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 15/11/15 13:32, Chris Wilson wrote:
> >The busywait in __i915_spin_request() does not respect pending signals
> >and so may consume the entire timeslice for the task instead of
> >returning to userspace to handle the signal.
> 
> Obviously correct to break the spin, but if spending a jiffie to
> react to signals was the only problem then it is not too severe.
> 
> Add something to the commit message about how it was found/reported
> and about the severity of impact, etc?

Perhaps:

At the worst case this could cause a delay in signal processing of 20ms,
which would be a noticeable jitter in cursor tracking. If a higher
resolution signal was being used, for example to provide fairness of a
server timeslices between clients, we could expect to detect some
unfairness between clients. This issue was noticed when inspecting a
report of poor interactivity resulting from excessively high
__i915_spin_request usage.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
@ 2015-11-16 11:22     ` Chris Wilson
  0 siblings, 0 replies; 35+ messages in thread
From: Chris Wilson @ 2015-11-16 11:22 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Jens Axboe, Daniel Vetter, intel-gfx, linux-kernel, dri-devel,
	stable, Rantala, Valtteri, Eero Tamminen

On Mon, Nov 16, 2015 at 09:54:10AM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 15/11/15 13:32, Chris Wilson wrote:
> >The busywait in __i915_spin_request() does not respect pending signals
> >and so may consume the entire timeslice for the task instead of
> >returning to userspace to handle the signal.
> 
> Obviously correct to break the spin, but if spending a jiffie to
> react to signals was the only problem then it is not too severe.
> 
> Add something to the commit message about how it was found/reported
> and about the severity of impact, etc?

Perhaps:

At the worst case this could cause a delay in signal processing of 20ms,
which would be a noticeable jitter in cursor tracking. If a higher
resolution signal was being used, for example to provide fairness of a
server timeslices between clients, we could expect to detect some
unfairness between clients. This issue was noticed when inspecting a
report of poor interactivity resulting from excessively high
__i915_spin_request usage.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals
  2015-11-16 11:22     ` Chris Wilson
  (?)
@ 2015-11-16 11:40     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 35+ messages in thread
From: Tvrtko Ursulin @ 2015-11-16 11:40 UTC (permalink / raw)
  To: Chris Wilson, Jens Axboe, intel-gfx, linux-kernel, dri-devel,
	Daniel Vetter, Eero Tamminen, Rantala, Valtteri, stable


On 16/11/15 11:22, Chris Wilson wrote:
> On Mon, Nov 16, 2015 at 09:54:10AM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 15/11/15 13:32, Chris Wilson wrote:
>>> The busywait in __i915_spin_request() does not respect pending signals
>>> and so may consume the entire timeslice for the task instead of
>>> returning to userspace to handle the signal.
>>
>> Obviously correct to break the spin, but if spending a jiffie to
>> react to signals was the only problem then it is not too severe.
>>
>> Add something to the commit message about how it was found/reported
>> and about the severity of impact, etc?
>
> Perhaps:
>
> At the worst case this could cause a delay in signal processing of 20ms,
> which would be a noticeable jitter in cursor tracking. If a higher
> resolution signal was being used, for example to provide fairness of a
> server timeslices between clients, we could expect to detect some
> unfairness between clients. This issue was noticed when inspecting a
> report of poor interactivity resulting from excessively high
> __i915_spin_request usage.

Oh its the Xorg scheduler tick... I always forget about that. Was 
thinking that it is only about fatal, or at least infrequent signals.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2015-12-03 22:03 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-15 13:32 [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals Chris Wilson
2015-11-15 13:32 ` Chris Wilson
2015-11-15 13:32 ` [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms! Chris Wilson
2015-11-15 13:32   ` Chris Wilson
2015-11-15 17:48   ` Chris Wilson
2015-11-15 17:48     ` Chris Wilson
2015-11-16 10:24   ` Tvrtko Ursulin
2015-11-16 10:24     ` Tvrtko Ursulin
2015-11-16 11:12     ` Chris Wilson
2015-11-16 11:12       ` Chris Wilson
2015-11-16 12:08       ` Tvrtko Ursulin
2015-11-16 12:08         ` Tvrtko Ursulin
2015-11-16 12:55         ` Chris Wilson
2015-11-16 12:55           ` Chris Wilson
2015-11-16 13:09           ` Tvrtko Ursulin
2015-11-16 13:09             ` Tvrtko Ursulin
2015-11-16 13:30           ` [Intel-gfx] " Ville Syrjälä
2015-11-16 13:30             ` Ville Syrjälä
2015-11-16 16:48   ` Jens Axboe
2015-11-18  9:56     ` Limit busywaiting Chris Wilson
2015-11-18  9:56       ` [PATCH 1/3] drm/i915: Only spin whilst waiting on the current request Chris Wilson
2015-11-18 17:03         ` Daniel Vetter
2015-11-19 10:05         ` Tvrtko Ursulin
2015-11-19 10:12           ` Chris Wilson
2015-11-18  9:56       ` [PATCH 2/3] drm/i915: Convert __i915_wait_request to receive flags Chris Wilson
2015-11-18  9:56       ` [PATCH 3/3] drm/i915: Limit request busywaiting Chris Wilson
2015-11-19 15:22         ` Daniel Vetter
2015-11-19 16:29       ` Limit busywaiting Jens Axboe
2015-12-03 22:03   ` [PATCH 2/2] drm/i915: Limit the busy wait on requests to 2us not 10ms! Pavel Machek
2015-12-03 22:03     ` Pavel Machek
2015-11-16  9:54 ` [PATCH 1/2] drm/i915: Break busywaiting for requests on pending signals Tvrtko Ursulin
2015-11-16  9:54   ` Tvrtko Ursulin
2015-11-16 11:22   ` Chris Wilson
2015-11-16 11:22     ` Chris Wilson
2015-11-16 11:40     ` Tvrtko Ursulin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.