All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
@ 2016-01-24  6:18 Matthew Dawson
  2016-01-24  9:49 ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Dawson @ 2016-01-24  6:18 UTC (permalink / raw)
  To: dri-devel

When the radeon driver resets a gpu, it attempts to test whether all the
rings can successfully handle an IB.  If these rings fail to respond, the
process will wait forever.  Another gpu reset can't happen at this point,
as the current reset holds a lock required to do so.  Instead, make all
the IB tests run with a timeout, so the system can attempt to recover
in this case.

While this doesn't fix the underlying issue with card resets failing, it
gives the system a higher chance of recovering.  These timeouts have been
confirmed to help both a Tathi and Hawaii card recover after a gpu reset.

I haven't been able to test this on other cards, but everything compiles.
I wasn't sure what to use for a timeout value, so for now I've used
radeon_lockup_timeout.  When that is 0, the timeout functionality in this
patch is also disabled.

This also adds a new function, radeon_fence_wait_timeout, that behaves like
radeon_fence_wait except allowing a different timeout value to be passed to
radeon_fence_wait_seq_timeout.

Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
---
 drivers/gpu/drm/radeon/cik_sdma.c     |  3 ++-
 drivers/gpu/drm/radeon/r100.c         |  3 ++-
 drivers/gpu/drm/radeon/r600.c         |  3 ++-
 drivers/gpu/drm/radeon/r600_dma.c     |  3 ++-
 drivers/gpu/drm/radeon/radeon.h       |  1 +
 drivers/gpu/drm/radeon/radeon_fence.c | 25 ++++++++++++++++++++++---
 drivers/gpu/drm/radeon/radeon_vce.c   |  3 ++-
 drivers/gpu/drm/radeon/uvd_v1_0.c     |  3 ++-
 8 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
index d16f2ee..014f5ca 100644
--- a/drivers/gpu/drm/radeon/cik_sdma.c
+++ b/drivers/gpu/drm/radeon/cik_sdma.c
@@ -737,7 +737,8 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		return r;
 	}
-	r = radeon_fence_wait(ib.fence, false);
+	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
+		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		return r;
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 5eae0a8..5386c09 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -3732,7 +3732,8 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		goto free_ib;
 	}
-	r = radeon_fence_wait(ib.fence, false);
+	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
+		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		goto free_ib;
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index cc2fdf0..8fbe5d7 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -3381,7 +3381,8 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		goto free_ib;
 	}
-	r = radeon_fence_wait(ib.fence, false);
+	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
+		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		goto free_ib;
diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
index d2dd29a..3ff614d 100644
--- a/drivers/gpu/drm/radeon/r600_dma.c
+++ b/drivers/gpu/drm/radeon/r600_dma.c
@@ -368,7 +368,8 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		return r;
 	}
-	r = radeon_fence_wait(ib.fence, false);
+	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
+		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		return r;
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 5ae6db9..0b698b6 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -382,6 +382,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
 void radeon_fence_process(struct radeon_device *rdev, int ring);
 bool radeon_fence_signaled(struct radeon_fence *fence);
+int radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
 int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
 int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
 int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 05815c4..9fec805 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
 }
 
 /**
- * radeon_fence_wait - wait for a fence to signal
+ * radeon_fence_wait_timeout - wait for a fence to signal with timeout
  *
  * @fence: radeon fence object
  * @intr: use interruptible sleep
@@ -535,9 +535,10 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
  * Wait for the requested fence to signal (all asics).
  * @intr selects whether to use interruptable (true) or non-interruptable
  * (false) sleep when waiting for the fence.
+ * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
  * Returns 0 if the fence has passed, error for all other cases.
  */
-int radeon_fence_wait(struct radeon_fence *fence, bool intr)
+int radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
 {
 	uint64_t seq[RADEON_NUM_RINGS] = {};
 	long r;
@@ -552,9 +553,11 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 		return fence_wait(&fence->base, intr);
 
 	seq[fence->ring] = fence->seq;
-	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
+	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
 	if (r < 0) {
 		return r;
+	} else if (r == 0) {
+		return -ETIMEDOUT;
 	}
 
 	r = fence_signal(&fence->base);
@@ -564,6 +567,22 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 }
 
 /**
+ * radeon_fence_wait - wait for a fence to signal
+ *
+ * @fence: radeon fence object
+ * @intr: use interruptible sleep
+ *
+ * Wait for the requested fence to signal (all asics).
+ * @intr selects whether to use interruptable (true) or non-interruptable
+ * (false) sleep when waiting for the fence.
+ * Returns 0 if the fence has passed, error for all other cases.
+ */
+int radeon_fence_wait(struct radeon_fence *fence, bool intr)
+{
+	return radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
+}
+
+/**
  * radeon_fence_wait_any - wait for a fence to signal on any ring
  *
  * @rdev: radeon device pointer
diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
index 7eb1ae7..7d80dad 100644
--- a/drivers/gpu/drm/radeon/radeon_vce.c
+++ b/drivers/gpu/drm/radeon/radeon_vce.c
@@ -810,7 +810,8 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		goto error;
 	}
 
-	r = radeon_fence_wait(fence, false);
+	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
+		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 	} else {
diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
index c6b1cbc..35caa89 100644
--- a/drivers/gpu/drm/radeon/uvd_v1_0.c
+++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
@@ -522,7 +522,8 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		goto error;
 	}
 
-	r = radeon_fence_wait(fence, false);
+	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
+		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
 	if (r) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		goto error;
-- 
2.7.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-01-24  6:18 [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests Matthew Dawson
@ 2016-01-24  9:49 ` Christian König
  2016-01-31 18:50   ` Matthew Dawson
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2016-01-24  9:49 UTC (permalink / raw)
  To: Matthew Dawson, dri-devel

Am 24.01.2016 um 07:18 schrieb Matthew Dawson:
> When the radeon driver resets a gpu, it attempts to test whether all the
> rings can successfully handle an IB.  If these rings fail to respond, the
> process will wait forever.  Another gpu reset can't happen at this point,
> as the current reset holds a lock required to do so.  Instead, make all
> the IB tests run with a timeout, so the system can attempt to recover
> in this case.
>
> While this doesn't fix the underlying issue with card resets failing, it
> gives the system a higher chance of recovering.  These timeouts have been
> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>
> I haven't been able to test this on other cards, but everything compiles.
> I wasn't sure what to use for a timeout value, so for now I've used
> radeon_lockup_timeout.  When that is 0, the timeout functionality in this
> patch is also disabled.
>
> This also adds a new function, radeon_fence_wait_timeout, that behaves like
> radeon_fence_wait except allowing a different timeout value to be passed to
> radeon_fence_wait_seq_timeout.
>
> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>

Really good idea, but please don't use radeon_lockup_timeout for this 
cause the 10 seconds default of that is way to long. Instead just use a 
fixed, let's say 100ms timeout defined in radeon.h.

Also don't define our own radeon_fence_wait_timeout, just use 
fence_wait_timeout(&fence->base, RADEON_IB_TEST_TIMEOUT) for this.

Apart from that the idea looks really good to me.

Regards,
Christian.

> ---
>   drivers/gpu/drm/radeon/cik_sdma.c     |  3 ++-
>   drivers/gpu/drm/radeon/r100.c         |  3 ++-
>   drivers/gpu/drm/radeon/r600.c         |  3 ++-
>   drivers/gpu/drm/radeon/r600_dma.c     |  3 ++-
>   drivers/gpu/drm/radeon/radeon.h       |  1 +
>   drivers/gpu/drm/radeon/radeon_fence.c | 25 ++++++++++++++++++++++---
>   drivers/gpu/drm/radeon/radeon_vce.c   |  3 ++-
>   drivers/gpu/drm/radeon/uvd_v1_0.c     |  3 ++-
>   8 files changed, 35 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
> index d16f2ee..014f5ca 100644
> --- a/drivers/gpu/drm/radeon/cik_sdma.c
> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> @@ -737,7 +737,8 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 5eae0a8..5386c09 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3732,7 +3732,8 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index cc2fdf0..8fbe5d7 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -3381,7 +3381,8 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
> index d2dd29a..3ff614d 100644
> --- a/drivers/gpu/drm/radeon/r600_dma.c
> +++ b/drivers/gpu/drm/radeon/r600_dma.c
> @@ -368,7 +368,8 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 5ae6db9..0b698b6 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -382,6 +382,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>   bool radeon_fence_signaled(struct radeon_fence *fence);
> +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 05815c4..9fec805 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>   }
>   
>   /**
> - * radeon_fence_wait - wait for a fence to signal
> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>    *
>    * @fence: radeon fence object
>    * @intr: use interruptible sleep
> @@ -535,9 +535,10 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>    * Wait for the requested fence to signal (all asics).
>    * @intr selects whether to use interruptable (true) or non-interruptable
>    * (false) sleep when waiting for the fence.
> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
>    * Returns 0 if the fence has passed, error for all other cases.
>    */
> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
>   {
>   	uint64_t seq[RADEON_NUM_RINGS] = {};
>   	long r;
> @@ -552,9 +553,11 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>   		return fence_wait(&fence->base, intr);
>   
>   	seq[fence->ring] = fence->seq;
> -	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
> +	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
>   	if (r < 0) {
>   		return r;
> +	} else if (r == 0) {
> +		return -ETIMEDOUT;
>   	}
>   
>   	r = fence_signal(&fence->base);
> @@ -564,6 +567,22 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>   }
>   
>   /**
> + * radeon_fence_wait - wait for a fence to signal
> + *
> + * @fence: radeon fence object
> + * @intr: use interruptible sleep
> + *
> + * Wait for the requested fence to signal (all asics).
> + * @intr selects whether to use interruptable (true) or non-interruptable
> + * (false) sleep when waiting for the fence.
> + * Returns 0 if the fence has passed, error for all other cases.
> + */
> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +{
> +	return radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> +}
> +
> +/**
>    * radeon_fence_wait_any - wait for a fence to signal on any ring
>    *
>    * @rdev: radeon device pointer
> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
> index 7eb1ae7..7d80dad 100644
> --- a/drivers/gpu/drm/radeon/radeon_vce.c
> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> @@ -810,7 +810,8 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   	} else {
> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
> index c6b1cbc..35caa89 100644
> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> @@ -522,7 +522,8 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
> +		(radeon_lockup_timeout) ? radeon_lockup_timeout : MAX_SCHEDULE_TIMEOUT));
>   	if (r) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto error;

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-01-24  9:49 ` Christian König
@ 2016-01-31 18:50   ` Matthew Dawson
  2016-02-03 12:24     ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Dawson @ 2016-01-31 18:50 UTC (permalink / raw)
  To: Christian König; +Cc: dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 11349 bytes --]

On Sunday, January 24, 2016 10:49:00 AM EST Christian König wrote:
> Am 24.01.2016 um 07:18 schrieb Matthew Dawson:
> > When the radeon driver resets a gpu, it attempts to test whether all the
> > rings can successfully handle an IB.  If these rings fail to respond, the
> > process will wait forever.  Another gpu reset can't happen at this point,
> > as the current reset holds a lock required to do so.  Instead, make all
> > the IB tests run with a timeout, so the system can attempt to recover
> > in this case.
> > 
> > While this doesn't fix the underlying issue with card resets failing, it
> > gives the system a higher chance of recovering.  These timeouts have been
> > confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
> > 
> > I haven't been able to test this on other cards, but everything compiles.
> > I wasn't sure what to use for a timeout value, so for now I've used
> > radeon_lockup_timeout.  When that is 0, the timeout functionality in this
> > patch is also disabled.
> > 
> > This also adds a new function, radeon_fence_wait_timeout, that behaves
> > like
> > radeon_fence_wait except allowing a different timeout value to be passed
> > to
> > radeon_fence_wait_seq_timeout.
> > 
> > Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
> 
> Really good idea, but please don't use radeon_lockup_timeout for this
> cause the 10 seconds default of that is way to long. Instead just use a
> fixed, let's say 100ms timeout defined in radeon.h.
I originally tried 100ms, but I found that to be too short (my system would 
just lockup completely (no network even)).  I found 1s is long enough, and 
isn't so long to be annoying.  Would that be ok?

> 
> Also don't define our own radeon_fence_wait_timeout, just use
> fence_wait_timeout(&fence->base, RADEON_IB_TEST_TIMEOUT) for this.
I switched everything over to this, however the ib test always times out after 
a gpu reset (on startup, everything is normal).  I'm not sure why 
fence_wait_timeout isn't being signaled while radeon_fence_wait_seq_timeout 
is.  I'm suspicious that either radeon_fence_enable_signaling is not doing 
something necessary to get the fence completion to actual signal 
radeon_fence_default_wait, or because we hold the exclusive lock during a 
reset radeon_fence_check_lockup never runs and thus never triggers the fence 
or enables some irq lines.

Is it worth digging through the interactions here to get dma-buf fences to 
work correctly during a lockup, or would it be better to just keep 
radeon_fence_wait_timeout?  If option 1 is preferred, I'm happy to learn but I 
need some help learning how it is supposed to work.

> 
> Apart from that the idea looks really good to me.
> 
> Regards,
> Christian.
> 
Thanks,

Matthew

> > ---
> > 
> >   drivers/gpu/drm/radeon/cik_sdma.c     |  3 ++-
> >   drivers/gpu/drm/radeon/r100.c         |  3 ++-
> >   drivers/gpu/drm/radeon/r600.c         |  3 ++-
> >   drivers/gpu/drm/radeon/r600_dma.c     |  3 ++-
> >   drivers/gpu/drm/radeon/radeon.h       |  1 +
> >   drivers/gpu/drm/radeon/radeon_fence.c | 25 ++++++++++++++++++++++---
> >   drivers/gpu/drm/radeon/radeon_vce.c   |  3 ++-
> >   drivers/gpu/drm/radeon/uvd_v1_0.c     |  3 ++-
> >   8 files changed, 35 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/radeon/cik_sdma.c
> > b/drivers/gpu/drm/radeon/cik_sdma.c index d16f2ee..014f5ca 100644
> > --- a/drivers/gpu/drm/radeon/cik_sdma.c
> > +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> > @@ -737,7 +737,8 @@ int cik_sdma_ib_test(struct radeon_device *rdev,
> > struct radeon_ring *ring)> 
> >   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> >   		return r;
> >   	
> >   	}
> > 
> > -	r = radeon_fence_wait(ib.fence, false);
> > +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> > +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
> > MAX_SCHEDULE_TIMEOUT));> 
> >   	if (r) {
> >   	
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> >   		return r;
> > 
> > diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> > index 5eae0a8..5386c09 100644
> > --- a/drivers/gpu/drm/radeon/r100.c
> > +++ b/drivers/gpu/drm/radeon/r100.c
> > @@ -3732,7 +3732,8 @@ int r100_ib_test(struct radeon_device *rdev, struct
> > radeon_ring *ring)> 
> >   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> >   		goto free_ib;
> >   	
> >   	}
> > 
> > -	r = radeon_fence_wait(ib.fence, false);
> > +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> > +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
> > MAX_SCHEDULE_TIMEOUT));> 
> >   	if (r) {
> >   	
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> >   		goto free_ib;
> > 
> > diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> > index cc2fdf0..8fbe5d7 100644
> > --- a/drivers/gpu/drm/radeon/r600.c
> > +++ b/drivers/gpu/drm/radeon/r600.c
> > @@ -3381,7 +3381,8 @@ int r600_ib_test(struct radeon_device *rdev, struct
> > radeon_ring *ring)> 
> >   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> >   		goto free_ib;
> >   	
> >   	}
> > 
> > -	r = radeon_fence_wait(ib.fence, false);
> > +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> > +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
> > MAX_SCHEDULE_TIMEOUT));> 
> >   	if (r) {
> >   	
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> >   		goto free_ib;
> > 
> > diff --git a/drivers/gpu/drm/radeon/r600_dma.c
> > b/drivers/gpu/drm/radeon/r600_dma.c index d2dd29a..3ff614d 100644
> > --- a/drivers/gpu/drm/radeon/r600_dma.c
> > +++ b/drivers/gpu/drm/radeon/r600_dma.c
> > @@ -368,7 +368,8 @@ int r600_dma_ib_test(struct radeon_device *rdev,
> > struct radeon_ring *ring)> 
> >   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
> >   		return r;
> >   	
> >   	}
> > 
> > -	r = radeon_fence_wait(ib.fence, false);
> > +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
> > +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
> > MAX_SCHEDULE_TIMEOUT));> 
> >   	if (r) {
> >   	
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> >   		return r;
> > 
> > diff --git a/drivers/gpu/drm/radeon/radeon.h
> > b/drivers/gpu/drm/radeon/radeon.h index 5ae6db9..0b698b6 100644
> > --- a/drivers/gpu/drm/radeon/radeon.h
> > +++ b/drivers/gpu/drm/radeon/radeon.h
> > @@ -382,6 +382,7 @@ void radeon_fence_driver_force_completion(struct
> > radeon_device *rdev, int ring);> 
> >   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence
> >   **fence, int ring); void radeon_fence_process(struct radeon_device
> >   *rdev, int ring);
> >   bool radeon_fence_signaled(struct radeon_fence *fence);
> > 
> > +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool
> > interruptible, long timeout);> 
> >   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
> >   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
> >   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> > 
> > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c
> > b/drivers/gpu/drm/radeon/radeon_fence.c index 05815c4..9fec805 100644
> > --- a/drivers/gpu/drm/radeon/radeon_fence.c
> > +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> > @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct
> > radeon_device *rdev,> 
> >   }
> >   
> >   /**
> > 
> > - * radeon_fence_wait - wait for a fence to signal
> > + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
> > 
> >    *
> >    * @fence: radeon fence object
> >    * @intr: use interruptible sleep
> > 
> > @@ -535,9 +535,10 @@ static long radeon_fence_wait_seq_timeout(struct
> > radeon_device *rdev,> 
> >    * Wait for the requested fence to signal (all asics).
> >    * @intr selects whether to use interruptable (true) or
> >    non-interruptable
> >    * (false) sleep when waiting for the fence.
> > 
> > + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite
> > wait> 
> >    * Returns 0 if the fence has passed, error for all other cases.
> >    */
> > 
> > -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> > +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long
> > timeout)> 
> >   {
> >   
> >   	uint64_t seq[RADEON_NUM_RINGS] = {};
> >   	long r;
> > 
> > @@ -552,9 +553,11 @@ int radeon_fence_wait(struct radeon_fence *fence,
> > bool intr)> 
> >   		return fence_wait(&fence->base, intr);
> >   	
> >   	seq[fence->ring] = fence->seq;
> > 
> > -	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
> > MAX_SCHEDULE_TIMEOUT); +	r = radeon_fence_wait_seq_timeout(fence->rdev,
> > seq, intr, timeout);> 
> >   	if (r < 0) {
> >   	
> >   		return r;
> > 
> > +	} else if (r == 0) {
> > +		return -ETIMEDOUT;
> > 
> >   	}
> >   	
> >   	r = fence_signal(&fence->base);
> > 
> > @@ -564,6 +567,22 @@ int radeon_fence_wait(struct radeon_fence *fence,
> > bool intr)> 
> >   }
> >   
> >   /**
> > 
> > + * radeon_fence_wait - wait for a fence to signal
> > + *
> > + * @fence: radeon fence object
> > + * @intr: use interruptible sleep
> > + *
> > + * Wait for the requested fence to signal (all asics).
> > + * @intr selects whether to use interruptable (true) or non-interruptable
> > + * (false) sleep when waiting for the fence.
> > + * Returns 0 if the fence has passed, error for all other cases.
> > + */
> > +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> > +{
> > +	return radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> > +}
> > +
> > +/**
> > 
> >    * radeon_fence_wait_any - wait for a fence to signal on any ring
> >    *
> >    * @rdev: radeon device pointer
> > 
> > diff --git a/drivers/gpu/drm/radeon/radeon_vce.c
> > b/drivers/gpu/drm/radeon/radeon_vce.c index 7eb1ae7..7d80dad 100644
> > --- a/drivers/gpu/drm/radeon/radeon_vce.c
> > +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> > @@ -810,7 +810,8 @@ int radeon_vce_ib_test(struct radeon_device *rdev,
> > struct radeon_ring *ring)> 
> >   		goto error;
> >   	
> >   	}
> > 
> > -	r = radeon_fence_wait(fence, false);
> > +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
> > +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
> > MAX_SCHEDULE_TIMEOUT));> 
> >   	if (r) {
> >   	
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> >   	
> >   	} else {
> > 
> > diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c
> > b/drivers/gpu/drm/radeon/uvd_v1_0.c index c6b1cbc..35caa89 100644
> > --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> > +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> > @@ -522,7 +522,8 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev,
> > struct radeon_ring *ring)> 
> >   		goto error;
> >   	
> >   	}
> > 
> > -	r = radeon_fence_wait(fence, false);
> > +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
> > +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
> > MAX_SCHEDULE_TIMEOUT));> 
> >   	if (r) {
> >   	
> >   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> >   		goto error;


-- 
Matthew

[-- Attachment #1.2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5584 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-01-31 18:50   ` Matthew Dawson
@ 2016-02-03 12:24     ` Christian König
  2016-02-08 14:51       ` Matthew Dawson
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2016-02-03 12:24 UTC (permalink / raw)
  To: Matthew Dawson; +Cc: dri-devel

Am 31.01.2016 um 19:50 schrieb Matthew Dawson:
> On Sunday, January 24, 2016 10:49:00 AM EST Christian König wrote:
>> Am 24.01.2016 um 07:18 schrieb Matthew Dawson:
>>> When the radeon driver resets a gpu, it attempts to test whether all the
>>> rings can successfully handle an IB.  If these rings fail to respond, the
>>> process will wait forever.  Another gpu reset can't happen at this point,
>>> as the current reset holds a lock required to do so.  Instead, make all
>>> the IB tests run with a timeout, so the system can attempt to recover
>>> in this case.
>>>
>>> While this doesn't fix the underlying issue with card resets failing, it
>>> gives the system a higher chance of recovering.  These timeouts have been
>>> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>>>
>>> I haven't been able to test this on other cards, but everything compiles.
>>> I wasn't sure what to use for a timeout value, so for now I've used
>>> radeon_lockup_timeout.  When that is 0, the timeout functionality in this
>>> patch is also disabled.
>>>
>>> This also adds a new function, radeon_fence_wait_timeout, that behaves
>>> like
>>> radeon_fence_wait except allowing a different timeout value to be passed
>>> to
>>> radeon_fence_wait_seq_timeout.
>>>
>>> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
>> Really good idea, but please don't use radeon_lockup_timeout for this
>> cause the 10 seconds default of that is way to long. Instead just use a
>> fixed, let's say 100ms timeout defined in radeon.h.
> I originally tried 100ms, but I found that to be too short (my system would
> just lockup completely (no network even)).  I found 1s is long enough, and
> isn't so long to be annoying.  Would that be ok?

Yeah, 1s works as well. But that 100ms shouldn't be enough sounds like 
another bug to me.

>
>> Also don't define our own radeon_fence_wait_timeout, just use
>> fence_wait_timeout(&fence->base, RADEON_IB_TEST_TIMEOUT) for this.
> I switched everything over to this, however the ib test always times out after
> a gpu reset (on startup, everything is normal).  I'm not sure why
> fence_wait_timeout isn't being signaled while radeon_fence_wait_seq_timeout
> is.  I'm suspicious that either radeon_fence_enable_signaling is not doing
> something necessary to get the fence completion to actual signal
> radeon_fence_default_wait, or because we hold the exclusive lock during a
> reset radeon_fence_check_lockup never runs and thus never triggers the fence
> or enables some irq lines.
>
> Is it worth digging through the interactions here to get dma-buf fences to
> work correctly during a lockup, or would it be better to just keep
> radeon_fence_wait_timeout?  If option 1 is preferred, I'm happy to learn but I
> need some help learning how it is supposed to work.

In this case feel free to add the radeon_fence_wait_timeout. It's 
probably a good idea to get rid of the exclusive lock sooner or later in 
radeon as well, but that really is a long term project.

If you're really bored I can give you tons of input where to start with 
that.

Regards,
Christian.

>
>> Apart from that the idea looks really good to me.
>>
>> Regards,
>> Christian.
>>
> Thanks,
>
> Matthew
>
>>> ---
>>>
>>>    drivers/gpu/drm/radeon/cik_sdma.c     |  3 ++-
>>>    drivers/gpu/drm/radeon/r100.c         |  3 ++-
>>>    drivers/gpu/drm/radeon/r600.c         |  3 ++-
>>>    drivers/gpu/drm/radeon/r600_dma.c     |  3 ++-
>>>    drivers/gpu/drm/radeon/radeon.h       |  1 +
>>>    drivers/gpu/drm/radeon/radeon_fence.c | 25 ++++++++++++++++++++++---
>>>    drivers/gpu/drm/radeon/radeon_vce.c   |  3 ++-
>>>    drivers/gpu/drm/radeon/uvd_v1_0.c     |  3 ++-
>>>    8 files changed, 35 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c
>>> b/drivers/gpu/drm/radeon/cik_sdma.c index d16f2ee..014f5ca 100644
>>> --- a/drivers/gpu/drm/radeon/cik_sdma.c
>>> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
>>> @@ -737,7 +737,8 @@ int cik_sdma_ib_test(struct radeon_device *rdev,
>>> struct radeon_ring *ring)>
>>>    		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>>    		return r;
>>>    	
>>>    	}
>>>
>>> -	r = radeon_fence_wait(ib.fence, false);
>>> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
>>> +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
>>> MAX_SCHEDULE_TIMEOUT));>
>>>    	if (r) {
>>>    	
>>>    		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>    		return r;
>>>
>>> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
>>> index 5eae0a8..5386c09 100644
>>> --- a/drivers/gpu/drm/radeon/r100.c
>>> +++ b/drivers/gpu/drm/radeon/r100.c
>>> @@ -3732,7 +3732,8 @@ int r100_ib_test(struct radeon_device *rdev, struct
>>> radeon_ring *ring)>
>>>    		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>>    		goto free_ib;
>>>    	
>>>    	}
>>>
>>> -	r = radeon_fence_wait(ib.fence, false);
>>> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
>>> +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
>>> MAX_SCHEDULE_TIMEOUT));>
>>>    	if (r) {
>>>    	
>>>    		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>    		goto free_ib;
>>>
>>> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
>>> index cc2fdf0..8fbe5d7 100644
>>> --- a/drivers/gpu/drm/radeon/r600.c
>>> +++ b/drivers/gpu/drm/radeon/r600.c
>>> @@ -3381,7 +3381,8 @@ int r600_ib_test(struct radeon_device *rdev, struct
>>> radeon_ring *ring)>
>>>    		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>>    		goto free_ib;
>>>    	
>>>    	}
>>>
>>> -	r = radeon_fence_wait(ib.fence, false);
>>> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
>>> +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
>>> MAX_SCHEDULE_TIMEOUT));>
>>>    	if (r) {
>>>    	
>>>    		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>    		goto free_ib;
>>>
>>> diff --git a/drivers/gpu/drm/radeon/r600_dma.c
>>> b/drivers/gpu/drm/radeon/r600_dma.c index d2dd29a..3ff614d 100644
>>> --- a/drivers/gpu/drm/radeon/r600_dma.c
>>> +++ b/drivers/gpu/drm/radeon/r600_dma.c
>>> @@ -368,7 +368,8 @@ int r600_dma_ib_test(struct radeon_device *rdev,
>>> struct radeon_ring *ring)>
>>>    		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>>    		return r;
>>>    	
>>>    	}
>>>
>>> -	r = radeon_fence_wait(ib.fence, false);
>>> +	r = radeon_fence_wait_timeout(ib.fence, false, msecs_to_jiffies(
>>> +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
>>> MAX_SCHEDULE_TIMEOUT));>
>>>    	if (r) {
>>>    	
>>>    		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>    		return r;
>>>
>>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>>> b/drivers/gpu/drm/radeon/radeon.h index 5ae6db9..0b698b6 100644
>>> --- a/drivers/gpu/drm/radeon/radeon.h
>>> +++ b/drivers/gpu/drm/radeon/radeon.h
>>> @@ -382,6 +382,7 @@ void radeon_fence_driver_force_completion(struct
>>> radeon_device *rdev, int ring);>
>>>    int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence
>>>    **fence, int ring); void radeon_fence_process(struct radeon_device
>>>    *rdev, int ring);
>>>    bool radeon_fence_signaled(struct radeon_fence *fence);
>>>
>>> +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool
>>> interruptible, long timeout);>
>>>    int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>>>    int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>>>    int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
>>>
>>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c
>>> b/drivers/gpu/drm/radeon/radeon_fence.c index 05815c4..9fec805 100644
>>> --- a/drivers/gpu/drm/radeon/radeon_fence.c
>>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
>>> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct
>>> radeon_device *rdev,>
>>>    }
>>>    
>>>    /**
>>>
>>> - * radeon_fence_wait - wait for a fence to signal
>>> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>>>
>>>     *
>>>     * @fence: radeon fence object
>>>     * @intr: use interruptible sleep
>>>
>>> @@ -535,9 +535,10 @@ static long radeon_fence_wait_seq_timeout(struct
>>> radeon_device *rdev,>
>>>     * Wait for the requested fence to signal (all asics).
>>>     * @intr selects whether to use interruptable (true) or
>>>     non-interruptable
>>>     * (false) sleep when waiting for the fence.
>>>
>>> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite
>>> wait>
>>>     * Returns 0 if the fence has passed, error for all other cases.
>>>     */
>>>
>>> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>>> +int radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long
>>> timeout)>
>>>    {
>>>    
>>>    	uint64_t seq[RADEON_NUM_RINGS] = {};
>>>    	long r;
>>>
>>> @@ -552,9 +553,11 @@ int radeon_fence_wait(struct radeon_fence *fence,
>>> bool intr)>
>>>    		return fence_wait(&fence->base, intr);
>>>    	
>>>    	seq[fence->ring] = fence->seq;
>>>
>>> -	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>>> MAX_SCHEDULE_TIMEOUT); +	r = radeon_fence_wait_seq_timeout(fence->rdev,
>>> seq, intr, timeout);>
>>>    	if (r < 0) {
>>>    	
>>>    		return r;
>>>
>>> +	} else if (r == 0) {
>>> +		return -ETIMEDOUT;
>>>
>>>    	}
>>>    	
>>>    	r = fence_signal(&fence->base);
>>>
>>> @@ -564,6 +567,22 @@ int radeon_fence_wait(struct radeon_fence *fence,
>>> bool intr)>
>>>    }
>>>    
>>>    /**
>>>
>>> + * radeon_fence_wait - wait for a fence to signal
>>> + *
>>> + * @fence: radeon fence object
>>> + * @intr: use interruptible sleep
>>> + *
>>> + * Wait for the requested fence to signal (all asics).
>>> + * @intr selects whether to use interruptable (true) or non-interruptable
>>> + * (false) sleep when waiting for the fence.
>>> + * Returns 0 if the fence has passed, error for all other cases.
>>> + */
>>> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>>> +{
>>> +	return radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
>>> +}
>>> +
>>> +/**
>>>
>>>     * radeon_fence_wait_any - wait for a fence to signal on any ring
>>>     *
>>>     * @rdev: radeon device pointer
>>>
>>> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c
>>> b/drivers/gpu/drm/radeon/radeon_vce.c index 7eb1ae7..7d80dad 100644
>>> --- a/drivers/gpu/drm/radeon/radeon_vce.c
>>> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
>>> @@ -810,7 +810,8 @@ int radeon_vce_ib_test(struct radeon_device *rdev,
>>> struct radeon_ring *ring)>
>>>    		goto error;
>>>    	
>>>    	}
>>>
>>> -	r = radeon_fence_wait(fence, false);
>>> +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
>>> +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
>>> MAX_SCHEDULE_TIMEOUT));>
>>>    	if (r) {
>>>    	
>>>    		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>    	
>>>    	} else {
>>>
>>> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c
>>> b/drivers/gpu/drm/radeon/uvd_v1_0.c index c6b1cbc..35caa89 100644
>>> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
>>> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
>>> @@ -522,7 +522,8 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev,
>>> struct radeon_ring *ring)>
>>>    		goto error;
>>>    	
>>>    	}
>>>
>>> -	r = radeon_fence_wait(fence, false);
>>> +	r = radeon_fence_wait_timeout(fence, false, msecs_to_jiffies(
>>> +		(radeon_lockup_timeout) ? radeon_lockup_timeout :
>>> MAX_SCHEDULE_TIMEOUT));>
>>>    	if (r) {
>>>    	
>>>    		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>>    		goto error;
>

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-02-03 12:24     ` Christian König
@ 2016-02-08 14:51       ` Matthew Dawson
  2016-02-08 16:55         ` Bruno Kant
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Dawson @ 2016-02-08 14:51 UTC (permalink / raw)
  To: Christian König; +Cc: dri-devel


[-- Attachment #1.1: Type: text/plain, Size: 3906 bytes --]

On Wednesday, February 3, 2016 1:24:49 PM EST Christian König wrote:
> Am 31.01.2016 um 19:50 schrieb Matthew Dawson:
> > On Sunday, January 24, 2016 10:49:00 AM EST Christian König wrote:
> >> Am 24.01.2016 um 07:18 schrieb Matthew Dawson:
> >>> When the radeon driver resets a gpu, it attempts to test whether all the
> >>> rings can successfully handle an IB.  If these rings fail to respond,
> >>> the
> >>> process will wait forever.  Another gpu reset can't happen at this
> >>> point,
> >>> as the current reset holds a lock required to do so.  Instead, make all
> >>> the IB tests run with a timeout, so the system can attempt to recover
> >>> in this case.
> >>> 
> >>> While this doesn't fix the underlying issue with card resets failing, it
> >>> gives the system a higher chance of recovering.  These timeouts have
> >>> been
> >>> confirmed to help both a Tathi and Hawaii card recover after a gpu
> >>> reset.
> >>> 
> >>> I haven't been able to test this on other cards, but everything
> >>> compiles.
> >>> I wasn't sure what to use for a timeout value, so for now I've used
> >>> radeon_lockup_timeout.  When that is 0, the timeout functionality in
> >>> this
> >>> patch is also disabled.
> >>> 
> >>> This also adds a new function, radeon_fence_wait_timeout, that behaves
> >>> like
> >>> radeon_fence_wait except allowing a different timeout value to be passed
> >>> to
> >>> radeon_fence_wait_seq_timeout.
> >>> 
> >>> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
> >> 
> >> Really good idea, but please don't use radeon_lockup_timeout for this
> >> cause the 10 seconds default of that is way to long. Instead just use a
> >> fixed, let's say 100ms timeout defined in radeon.h.
> > 
> > I originally tried 100ms, but I found that to be too short (my system
> > would
> > just lockup completely (no network even)).  I found 1s is long enough, and
> > isn't so long to be annoying.  Would that be ok?
> 
> Yeah, 1s works as well. But that 100ms shouldn't be enough sounds like
> another bug to me.
Probably, but it has a tendency to crash my entire system, so I'm not sure how 
to debug it further.  Netconsole maybe?  I probably won't be able to dig into 
this due to time since 1s does work.

> 
> >> Also don't define our own radeon_fence_wait_timeout, just use
> >> fence_wait_timeout(&fence->base, RADEON_IB_TEST_TIMEOUT) for this.
> > 
> > I switched everything over to this, however the ib test always times out
> > after a gpu reset (on startup, everything is normal).  I'm not sure why
> > fence_wait_timeout isn't being signaled while
> > radeon_fence_wait_seq_timeout
> > is.  I'm suspicious that either radeon_fence_enable_signaling is not doing
> > something necessary to get the fence completion to actual signal
> > radeon_fence_default_wait, or because we hold the exclusive lock during a
> > reset radeon_fence_check_lockup never runs and thus never triggers the
> > fence or enables some irq lines.
> > 
> > Is it worth digging through the interactions here to get dma-buf fences to
> > work correctly during a lockup, or would it be better to just keep
> > radeon_fence_wait_timeout?  If option 1 is preferred, I'm happy to learn
> > but I need some help learning how it is supposed to work.
> 
> In this case feel free to add the radeon_fence_wait_timeout. It's
> probably a good idea to get rid of the exclusive lock sooner or later in
> radeon as well, but that really is a long term project.
> 
> If you're really bored I can give you tons of input where to start with
> that.
That sounds like a good project to learn more about the driver, so I'd be 
interested.  I don't have a lot of time right now to play with it though.  I 
do want to finish off the tf2 crasher first as well.  Once I have time I'll 
poke the list to get started.

Thanks for the reviews!
-- 
Matthew

[-- Attachment #1.2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-02-08 14:51       ` Matthew Dawson
@ 2016-02-08 16:55         ` Bruno Kant
  0 siblings, 0 replies; 9+ messages in thread
From: Bruno Kant @ 2016-02-08 16:55 UTC (permalink / raw)
  To: Matthew Dawson; +Cc: dri-devel

Le 2016-02-08 15:51, Matthew Dawson a écrit :
>> Yeah, 1s works as well. But that 100ms shouldn't be enough sounds like
>> another bug to me.
> Probably, but it has a tendency to crash my entire system, so I'm not 
> sure how
> to debug it further.  Netconsole maybe?  I probably won't be able to 
> dig into
> this due to time since 1s does work.
> 

Hello,

netconsole won't help if the box gets killed. IP, USB, everything should 
be dead on a hang. I have a netconsole running, but I see nothing on it 
when my box hangs.

I'm now trying to use crash dumps, they could help you also. It dumps a 
core, plus a short text file with the main kernel panic message. If your 
box panics, you would get at least the text file (sort of main cause).

The core dump itself must later be read with kernel debug symbols, this 
can be more difficult to set up. But try kdump, this was easy to set up 
on my Fedora box, look at this tuto:

https://fedoraproject.org/wiki/How_to_use_kdump_to_debug_kernel_crashes

You may also later need a custom built kernel, or a debug kernel (which 
could slow down your box). I have noticed some kernel debug options 
which generate panics for different kind of kernel locks or oops.

Best regards
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-02-08  9:38 ` Christian König
@ 2016-02-08 15:42   ` Alex Deucher
  0 siblings, 0 replies; 9+ messages in thread
From: Alex Deucher @ 2016-02-08 15:42 UTC (permalink / raw)
  To: Christian König; +Cc: Matthew Dawson, Maling list - DRI developers

On Mon, Feb 8, 2016 at 4:38 AM, Christian König <deathsimple@vodafone.de> wrote:
> Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
>>
>> When the radeon driver resets a gpu, it attempts to test whether all the
>> rings can successfully handle an IB.  If these rings fail to respond, the
>> process will wait forever.  Another gpu reset can't happen at this point,
>> as the current reset holds a lock required to do so.  Instead, make all
>> the IB tests run with a timeout, so the system can attempt to recover
>> in this case.
>>
>> While this doesn't fix the underlying issue with card resets failing, it
>> gives the system a higher chance of recovering.  These timeouts have been
>> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>>
>> This also adds a new function, radeon_fence_wait_timeout, that behaves
>> like
>> fence_wait_timeout.  It is used instead of fence_wait_timeout as it
>> continues
>> to work during a reset.  radeon_fence_wait is changed to be implemented
>> using this function.
>>
>> V2:
>>   - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout
>> was
>> too long.  A timeout of 100ms was tested and found to be too short.
>>   - Changed radeon_fence_wait_timeout to behave more like
>> fence_wait_timeout.
>>
>> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
>
>
> Reviewed-by: Christian König <christian.koenig@amd.com>


Applied.  Thanks!

Alex

>
> Regards,
> Christian.
>
>
>> ---
>>   drivers/gpu/drm/radeon/cik.c          | 11 ++++++++--
>>   drivers/gpu/drm/radeon/cik_sdma.c     |  9 ++++++--
>>   drivers/gpu/drm/radeon/r100.c         | 10 +++++++--
>>   drivers/gpu/drm/radeon/r600.c         | 10 +++++++--
>>   drivers/gpu/drm/radeon/r600_dma.c     |  9 ++++++--
>>   drivers/gpu/drm/radeon/radeon.h       |  2 ++
>>   drivers/gpu/drm/radeon/radeon_fence.c | 40
>> ++++++++++++++++++++++++++++-------
>>   drivers/gpu/drm/radeon/radeon_vce.c   | 11 +++++++---
>>   drivers/gpu/drm/radeon/uvd_v1_0.c     | 10 +++++++--
>>   9 files changed, 89 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
>> index 4c30d8c..0600140 100644
>> --- a/drivers/gpu/drm/radeon/cik.c
>> +++ b/drivers/gpu/drm/radeon/cik.c
>> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct
>> radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 return r;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 radeon_scratch_free(rdev, scratch);
>>                 radeon_ib_free(rdev, &ib);
>>                 return r;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               radeon_scratch_free(rdev, scratch);
>> +               radeon_ib_free(rdev, &ib);
>> +               return -ETIMEDOUT;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = RREG32(scratch);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c
>> b/drivers/gpu/drm/radeon/cik_sdma.c
>> index d16f2ee..9c351dc 100644
>> --- a/drivers/gpu/drm/radeon/cik_sdma.c
>> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
>> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 return r;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 return r;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               return -ETIMEDOUT;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
>> index 5eae0a8..6e478a2 100644
>> --- a/drivers/gpu/drm/radeon/r100.c
>> +++ b/drivers/gpu/drm/radeon/r100.c
>> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 goto free_ib;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 goto free_ib;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>> +               goto free_ib;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = RREG32(scratch);
>>                 if (tmp == 0xDEADBEEF) {
>> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
>> index cc2fdf0..ed12104 100644
>> --- a/drivers/gpu/drm/radeon/r600.c
>> +++ b/drivers/gpu/drm/radeon/r600.c
>> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 goto free_ib;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 goto free_ib;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>> +               goto free_ib;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = RREG32(scratch);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/r600_dma.c
>> b/drivers/gpu/drm/radeon/r600_dma.c
>> index d2dd29a..fb65e6f 100644
>> --- a/drivers/gpu/drm/radeon/r600_dma.c
>> +++ b/drivers/gpu/drm/radeon/r600_dma.c
>> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>>                 return r;
>>         }
>> -       r = radeon_fence_wait(ib.fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 return r;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               return -ETIMEDOUT;
>>         }
>> +       r = 0;
>>         for (i = 0; i < rdev->usec_timeout; i++) {
>>                 tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>>                 if (tmp == 0xDEADBEEF)
>> diff --git a/drivers/gpu/drm/radeon/radeon.h
>> b/drivers/gpu/drm/radeon/radeon.h
>> index 78a51b3..007be29 100644
>> --- a/drivers/gpu/drm/radeon/radeon.h
>> +++ b/drivers/gpu/drm/radeon/radeon.h
>> @@ -120,6 +120,7 @@ extern int radeon_mst;
>>    */
>>   #define RADEON_MAX_USEC_TIMEOUT                       100000  /* 100 ms
>> */
>>   #define RADEON_FENCE_JIFFIES_TIMEOUT          (HZ / 2)
>> +#define RADEON_USEC_IB_TEST_TIMEOUT            1000000 /* 1s */
>>   /* RADEON_IB_POOL_SIZE must be a power of 2 */
>>   #define RADEON_IB_POOL_SIZE                   16
>>   #define RADEON_DEBUGFS_MAX_COMPONENTS         32
>> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct
>> radeon_device *rdev, int ring);
>>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence
>> **fence, int ring);
>>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>>   bool radeon_fence_signaled(struct radeon_fence *fence);
>> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool
>> interruptible, long timeout);
>>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c
>> b/drivers/gpu/drm/radeon/radeon_fence.c
>> index 05815c4..7ef075a 100644
>> --- a/drivers/gpu/drm/radeon/radeon_fence.c
>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
>> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct
>> radeon_device *rdev,
>>   }
>>     /**
>> - * radeon_fence_wait - wait for a fence to signal
>> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>>    *
>>    * @fence: radeon fence object
>>    * @intr: use interruptible sleep
>> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct
>> radeon_device *rdev,
>>    * Wait for the requested fence to signal (all asics).
>>    * @intr selects whether to use interruptable (true) or
>> non-interruptable
>>    * (false) sleep when waiting for the fence.
>> - * Returns 0 if the fence has passed, error for all other cases.
>> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite
>> wait
>> + * Returns remaining time if the sequence number has passed, 0 when
>> + * the wait timeout, or an error for all other cases.
>>    */
>> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr,
>> long timeout)
>>   {
>>         uint64_t seq[RADEON_NUM_RINGS] = {};
>>         long r;
>> +       int r_sig;
>>         /*
>>          * This function should not be called on !radeon fences.
>> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence,
>> bool intr)
>>                 return fence_wait(&fence->base, intr);
>>         seq[fence->ring] = fence->seq;
>> -       r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>> MAX_SCHEDULE_TIMEOUT);
>> -       if (r < 0) {
>> +       r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr,
>> timeout);
>> +       if (r <= 0) {
>>                 return r;
>>         }
>>   -     r = fence_signal(&fence->base);
>> -       if (!r)
>> +       r_sig = fence_signal(&fence->base);
>> +       if (!r_sig)
>>                 FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
>> -       return 0;
>> +       return r;
>> +}
>> +
>> +/**
>> + * radeon_fence_wait - wait for a fence to signal
>> + *
>> + * @fence: radeon fence object
>> + * @intr: use interruptible sleep
>> + *
>> + * Wait for the requested fence to signal (all asics).
>> + * @intr selects whether to use interruptable (true) or non-interruptable
>> + * (false) sleep when waiting for the fence.
>> + * Returns 0 if the fence has passed, error for all other cases.
>> + */
>> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>> +{
>> +       long r = radeon_fence_wait_timeout(fence, intr,
>> MAX_SCHEDULE_TIMEOUT);
>> +       if (r > 0) {
>> +               return 0;
>> +       } else {
>> +               return r;
>> +       }
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c
>> b/drivers/gpu/drm/radeon/radeon_vce.c
>> index 7eb1ae7..566a1a0 100644
>> --- a/drivers/gpu/drm/radeon/radeon_vce.c
>> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
>> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 goto error;
>>         }
>>   -     r = radeon_fence_wait(fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>>         } else {
>> -               DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> +               DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
>> +               r = 0;
>>         }
>>   error:
>>         radeon_fence_unref(&fence);
>> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c
>> b/drivers/gpu/drm/radeon/uvd_v1_0.c
>> index c6b1cbc..12ddcfa 100644
>> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
>> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
>> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev,
>> struct radeon_ring *ring)
>>                 goto error;
>>         }
>>   -     r = radeon_fence_wait(fence, false);
>> -       if (r) {
>> +       r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
>> +               RADEON_USEC_IB_TEST_TIMEOUT));
>> +       if (r < 0) {
>>                 DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>>                 goto error;
>> +       } else if (r == 0) {
>> +               DRM_ERROR("radeon: fence wait timed out.\n");
>> +               r = -ETIMEDOUT;
>> +               goto error;
>>         }
>> +       r = 0;
>>         DRM_INFO("ib test on ring %d succeeded\n",  ring->idx);
>>   error:
>>         radeon_fence_unref(&fence);
>
>
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
  2016-02-07 21:51 Matthew Dawson
@ 2016-02-08  9:38 ` Christian König
  2016-02-08 15:42   ` Alex Deucher
  0 siblings, 1 reply; 9+ messages in thread
From: Christian König @ 2016-02-08  9:38 UTC (permalink / raw)
  To: Matthew Dawson, dri-devel

Am 07.02.2016 um 22:51 schrieb Matthew Dawson:
> When the radeon driver resets a gpu, it attempts to test whether all the
> rings can successfully handle an IB.  If these rings fail to respond, the
> process will wait forever.  Another gpu reset can't happen at this point,
> as the current reset holds a lock required to do so.  Instead, make all
> the IB tests run with a timeout, so the system can attempt to recover
> in this case.
>
> While this doesn't fix the underlying issue with card resets failing, it
> gives the system a higher chance of recovering.  These timeouts have been
> confirmed to help both a Tathi and Hawaii card recover after a gpu reset.
>
> This also adds a new function, radeon_fence_wait_timeout, that behaves like
> fence_wait_timeout.  It is used instead of fence_wait_timeout as it continues
> to work during a reset.  radeon_fence_wait is changed to be implemented
> using this function.
>
> V2:
>   - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was
> too long.  A timeout of 100ms was tested and found to be too short.
>   - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout.
>
> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>

Reviewed-by: Christian König <christian.koenig@amd.com>

Regards,
Christian.

> ---
>   drivers/gpu/drm/radeon/cik.c          | 11 ++++++++--
>   drivers/gpu/drm/radeon/cik_sdma.c     |  9 ++++++--
>   drivers/gpu/drm/radeon/r100.c         | 10 +++++++--
>   drivers/gpu/drm/radeon/r600.c         | 10 +++++++--
>   drivers/gpu/drm/radeon/r600_dma.c     |  9 ++++++--
>   drivers/gpu/drm/radeon/radeon.h       |  2 ++
>   drivers/gpu/drm/radeon/radeon_fence.c | 40 ++++++++++++++++++++++++++++-------
>   drivers/gpu/drm/radeon/radeon_vce.c   | 11 +++++++---
>   drivers/gpu/drm/radeon/uvd_v1_0.c     | 10 +++++++--
>   9 files changed, 89 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
> index 4c30d8c..0600140 100644
> --- a/drivers/gpu/drm/radeon/cik.c
> +++ b/drivers/gpu/drm/radeon/cik.c
> @@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		radeon_scratch_free(rdev, scratch);
>   		radeon_ib_free(rdev, &ib);
>   		return r;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		radeon_scratch_free(rdev, scratch);
> +		radeon_ib_free(rdev, &ib);
> +		return -ETIMEDOUT;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
> index d16f2ee..9c351dc 100644
> --- a/drivers/gpu/drm/radeon/cik_sdma.c
> +++ b/drivers/gpu/drm/radeon/cik_sdma.c
> @@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		return -ETIMEDOUT;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
> index 5eae0a8..6e478a2 100644
> --- a/drivers/gpu/drm/radeon/r100.c
> +++ b/drivers/gpu/drm/radeon/r100.c
> @@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
> +		goto free_ib;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
>   		if (tmp == 0xDEADBEEF) {
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index cc2fdf0..ed12104 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		goto free_ib;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto free_ib;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
> +		goto free_ib;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = RREG32(scratch);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
> index d2dd29a..fb65e6f 100644
> --- a/drivers/gpu/drm/radeon/r600_dma.c
> +++ b/drivers/gpu/drm/radeon/r600_dma.c
> @@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
>   		return r;
>   	}
> -	r = radeon_fence_wait(ib.fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		return r;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		return -ETIMEDOUT;
>   	}
> +	r = 0;
>   	for (i = 0; i < rdev->usec_timeout; i++) {
>   		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
>   		if (tmp == 0xDEADBEEF)
> diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
> index 78a51b3..007be29 100644
> --- a/drivers/gpu/drm/radeon/radeon.h
> +++ b/drivers/gpu/drm/radeon/radeon.h
> @@ -120,6 +120,7 @@ extern int radeon_mst;
>    */
>   #define RADEON_MAX_USEC_TIMEOUT			100000	/* 100 ms */
>   #define RADEON_FENCE_JIFFIES_TIMEOUT		(HZ / 2)
> +#define RADEON_USEC_IB_TEST_TIMEOUT		1000000 /* 1s */
>   /* RADEON_IB_POOL_SIZE must be a power of 2 */
>   #define RADEON_IB_POOL_SIZE			16
>   #define RADEON_DEBUGFS_MAX_COMPONENTS		32
> @@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
>   int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
>   void radeon_fence_process(struct radeon_device *rdev, int ring);
>   bool radeon_fence_signaled(struct radeon_fence *fence);
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
>   int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
>   int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
>   int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
> index 05815c4..7ef075a 100644
> --- a/drivers/gpu/drm/radeon/radeon_fence.c
> +++ b/drivers/gpu/drm/radeon/radeon_fence.c
> @@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>   }
>   
>   /**
> - * radeon_fence_wait - wait for a fence to signal
> + * radeon_fence_wait_timeout - wait for a fence to signal with timeout
>    *
>    * @fence: radeon fence object
>    * @intr: use interruptible sleep
> @@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
>    * Wait for the requested fence to signal (all asics).
>    * @intr selects whether to use interruptable (true) or non-interruptable
>    * (false) sleep when waiting for the fence.
> - * Returns 0 if the fence has passed, error for all other cases.
> + * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
> + * Returns remaining time if the sequence number has passed, 0 when
> + * the wait timeout, or an error for all other cases.
>    */
> -int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
>   {
>   	uint64_t seq[RADEON_NUM_RINGS] = {};
>   	long r;
> +	int r_sig;
>   
>   	/*
>   	 * This function should not be called on !radeon fences.
> @@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
>   		return fence_wait(&fence->base, intr);
>   
>   	seq[fence->ring] = fence->seq;
> -	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
> -	if (r < 0) {
> +	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
> +	if (r <= 0) {
>   		return r;
>   	}
>   
> -	r = fence_signal(&fence->base);
> -	if (!r)
> +	r_sig = fence_signal(&fence->base);
> +	if (!r_sig)
>   		FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
> -	return 0;
> +	return r;
> +}
> +
> +/**
> + * radeon_fence_wait - wait for a fence to signal
> + *
> + * @fence: radeon fence object
> + * @intr: use interruptible sleep
> + *
> + * Wait for the requested fence to signal (all asics).
> + * @intr selects whether to use interruptable (true) or non-interruptable
> + * (false) sleep when waiting for the fence.
> + * Returns 0 if the fence has passed, error for all other cases.
> + */
> +int radeon_fence_wait(struct radeon_fence *fence, bool intr)
> +{
> +	long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
> +	if (r > 0) {
> +		return 0;
> +	} else {
> +		return r;
> +	}
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
> index 7eb1ae7..566a1a0 100644
> --- a/drivers/gpu/drm/radeon/radeon_vce.c
> +++ b/drivers/gpu/drm/radeon/radeon_vce.c
> @@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
>   	} else {
> -	        DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> +		DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
> +		r = 0;
>   	}
>   error:
>   	radeon_fence_unref(&fence);
> diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
> index c6b1cbc..12ddcfa 100644
> --- a/drivers/gpu/drm/radeon/uvd_v1_0.c
> +++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
> @@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
>   		goto error;
>   	}
>   
> -	r = radeon_fence_wait(fence, false);
> -	if (r) {
> +	r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
> +		RADEON_USEC_IB_TEST_TIMEOUT));
> +	if (r < 0) {
>   		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
>   		goto error;
> +	} else if (r == 0) {
> +		DRM_ERROR("radeon: fence wait timed out.\n");
> +		r = -ETIMEDOUT;
> +		goto error;
>   	}
> +	r = 0;
>   	DRM_INFO("ib test on ring %d succeeded\n",  ring->idx);
>   error:
>   	radeon_fence_unref(&fence);

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests.
@ 2016-02-07 21:51 Matthew Dawson
  2016-02-08  9:38 ` Christian König
  0 siblings, 1 reply; 9+ messages in thread
From: Matthew Dawson @ 2016-02-07 21:51 UTC (permalink / raw)
  To: dri-devel

When the radeon driver resets a gpu, it attempts to test whether all the
rings can successfully handle an IB.  If these rings fail to respond, the
process will wait forever.  Another gpu reset can't happen at this point,
as the current reset holds a lock required to do so.  Instead, make all
the IB tests run with a timeout, so the system can attempt to recover
in this case.

While this doesn't fix the underlying issue with card resets failing, it
gives the system a higher chance of recovering.  These timeouts have been
confirmed to help both a Tathi and Hawaii card recover after a gpu reset.

This also adds a new function, radeon_fence_wait_timeout, that behaves like
fence_wait_timeout.  It is used instead of fence_wait_timeout as it continues
to work during a reset.  radeon_fence_wait is changed to be implemented
using this function.

V2:
 - Changed the timeout to 1s, as the default 10s from radeon_wait_timeout was
too long.  A timeout of 100ms was tested and found to be too short.
 - Changed radeon_fence_wait_timeout to behave more like fence_wait_timeout.

Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
---
 drivers/gpu/drm/radeon/cik.c          | 11 ++++++++--
 drivers/gpu/drm/radeon/cik_sdma.c     |  9 ++++++--
 drivers/gpu/drm/radeon/r100.c         | 10 +++++++--
 drivers/gpu/drm/radeon/r600.c         | 10 +++++++--
 drivers/gpu/drm/radeon/r600_dma.c     |  9 ++++++--
 drivers/gpu/drm/radeon/radeon.h       |  2 ++
 drivers/gpu/drm/radeon/radeon_fence.c | 40 ++++++++++++++++++++++++++++-------
 drivers/gpu/drm/radeon/radeon_vce.c   | 11 +++++++---
 drivers/gpu/drm/radeon/uvd_v1_0.c     | 10 +++++++--
 9 files changed, 89 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index 4c30d8c..0600140 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -4219,13 +4219,20 @@ int cik_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		return r;
 	}
-	r = radeon_fence_wait(ib.fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		radeon_scratch_free(rdev, scratch);
 		radeon_ib_free(rdev, &ib);
 		return r;
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		radeon_scratch_free(rdev, scratch);
+		radeon_ib_free(rdev, &ib);
+		return -ETIMEDOUT;
 	}
+	r = 0;
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = RREG32(scratch);
 		if (tmp == 0xDEADBEEF)
diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c
index d16f2ee..9c351dc 100644
--- a/drivers/gpu/drm/radeon/cik_sdma.c
+++ b/drivers/gpu/drm/radeon/cik_sdma.c
@@ -737,11 +737,16 @@ int cik_sdma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		return r;
 	}
-	r = radeon_fence_wait(ib.fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		return r;
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		return -ETIMEDOUT;
 	}
+	r = 0;
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
 		if (tmp == 0xDEADBEEF)
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 5eae0a8..6e478a2 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -3732,11 +3732,17 @@ int r100_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		goto free_ib;
 	}
-	r = radeon_fence_wait(ib.fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		goto free_ib;
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		r = -ETIMEDOUT;
+		goto free_ib;
 	}
+	r = 0;
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = RREG32(scratch);
 		if (tmp == 0xDEADBEEF) {
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index cc2fdf0..ed12104 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -3381,11 +3381,17 @@ int r600_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		goto free_ib;
 	}
-	r = radeon_fence_wait(ib.fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		goto free_ib;
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		r = -ETIMEDOUT;
+		goto free_ib;
 	}
+	r = 0;
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = RREG32(scratch);
 		if (tmp == 0xDEADBEEF)
diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c
index d2dd29a..fb65e6f 100644
--- a/drivers/gpu/drm/radeon/r600_dma.c
+++ b/drivers/gpu/drm/radeon/r600_dma.c
@@ -368,11 +368,16 @@ int r600_dma_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		DRM_ERROR("radeon: failed to schedule ib (%d).\n", r);
 		return r;
 	}
-	r = radeon_fence_wait(ib.fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(ib.fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		return r;
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		return -ETIMEDOUT;
 	}
+	r = 0;
 	for (i = 0; i < rdev->usec_timeout; i++) {
 		tmp = le32_to_cpu(rdev->wb.wb[index/4]);
 		if (tmp == 0xDEADBEEF)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 78a51b3..007be29 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -120,6 +120,7 @@ extern int radeon_mst;
  */
 #define RADEON_MAX_USEC_TIMEOUT			100000	/* 100 ms */
 #define RADEON_FENCE_JIFFIES_TIMEOUT		(HZ / 2)
+#define RADEON_USEC_IB_TEST_TIMEOUT		1000000 /* 1s */
 /* RADEON_IB_POOL_SIZE must be a power of 2 */
 #define RADEON_IB_POOL_SIZE			16
 #define RADEON_DEBUGFS_MAX_COMPONENTS		32
@@ -382,6 +383,7 @@ void radeon_fence_driver_force_completion(struct radeon_device *rdev, int ring);
 int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence **fence, int ring);
 void radeon_fence_process(struct radeon_device *rdev, int ring);
 bool radeon_fence_signaled(struct radeon_fence *fence);
+long radeon_fence_wait_timeout(struct radeon_fence *fence, bool interruptible, long timeout);
 int radeon_fence_wait(struct radeon_fence *fence, bool interruptible);
 int radeon_fence_wait_next(struct radeon_device *rdev, int ring);
 int radeon_fence_wait_empty(struct radeon_device *rdev, int ring);
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 05815c4..7ef075a 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -527,7 +527,7 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
 }
 
 /**
- * radeon_fence_wait - wait for a fence to signal
+ * radeon_fence_wait_timeout - wait for a fence to signal with timeout
  *
  * @fence: radeon fence object
  * @intr: use interruptible sleep
@@ -535,12 +535,15 @@ static long radeon_fence_wait_seq_timeout(struct radeon_device *rdev,
  * Wait for the requested fence to signal (all asics).
  * @intr selects whether to use interruptable (true) or non-interruptable
  * (false) sleep when waiting for the fence.
- * Returns 0 if the fence has passed, error for all other cases.
+ * @timeout: maximum time to wait, or MAX_SCHEDULE_TIMEOUT for infinite wait
+ * Returns remaining time if the sequence number has passed, 0 when
+ * the wait timeout, or an error for all other cases.
  */
-int radeon_fence_wait(struct radeon_fence *fence, bool intr)
+long radeon_fence_wait_timeout(struct radeon_fence *fence, bool intr, long timeout)
 {
 	uint64_t seq[RADEON_NUM_RINGS] = {};
 	long r;
+	int r_sig;
 
 	/*
 	 * This function should not be called on !radeon fences.
@@ -552,15 +555,36 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 		return fence_wait(&fence->base, intr);
 
 	seq[fence->ring] = fence->seq;
-	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, MAX_SCHEDULE_TIMEOUT);
-	if (r < 0) {
+	r = radeon_fence_wait_seq_timeout(fence->rdev, seq, intr, timeout);
+	if (r <= 0) {
 		return r;
 	}
 
-	r = fence_signal(&fence->base);
-	if (!r)
+	r_sig = fence_signal(&fence->base);
+	if (!r_sig)
 		FENCE_TRACE(&fence->base, "signaled from fence_wait\n");
-	return 0;
+	return r;
+}
+
+/**
+ * radeon_fence_wait - wait for a fence to signal
+ *
+ * @fence: radeon fence object
+ * @intr: use interruptible sleep
+ *
+ * Wait for the requested fence to signal (all asics).
+ * @intr selects whether to use interruptable (true) or non-interruptable
+ * (false) sleep when waiting for the fence.
+ * Returns 0 if the fence has passed, error for all other cases.
+ */
+int radeon_fence_wait(struct radeon_fence *fence, bool intr)
+{
+	long r = radeon_fence_wait_timeout(fence, intr, MAX_SCHEDULE_TIMEOUT);
+	if (r > 0) {
+		return 0;
+	} else {
+		return r;
+	}
 }
 
 /**
diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c
index 7eb1ae7..566a1a0 100644
--- a/drivers/gpu/drm/radeon/radeon_vce.c
+++ b/drivers/gpu/drm/radeon/radeon_vce.c
@@ -810,11 +810,16 @@ int radeon_vce_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		goto error;
 	}
 
-	r = radeon_fence_wait(fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		r = -ETIMEDOUT;
 	} else {
-	        DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
+		DRM_INFO("ib test on ring %d succeeded\n", ring->idx);
+		r = 0;
 	}
 error:
 	radeon_fence_unref(&fence);
diff --git a/drivers/gpu/drm/radeon/uvd_v1_0.c b/drivers/gpu/drm/radeon/uvd_v1_0.c
index c6b1cbc..12ddcfa 100644
--- a/drivers/gpu/drm/radeon/uvd_v1_0.c
+++ b/drivers/gpu/drm/radeon/uvd_v1_0.c
@@ -522,11 +522,17 @@ int uvd_v1_0_ib_test(struct radeon_device *rdev, struct radeon_ring *ring)
 		goto error;
 	}
 
-	r = radeon_fence_wait(fence, false);
-	if (r) {
+	r = radeon_fence_wait_timeout(fence, false, usecs_to_jiffies(
+		RADEON_USEC_IB_TEST_TIMEOUT));
+	if (r < 0) {
 		DRM_ERROR("radeon: fence wait failed (%d).\n", r);
 		goto error;
+	} else if (r == 0) {
+		DRM_ERROR("radeon: fence wait timed out.\n");
+		r = -ETIMEDOUT;
+		goto error;
 	}
+	r = 0;
 	DRM_INFO("ib test on ring %d succeeded\n",  ring->idx);
 error:
 	radeon_fence_unref(&fence);
-- 
2.7.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2016-02-08 16:55 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-01-24  6:18 [PATCH] drm/radeon: Avoid double gpu reset by adding a timeout on IB ring tests Matthew Dawson
2016-01-24  9:49 ` Christian König
2016-01-31 18:50   ` Matthew Dawson
2016-02-03 12:24     ` Christian König
2016-02-08 14:51       ` Matthew Dawson
2016-02-08 16:55         ` Bruno Kant
2016-02-07 21:51 Matthew Dawson
2016-02-08  9:38 ` Christian König
2016-02-08 15:42   ` Alex Deucher

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.