All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-03 17:05 ` Eric Anholt
  0 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Eric Anholt, Lucas Stach

GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
seconds at maximum resolution, but we still want to reset quickly if a
job is really hung.  Sample the CL's current address and the return
address (since we call into tile lists repeatedly) and if either has
changed then assume we've made progress.

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Lucas Stach <l.stach@pengutronix.de>
---
 drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
 drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
 drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index f546e0ab9562..a5d96d823416 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -189,6 +189,8 @@ struct v3d_job {
 
 	/* GPU virtual addresses of the start/end of the CL job. */
 	u32 start, end;
+
+	u32 timedout_ctca, timedout_ctra;
 };
 
 struct v3d_exec_info {
diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
index fc13282dfc2f..854046565989 100644
--- a/drivers/gpu/drm/v3d/v3d_regs.h
+++ b/drivers/gpu/drm/v3d/v3d_regs.h
@@ -222,6 +222,7 @@
 #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
 #define V3D_CLE_CT0RA                                  0x00118
 #define V3D_CLE_CT1RA                                  0x0011c
+#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
 #define V3D_CLE_CT0LC                                  0x00120
 #define V3D_CLE_CT1LC                                  0x00124
 #define V3D_CLE_CT0PC                                  0x00128
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 808bc901f567..00667c733dca 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
 	struct v3d_job *job = to_v3d_job(sched_job);
 	struct v3d_exec_info *exec = job->exec;
 	struct v3d_dev *v3d = exec->v3d;
+	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
 	enum v3d_queue q;
+	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
+	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
+
+	/* If the current address or return address have changed, then
+	 * the GPU has probably made progress and we should delay the
+	 * reset.  This could fail if the GPU got in an infinite loop
+	 * in the CL, but that is pretty unlikely outside of an i-g-t
+	 * testcase.
+	 */
+	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
+		job->timedout_ctca = ctca;
+		job->timedout_ctra = ctra;
+
+		schedule_delayed_work(&job->base.work_tdr,
+				      job->base.sched->timeout);
+		return;
+	}
 
 	mutex_lock(&v3d->reset_lock);
 
-- 
2.18.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-03 17:05 ` Eric Anholt
  0 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel

GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
seconds at maximum resolution, but we still want to reset quickly if a
job is really hung.  Sample the CL's current address and the return
address (since we call into tile lists repeatedly) and if either has
changed then assume we've made progress.

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Lucas Stach <l.stach@pengutronix.de>
---
 drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
 drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
 drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
 3 files changed, 21 insertions(+)

diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
index f546e0ab9562..a5d96d823416 100644
--- a/drivers/gpu/drm/v3d/v3d_drv.h
+++ b/drivers/gpu/drm/v3d/v3d_drv.h
@@ -189,6 +189,8 @@ struct v3d_job {
 
 	/* GPU virtual addresses of the start/end of the CL job. */
 	u32 start, end;
+
+	u32 timedout_ctca, timedout_ctra;
 };
 
 struct v3d_exec_info {
diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
index fc13282dfc2f..854046565989 100644
--- a/drivers/gpu/drm/v3d/v3d_regs.h
+++ b/drivers/gpu/drm/v3d/v3d_regs.h
@@ -222,6 +222,7 @@
 #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
 #define V3D_CLE_CT0RA                                  0x00118
 #define V3D_CLE_CT1RA                                  0x0011c
+#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
 #define V3D_CLE_CT0LC                                  0x00120
 #define V3D_CLE_CT1LC                                  0x00124
 #define V3D_CLE_CT0PC                                  0x00128
diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 808bc901f567..00667c733dca 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
 	struct v3d_job *job = to_v3d_job(sched_job);
 	struct v3d_exec_info *exec = job->exec;
 	struct v3d_dev *v3d = exec->v3d;
+	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
 	enum v3d_queue q;
+	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
+	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
+
+	/* If the current address or return address have changed, then
+	 * the GPU has probably made progress and we should delay the
+	 * reset.  This could fail if the GPU got in an infinite loop
+	 * in the CL, but that is pretty unlikely outside of an i-g-t
+	 * testcase.
+	 */
+	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
+		job->timedout_ctca = ctca;
+		job->timedout_ctra = ctra;
+
+		schedule_delayed_work(&job->base.work_tdr,
+				      job->base.sched->timeout);
+		return;
+	}
 
 	mutex_lock(&v3d->reset_lock);
 
-- 
2.18.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/4] drm/v3d: Remove unnecessary dma_fence_ops.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-03 17:05   ` Eric Anholt
  -1 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Eric Anholt, Daniel Vetter

The dma-fence core as of commit 418cc6ca0607 ("dma-fence: Make ->wait
callback optional") provides appropriate defaults for these methods.

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/v3d/v3d_fence.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_fence.c b/drivers/gpu/drm/v3d/v3d_fence.c
index bfe31a89668b..50bfcf9a8a1a 100644
--- a/drivers/gpu/drm/v3d/v3d_fence.c
+++ b/drivers/gpu/drm/v3d/v3d_fence.c
@@ -35,19 +35,7 @@ static const char *v3d_fence_get_timeline_name(struct dma_fence *fence)
 		return "v3d-render";
 }
 
-static bool v3d_fence_enable_signaling(struct dma_fence *fence)
-{
-	return true;
-}
-
 const struct dma_fence_ops v3d_fence_ops = {
 	.get_driver_name = v3d_fence_get_driver_name,
 	.get_timeline_name = v3d_fence_get_timeline_name,
-	.enable_signaling = v3d_fence_enable_signaling,
-	/* Each of our fences gets signaled as complete by the IRQ
-	 * handler, so we rely on the core's tracking of signaling.
-	 */
-	.signaled = NULL,
-	.wait = dma_fence_default_wait,
-	.release = dma_fence_free,
 };
-- 
2.18.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/4] drm/v3d: Remove unnecessary dma_fence_ops.
@ 2018-07-03 17:05   ` Eric Anholt
  0 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Daniel Vetter

The dma-fence core as of commit 418cc6ca0607 ("dma-fence: Make ->wait
callback optional") provides appropriate defaults for these methods.

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/v3d/v3d_fence.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_fence.c b/drivers/gpu/drm/v3d/v3d_fence.c
index bfe31a89668b..50bfcf9a8a1a 100644
--- a/drivers/gpu/drm/v3d/v3d_fence.c
+++ b/drivers/gpu/drm/v3d/v3d_fence.c
@@ -35,19 +35,7 @@ static const char *v3d_fence_get_timeline_name(struct dma_fence *fence)
 		return "v3d-render";
 }
 
-static bool v3d_fence_enable_signaling(struct dma_fence *fence)
-{
-	return true;
-}
-
 const struct dma_fence_ops v3d_fence_ops = {
 	.get_driver_name = v3d_fence_get_driver_name,
 	.get_timeline_name = v3d_fence_get_timeline_name,
-	.enable_signaling = v3d_fence_enable_signaling,
-	/* Each of our fences gets signaled as complete by the IRQ
-	 * handler, so we rely on the core's tracking of signaling.
-	 */
-	.signaled = NULL,
-	.wait = dma_fence_default_wait,
-	.release = dma_fence_free,
 };
-- 
2.18.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/4] drm/v3d: Add missing v3d documentation structure.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-03 17:05   ` Eric Anholt
  -1 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Eric Anholt, Daniel Vetter

This was a failure of "git add" on my part -- we already referenced
the doc from drivers.rst.

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 Documentation/gpu/v3d.rst | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 Documentation/gpu/v3d.rst

diff --git a/Documentation/gpu/v3d.rst b/Documentation/gpu/v3d.rst
new file mode 100644
index 000000000000..543f7fbf526e
--- /dev/null
+++ b/Documentation/gpu/v3d.rst
@@ -0,0 +1,28 @@
+=====================================
+ drm/v3d Broadcom V3D Graphics Driver
+=====================================
+
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_drv.c
+   :doc: Broadcom V3D Graphics Driver
+
+GPU buffer object (BO) management
+---------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_bo.c
+   :doc: V3D GEM BO management support
+
+Address space management
+===========================================
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_mmu.c
+   :doc: Broadcom V3D MMU
+
+GPU Scheduling
+===========================================
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_sched.c
+   :doc: Broadcom V3D scheduling
+
+Interrupts
+--------------
+
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_irq.c
+   :doc: Interrupt management for the V3D engine
-- 
2.18.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/4] drm/v3d: Add missing v3d documentation structure.
@ 2018-07-03 17:05   ` Eric Anholt
  0 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Daniel Vetter

This was a failure of "git add" on my part -- we already referenced
the doc from drivers.rst.

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 Documentation/gpu/v3d.rst | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 Documentation/gpu/v3d.rst

diff --git a/Documentation/gpu/v3d.rst b/Documentation/gpu/v3d.rst
new file mode 100644
index 000000000000..543f7fbf526e
--- /dev/null
+++ b/Documentation/gpu/v3d.rst
@@ -0,0 +1,28 @@
+=====================================
+ drm/v3d Broadcom V3D Graphics Driver
+=====================================
+
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_drv.c
+   :doc: Broadcom V3D Graphics Driver
+
+GPU buffer object (BO) management
+---------------------------------
+
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_bo.c
+   :doc: V3D GEM BO management support
+
+Address space management
+===========================================
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_mmu.c
+   :doc: Broadcom V3D MMU
+
+GPU Scheduling
+===========================================
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_sched.c
+   :doc: Broadcom V3D scheduling
+
+Interrupts
+--------------
+
+.. kernel-doc:: drivers/gpu/drm/v3d/v3d_irq.c
+   :doc: Interrupt management for the V3D engine
-- 
2.18.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/4] drm/v3d: Fix a grammar nit in the scheduler docs.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-03 17:05   ` Eric Anholt
  -1 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Eric Anholt, Daniel Vetter

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/v3d/v3d_sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 00667c733dca..a5501581d96b 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -14,8 +14,8 @@
  * to the HW only when it has completed the last one, instead of
  * filling up the CT[01]Q FIFOs with jobs.  Similarly, we use
  * v3d_job_dependency() to manage the dependency between bin and
- * render, instead of having the clients submit jobs with using the
- * HW's semaphores to interlock between them.
+ * render, instead of having the clients submit jobs using the HW's
+ * semaphores to interlock between them.
  */
 
 #include <linux/kthread.h>
-- 
2.18.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/4] drm/v3d: Fix a grammar nit in the scheduler docs.
@ 2018-07-03 17:05   ` Eric Anholt
  0 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-03 17:05 UTC (permalink / raw)
  To: dri-devel; +Cc: linux-kernel, Daniel Vetter

Signed-off-by: Eric Anholt <eric@anholt.net>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/v3d/v3d_sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index 00667c733dca..a5501581d96b 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -14,8 +14,8 @@
  * to the HW only when it has completed the last one, instead of
  * filling up the CT[01]Q FIFOs with jobs.  Similarly, we use
  * v3d_job_dependency() to manage the dependency between bin and
- * render, instead of having the clients submit jobs with using the
- * HW's semaphores to interlock between them.
+ * render, instead of having the clients submit jobs using the HW's
+ * semaphores to interlock between them.
  */
 
 #include <linux/kthread.h>
-- 
2.18.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-03 20:45   ` Alex Deucher
  -1 siblings, 0 replies; 18+ messages in thread
From: Alex Deucher @ 2018-07-03 20:45 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Maling list - DRI developers, LKML

On Tue, Jul 3, 2018 at 1:05 PM, Eric Anholt <eric@anholt.net> wrote:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.
>
> Signed-off-by: Eric Anholt <eric@anholt.net>
> Cc: Lucas Stach <l.stach@pengutronix.de>

Series is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
>
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>
>         /* GPU virtual addresses of the start/end of the CL job. */
>         u32 start, end;
> +
> +       u32 timedout_ctca, timedout_ctra;
>  };
>
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
>         struct v3d_job *job = to_v3d_job(sched_job);
>         struct v3d_exec_info *exec = job->exec;
>         struct v3d_dev *v3d = exec->v3d;
> +       enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
>         enum v3d_queue q;
> +       u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> +       u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> +       /* If the current address or return address have changed, then
> +        * the GPU has probably made progress and we should delay the
> +        * reset.  This could fail if the GPU got in an infinite loop
> +        * in the CL, but that is pretty unlikely outside of an i-g-t
> +        * testcase.
> +        */
> +       if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> +               job->timedout_ctca = ctca;
> +               job->timedout_ctra = ctra;
> +
> +               schedule_delayed_work(&job->base.work_tdr,
> +                                     job->base.sched->timeout);
> +               return;
> +       }
>
>         mutex_lock(&v3d->reset_lock);
>
> --
> 2.18.0
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-03 20:45   ` Alex Deucher
  0 siblings, 0 replies; 18+ messages in thread
From: Alex Deucher @ 2018-07-03 20:45 UTC (permalink / raw)
  To: Eric Anholt; +Cc: LKML, Maling list - DRI developers

On Tue, Jul 3, 2018 at 1:05 PM, Eric Anholt <eric@anholt.net> wrote:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.
>
> Signed-off-by: Eric Anholt <eric@anholt.net>
> Cc: Lucas Stach <l.stach@pengutronix.de>

Series is:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
>
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>
>         /* GPU virtual addresses of the start/end of the CL job. */
>         u32 start, end;
> +
> +       u32 timedout_ctca, timedout_ctra;
>  };
>
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
>         struct v3d_job *job = to_v3d_job(sched_job);
>         struct v3d_exec_info *exec = job->exec;
>         struct v3d_dev *v3d = exec->v3d;
> +       enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
>         enum v3d_queue q;
> +       u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> +       u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> +       /* If the current address or return address have changed, then
> +        * the GPU has probably made progress and we should delay the
> +        * reset.  This could fail if the GPU got in an infinite loop
> +        * in the CL, but that is pretty unlikely outside of an i-g-t
> +        * testcase.
> +        */
> +       if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> +               job->timedout_ctca = ctca;
> +               job->timedout_ctra = ctra;
> +
> +               schedule_delayed_work(&job->base.work_tdr,
> +                                     job->base.sched->timeout);
> +               return;
> +       }
>
>         mutex_lock(&v3d->reset_lock);
>
> --
> 2.18.0
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-04  8:31   ` Daniel Vetter
  -1 siblings, 0 replies; 18+ messages in thread
From: Daniel Vetter @ 2018-07-04  8:31 UTC (permalink / raw)
  To: Eric Anholt; +Cc: dri-devel, linux-kernel

On Tue, Jul 03, 2018 at 10:05:12AM -0700, Eric Anholt wrote:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.
> 
> Signed-off-by: Eric Anholt <eric@anholt.net>
> Cc: Lucas Stach <l.stach@pengutronix.de>
> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>  
>  	/* GPU virtual addresses of the start/end of the CL job. */
>  	u32 start, end;
> +
> +	u32 timedout_ctca, timedout_ctra;
>  };
>  
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
>  	struct v3d_job *job = to_v3d_job(sched_job);
>  	struct v3d_exec_info *exec = job->exec;
>  	struct v3d_dev *v3d = exec->v3d;
> +	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
>  	enum v3d_queue q;
> +	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> +	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> +	/* If the current address or return address have changed, then
> +	 * the GPU has probably made progress and we should delay the
> +	 * reset.  This could fail if the GPU got in an infinite loop
> +	 * in the CL, but that is pretty unlikely outside of an i-g-t
> +	 * testcase.
> +	 */

In i915 we have a credit system that allows only a limited amount of
postponing the timeout. That would block the fairly easy DOS. Otoh I have
no idea how robust v3d is against DOS attacks at the ioctl level and
whether that's worth it.

From your description I'm assuming that a loop in the shaders won't trick
this, so ARB_robustness/WebGL should still be happy.

Anyway, Ack on the entire pile.
-Daniel
> +	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> +		job->timedout_ctca = ctca;
> +		job->timedout_ctra = ctra;
> +
> +		schedule_delayed_work(&job->base.work_tdr,
> +				      job->base.sched->timeout);
> +		return;
> +	}
>  
>  	mutex_lock(&v3d->reset_lock);
>  
> -- 
> 2.18.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-04  8:31   ` Daniel Vetter
  0 siblings, 0 replies; 18+ messages in thread
From: Daniel Vetter @ 2018-07-04  8:31 UTC (permalink / raw)
  To: Eric Anholt; +Cc: linux-kernel, dri-devel

On Tue, Jul 03, 2018 at 10:05:12AM -0700, Eric Anholt wrote:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.
> 
> Signed-off-by: Eric Anholt <eric@anholt.net>
> Cc: Lucas Stach <l.stach@pengutronix.de>
> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>  
>  	/* GPU virtual addresses of the start/end of the CL job. */
>  	u32 start, end;
> +
> +	u32 timedout_ctca, timedout_ctra;
>  };
>  
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
>  	struct v3d_job *job = to_v3d_job(sched_job);
>  	struct v3d_exec_info *exec = job->exec;
>  	struct v3d_dev *v3d = exec->v3d;
> +	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
>  	enum v3d_queue q;
> +	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> +	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> +	/* If the current address or return address have changed, then
> +	 * the GPU has probably made progress and we should delay the
> +	 * reset.  This could fail if the GPU got in an infinite loop
> +	 * in the CL, but that is pretty unlikely outside of an i-g-t
> +	 * testcase.
> +	 */

In i915 we have a credit system that allows only a limited amount of
postponing the timeout. That would block the fairly easy DOS. Otoh I have
no idea how robust v3d is against DOS attacks at the ioctl level and
whether that's worth it.

From your description I'm assuming that a loop in the shaders won't trick
this, so ARB_robustness/WebGL should still be happy.

Anyway, Ack on the entire pile.
-Daniel
> +	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> +		job->timedout_ctca = ctca;
> +		job->timedout_ctra = ctra;
> +
> +		schedule_delayed_work(&job->base.work_tdr,
> +				      job->base.sched->timeout);
> +		return;
> +	}
>  
>  	mutex_lock(&v3d->reset_lock);
>  
> -- 
> 2.18.0
> 
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-05 10:59   ` Lucas Stach
  -1 siblings, 0 replies; 18+ messages in thread
From: Lucas Stach @ 2018-07-05 10:59 UTC (permalink / raw)
  To: Eric Anholt, dri-devel; +Cc: linux-kernel

Am Dienstag, den 03.07.2018, 10:05 -0700 schrieb Eric Anholt:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.

So this means you are doubling your timeout? AFAICS for the first time
you hit the timeout handler the cached ctca and ctra values will
probably always differ from the current values. Maybe this warrants a
mention in the commit message, as it's changing the behavior of the
scheduler timeout.

Also how easy is it for userspace to construct such an infinite loop in
the CL? Thinking about a rogue client DoSing the GPU while exploiting
this check in the timeout handler to stay under the radar...

Regards,
Lucas

> Signed-off-by: Eric Anholt <eric@anholt.net>
> > Cc: Lucas Stach <l.stach@pengutronix.de>
> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>  
> >  	/* GPU virtual addresses of the start/end of the CL job. */
> >  	u32 start, end;
> +
> > +	u32 timedout_ctca, timedout_ctra;
>  };
>  
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
> >  	struct v3d_job *job = to_v3d_job(sched_job);
> >  	struct v3d_exec_info *exec = job->exec;
> >  	struct v3d_dev *v3d = exec->v3d;
> > +	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
> >  	enum v3d_queue q;
> > +	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> > +	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> > +	/* If the current address or return address have changed, then
> > +	 * the GPU has probably made progress and we should delay the
> > +	 * reset.  This could fail if the GPU got in an infinite loop
> > +	 * in the CL, but that is pretty unlikely outside of an i-g-t
> > +	 * testcase.
> > +	 */
> > +	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> > +		job->timedout_ctca = ctca;
> > +		job->timedout_ctra = ctra;
> +
> > +		schedule_delayed_work(&job->base.work_tdr,
> > +				      job->base.sched->timeout);
> > +		return;
> > +	}
>  
> >  	mutex_lock(&v3d->reset_lock);
>  

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-05 10:59   ` Lucas Stach
  0 siblings, 0 replies; 18+ messages in thread
From: Lucas Stach @ 2018-07-05 10:59 UTC (permalink / raw)
  To: Eric Anholt, dri-devel; +Cc: linux-kernel

Am Dienstag, den 03.07.2018, 10:05 -0700 schrieb Eric Anholt:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.

So this means you are doubling your timeout? AFAICS for the first time
you hit the timeout handler the cached ctca and ctra values will
probably always differ from the current values. Maybe this warrants a
mention in the commit message, as it's changing the behavior of the
scheduler timeout.

Also how easy is it for userspace to construct such an infinite loop in
the CL? Thinking about a rogue client DoSing the GPU while exploiting
this check in the timeout handler to stay under the radar...

Regards,
Lucas

> Signed-off-by: Eric Anholt <eric@anholt.net>
> > Cc: Lucas Stach <l.stach@pengutronix.de>
> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>  
> >  	/* GPU virtual addresses of the start/end of the CL job. */
> >  	u32 start, end;
> +
> > +	u32 timedout_ctca, timedout_ctra;
>  };
>  
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
> >  	struct v3d_job *job = to_v3d_job(sched_job);
> >  	struct v3d_exec_info *exec = job->exec;
> >  	struct v3d_dev *v3d = exec->v3d;
> > +	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
> >  	enum v3d_queue q;
> > +	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> > +	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> > +	/* If the current address or return address have changed, then
> > +	 * the GPU has probably made progress and we should delay the
> > +	 * reset.  This could fail if the GPU got in an infinite loop
> > +	 * in the CL, but that is pretty unlikely outside of an i-g-t
> > +	 * testcase.
> > +	 */
> > +	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> > +		job->timedout_ctca = ctca;
> > +		job->timedout_ctra = ctra;
> +
> > +		schedule_delayed_work(&job->base.work_tdr,
> > +				      job->base.sched->timeout);
> > +		return;
> > +	}
>  
> >  	mutex_lock(&v3d->reset_lock);
>  
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
  2018-07-05 10:59   ` Lucas Stach
@ 2018-07-05 16:59     ` Eric Anholt
  -1 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-05 16:59 UTC (permalink / raw)
  To: Lucas Stach, dri-devel; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1757 bytes --]

Lucas Stach <l.stach@pengutronix.de> writes:

> Am Dienstag, den 03.07.2018, 10:05 -0700 schrieb Eric Anholt:
>> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
>> seconds at maximum resolution, but we still want to reset quickly if a
>> job is really hung.  Sample the CL's current address and the return
>> address (since we call into tile lists repeatedly) and if either has
>> changed then assume we've made progress.
>
> So this means you are doubling your timeout? AFAICS for the first time
> you hit the timeout handler the cached ctca and ctra values will
> probably always differ from the current values. Maybe this warrants a
> mention in the commit message, as it's changing the behavior of the
> scheduler timeout.

I suppose that doubles the minimum timeout, but I don't think there's
any principled choice behind that value.

> Also how easy is it for userspace to construct such an infinite loop in
> the CL? Thinking about a rogue client DoSing the GPU while exploiting
> this check in the timeout handler to stay under the radar...

You'd need to have a big enough CL that you don't sample the same
location twice in a row, but otherwise it's trivial and equivalent to a
V3D33 igt case I wrote.  I don't think we as the kernel particularly
care to protect from that case, though -- it's mainly "does a broken
WebGL shader take down your desktop?" which we will still be protecting
from.  If you wanted to protect from a general userspace attacker, you
could have a maximum 1 minute timeout or something, but I'm not sure
your life is actually much better when you let an arbitrary number of
clients submit many jobs to round-robin through each of which has a long
timeout like that.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-05 16:59     ` Eric Anholt
  0 siblings, 0 replies; 18+ messages in thread
From: Eric Anholt @ 2018-07-05 16:59 UTC (permalink / raw)
  To: Lucas Stach, dri-devel; +Cc: linux-kernel


[-- Attachment #1.1: Type: text/plain, Size: 1757 bytes --]

Lucas Stach <l.stach@pengutronix.de> writes:

> Am Dienstag, den 03.07.2018, 10:05 -0700 schrieb Eric Anholt:
>> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
>> seconds at maximum resolution, but we still want to reset quickly if a
>> job is really hung.  Sample the CL's current address and the return
>> address (since we call into tile lists repeatedly) and if either has
>> changed then assume we've made progress.
>
> So this means you are doubling your timeout? AFAICS for the first time
> you hit the timeout handler the cached ctca and ctra values will
> probably always differ from the current values. Maybe this warrants a
> mention in the commit message, as it's changing the behavior of the
> scheduler timeout.

I suppose that doubles the minimum timeout, but I don't think there's
any principled choice behind that value.

> Also how easy is it for userspace to construct such an infinite loop in
> the CL? Thinking about a rogue client DoSing the GPU while exploiting
> this check in the timeout handler to stay under the radar...

You'd need to have a big enough CL that you don't sample the same
location twice in a row, but otherwise it's trivial and equivalent to a
V3D33 igt case I wrote.  I don't think we as the kernel particularly
care to protect from that case, though -- it's mainly "does a broken
WebGL shader take down your desktop?" which we will still be protecting
from.  If you wanted to protect from a general userspace attacker, you
could have a maximum 1 minute timeout or something, but I'm not sure
your life is actually much better when you let an arbitrary number of
clients submit many jobs to round-robin through each of which has a long
timeout like that.

[-- Attachment #1.2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

[-- Attachment #2: Type: text/plain, Size: 160 bytes --]

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
  2018-07-03 17:05 ` Eric Anholt
@ 2018-07-06 10:03   ` Lucas Stach
  -1 siblings, 0 replies; 18+ messages in thread
From: Lucas Stach @ 2018-07-06 10:03 UTC (permalink / raw)
  To: Eric Anholt, dri-devel; +Cc: linux-kernel

Am Dienstag, den 03.07.2018, 10:05 -0700 schrieb Eric Anholt:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.
> 
> > Signed-off-by: Eric Anholt <eric@anholt.net>
> Cc: Lucas Stach <l.stach@pengutronix.de>

Reviewed-by: Lucas Stach <l.stach@pengutronix.de>

> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>  
> >  	/* GPU virtual addresses of the start/end of the CL job. */
> >  	u32 start, end;
> +
> > +	u32 timedout_ctca, timedout_ctra;
>  };
>  
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
> >  	struct v3d_job *job = to_v3d_job(sched_job);
> >  	struct v3d_exec_info *exec = job->exec;
> >  	struct v3d_dev *v3d = exec->v3d;
> > +	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
> >  	enum v3d_queue q;
> > +	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> > +	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> > +	/* If the current address or return address have changed, then
> > +	 * the GPU has probably made progress and we should delay the
> > +	 * reset.  This could fail if the GPU got in an infinite loop
> > +	 * in the CL, but that is pretty unlikely outside of an i-g-t
> > +	 * testcase.
> > +	 */
> > +	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> > +		job->timedout_ctca = ctca;
> > +		job->timedout_ctra = ctra;
> +
> > +		schedule_delayed_work(&job->base.work_tdr,
> > +				      job->base.sched->timeout);
> > +		return;
> > +	}
>  
> >  	mutex_lock(&v3d->reset_lock);
>  

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress.
@ 2018-07-06 10:03   ` Lucas Stach
  0 siblings, 0 replies; 18+ messages in thread
From: Lucas Stach @ 2018-07-06 10:03 UTC (permalink / raw)
  To: Eric Anholt, dri-devel; +Cc: linux-kernel

Am Dienstag, den 03.07.2018, 10:05 -0700 schrieb Eric Anholt:
> GTF-GLES2.gtf.GL.acos.acos_float_vert_xvary submits jobs that take 4
> seconds at maximum resolution, but we still want to reset quickly if a
> job is really hung.  Sample the CL's current address and the return
> address (since we call into tile lists repeatedly) and if either has
> changed then assume we've made progress.
> 
> > Signed-off-by: Eric Anholt <eric@anholt.net>
> Cc: Lucas Stach <l.stach@pengutronix.de>

Reviewed-by: Lucas Stach <l.stach@pengutronix.de>

> ---
>  drivers/gpu/drm/v3d/v3d_drv.h   |  2 ++
>  drivers/gpu/drm/v3d/v3d_regs.h  |  1 +
>  drivers/gpu/drm/v3d/v3d_sched.c | 18 ++++++++++++++++++
>  3 files changed, 21 insertions(+)
> 
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index f546e0ab9562..a5d96d823416 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -189,6 +189,8 @@ struct v3d_job {
>  
> >  	/* GPU virtual addresses of the start/end of the CL job. */
> >  	u32 start, end;
> +
> > +	u32 timedout_ctca, timedout_ctra;
>  };
>  
>  struct v3d_exec_info {
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index fc13282dfc2f..854046565989 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -222,6 +222,7 @@
>  #define V3D_CLE_CTNCA(n) (V3D_CLE_CT0CA + 4 * n)
>  #define V3D_CLE_CT0RA                                  0x00118
>  #define V3D_CLE_CT1RA                                  0x0011c
> +#define V3D_CLE_CTNRA(n) (V3D_CLE_CT0RA + 4 * n)
>  #define V3D_CLE_CT0LC                                  0x00120
>  #define V3D_CLE_CT1LC                                  0x00124
>  #define V3D_CLE_CT0PC                                  0x00128
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 808bc901f567..00667c733dca 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -153,7 +153,25 @@ v3d_job_timedout(struct drm_sched_job *sched_job)
> >  	struct v3d_job *job = to_v3d_job(sched_job);
> >  	struct v3d_exec_info *exec = job->exec;
> >  	struct v3d_dev *v3d = exec->v3d;
> > +	enum v3d_queue job_q = job == &exec->bin ? V3D_BIN : V3D_RENDER;
> >  	enum v3d_queue q;
> > +	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(job_q));
> > +	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(job_q));
> +
> > +	/* If the current address or return address have changed, then
> > +	 * the GPU has probably made progress and we should delay the
> > +	 * reset.  This could fail if the GPU got in an infinite loop
> > +	 * in the CL, but that is pretty unlikely outside of an i-g-t
> > +	 * testcase.
> > +	 */
> > +	if (job->timedout_ctca != ctca || job->timedout_ctra != ctra) {
> > +		job->timedout_ctca = ctca;
> > +		job->timedout_ctra = ctra;
> +
> > +		schedule_delayed_work(&job->base.work_tdr,
> > +				      job->base.sched->timeout);
> > +		return;
> > +	}
>  
> >  	mutex_lock(&v3d->reset_lock);
>  
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2018-07-06 10:06 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-03 17:05 [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress Eric Anholt
2018-07-03 17:05 ` Eric Anholt
2018-07-03 17:05 ` [PATCH 2/4] drm/v3d: Remove unnecessary dma_fence_ops Eric Anholt
2018-07-03 17:05   ` Eric Anholt
2018-07-03 17:05 ` [PATCH 3/4] drm/v3d: Add missing v3d documentation structure Eric Anholt
2018-07-03 17:05   ` Eric Anholt
2018-07-03 17:05 ` [PATCH 4/4] drm/v3d: Fix a grammar nit in the scheduler docs Eric Anholt
2018-07-03 17:05   ` Eric Anholt
2018-07-03 20:45 ` [PATCH 1/4] drm/v3d: Delay the scheduler timeout if we're still making progress Alex Deucher
2018-07-03 20:45   ` Alex Deucher
2018-07-04  8:31 ` Daniel Vetter
2018-07-04  8:31   ` Daniel Vetter
2018-07-05 10:59 ` Lucas Stach
2018-07-05 10:59   ` Lucas Stach
2018-07-05 16:59   ` Eric Anholt
2018-07-05 16:59     ` Eric Anholt
2018-07-06 10:03 ` Lucas Stach
2018-07-06 10:03   ` Lucas Stach

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.