* [PATCH v2] drm/panfrost: Fix job timeout handling
From: Boris Brezillon @ 2020-10-02  7:10 UTC
  To: Rob Herring, Tomeu Vizoso, Alyssa Rosenzweig, Steven Price, Robin Murphy
  Cc: dri-devel, Boris Brezillon, stable

If two or more jobs time out concurrently, only one of them (the one
attached to the scheduler that acquires the reset lock) is fully
handled. The others remain in a dangling state where they are no
longer part of the scheduling queue but still block the scheduler,
leading to repeated timeouts when new jobs are queued.

Let's make sure all bad jobs are properly handled by the thread
acquiring the lock.
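
For illustration, the pre-patch flow (reconstructed from the pre-patch
code visible in the diff below; condensed, with elisions) shows where
the losing thread drops its bad job on the floor:

	static void panfrost_job_timedout(struct drm_sched_job *sched_job)
	{
		...
		if (!mutex_trylock(&pfdev->reset_lock))
			return;	/* loser bails out; its bad job is never handled */

		for (i = 0; i < NUM_JOB_SLOTS; i++) {
			struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;

			drm_sched_stop(sched, sched_job);
			if (js != i)
				/* Ensure any timeouts on other slots have finished */
				cancel_delayed_work_sync(&sched->work_tdr);
		}

		drm_sched_increase_karma(sched_job);	/* only the winner's job */
		...
	}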

v2:
- Fix the subject prefix
- Stop the scheduler before returning from panfrost_job_timedout()
- Call cancel_delayed_work_sync() after drm_sched_stop() to make sure
  no timeout handlers are in flight when we reset the GPU (Steven Price)
- Make sure we release the reset lock before restarting the
  schedulers (Steven Price); the overall ordering is sketched below
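
A rough sketch of that ordering, per job slot (names taken from the
diff below; condensed, not meant to compile as-is):

	/* serialized through the new per-queue lock */
	panfrost_scheduler_stop(queue, bad);		/* drm_sched_stop() + karma */
	cancel_delayed_work_sync(&sched->work_tdr);	/* flush in-flight timeout work */

	/* ... GPU reset and drm_sched_resubmit_jobs() happen here ... */

	mutex_unlock(&pfdev->reset_lock);		/* drop the reset lock first */
	drm_sched_start(&queue->sched, true);		/* then restart the scheduler */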

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver")
Cc: <stable@vger.kernel.org>
---
 drivers/gpu/drm/panfrost/panfrost_job.c | 64 +++++++++++++++++++++----
 1 file changed, 55 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
index 30e7b7196dab..6e4bfb938fab 100644
--- a/drivers/gpu/drm/panfrost/panfrost_job.c
+++ b/drivers/gpu/drm/panfrost/panfrost_job.c
@@ -25,7 +25,8 @@
 
 struct panfrost_queue_state {
 	struct drm_gpu_scheduler sched;
-
+	bool stopped;
+	struct mutex lock;
 	u64 fence_context;
 	u64 emit_seqno;
 };
@@ -369,6 +370,24 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
 	job_write(pfdev, JOB_INT_MASK, irq_mask);
 }
 
+static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
+				    struct drm_sched_job *bad)
+{
+	bool stopped = false;
+
+	mutex_lock(&queue->lock);
+	if (!queue->stopped) {
+		drm_sched_stop(&queue->sched, bad);
+		if (bad)
+			drm_sched_increase_karma(bad);
+		queue->stopped = true;
+		stopped = true;
+	}
+	mutex_unlock(&queue->lock);
+
+	return stopped;
+}
+
 static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 {
 	struct panfrost_job *job = to_panfrost_job(sched_job);
@@ -392,19 +411,41 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 		job_read(pfdev, JS_TAIL_LO(js)),
 		sched_job);
 
+	/* Scheduler is already stopped, nothing to do. */
+	if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
+		return;
+
 	if (!mutex_trylock(&pfdev->reset_lock))
 		return;
 
+	mutex_lock(&pfdev->sched_lock);
 	for (i = 0; i < NUM_JOB_SLOTS; i++) {
 		struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
 
-		drm_sched_stop(sched, sched_job);
-		if (js != i)
-			/* Ensure any timeouts on other slots have finished */
+		/*
+		 * If the queue is still active, make sure we wait for any
+		 * pending timeouts.
+		 */
+		if (!pfdev->js->queue[i].stopped)
 			cancel_delayed_work_sync(&sched->work_tdr);
-	}
 
-	drm_sched_increase_karma(sched_job);
+		/*
+		 * If the scheduler was not already stopped, there's a tiny
+		 * chance a timeout has expired just before we stopped it, and
+		 * drm_sched_stop() does not flush pending works. Let's flush
+		 * them now so the timeout handler doesn't get called in the
+		 * middle of a reset.
+		 */
+		if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
+			cancel_delayed_work_sync(&sched->work_tdr);
+
+		/*
+		 * Now that we cancelled the pending timeouts, we can safely
+		 * reset the stopped state.
+		 */
+		pfdev->js->queue[i].stopped = false;
+	}
+	mutex_unlock(&pfdev->sched_lock);
 
 	spin_lock_irqsave(&pfdev->js->job_lock, flags);
 	for (i = 0; i < NUM_JOB_SLOTS; i++) {
@@ -421,11 +462,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
 		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
 
+	mutex_unlock(&pfdev->reset_lock);
+
 	/* restart scheduler after GPU is usable again */
 	for (i = 0; i < NUM_JOB_SLOTS; i++)
 		drm_sched_start(&pfdev->js->queue[i].sched, true);
-
-	mutex_unlock(&pfdev->reset_lock);
 }
 
 static const struct drm_sched_backend_ops panfrost_sched_ops = {
@@ -558,6 +599,7 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
 	int ret, i;
 
 	for (i = 0; i < NUM_JOB_SLOTS; i++) {
+		mutex_init(&js->queue[i].lock);
 		sched = &js->queue[i].sched;
 		ret = drm_sched_entity_init(&panfrost_priv->sched_entity[i],
 					    DRM_SCHED_PRIORITY_NORMAL, &sched,
@@ -570,10 +612,14 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
 
 void panfrost_job_close(struct panfrost_file_priv *panfrost_priv)
 {
+	struct panfrost_device *pfdev = panfrost_priv->pfdev;
+	struct panfrost_job_slot *js = pfdev->js;
 	int i;
 
-	for (i = 0; i < NUM_JOB_SLOTS; i++)
+	for (i = 0; i < NUM_JOB_SLOTS; i++) {
 		drm_sched_entity_destroy(&panfrost_priv->sched_entity[i]);
+		mutex_destroy(&js->queue[i].lock);
+	}
 }
 
 int panfrost_job_is_idle(struct panfrost_device *pfdev)
-- 
2.26.2


* Re: [PATCH v2] drm/panfrost: Fix job timeout handling
From: Steven Price @ 2020-10-02 11:08 UTC
  To: Boris Brezillon, Rob Herring, Tomeu Vizoso, Alyssa Rosenzweig,
	Robin Murphy
  Cc: dri-devel, stable

On 02/10/2020 08:10, Boris Brezillon wrote:
> If more than two jobs end up timeout-ing concurrently, only one of them
> (the one attached to the scheduler acquiring the lock) is fully handled.
> The other one remains in a dangling state where it's no longer part of
> the scheduling queue, but still blocks something in scheduler, leading
> to repetitive timeouts when new jobs are queued.
> 
> Let's make sure all bad jobs are properly handled by the thread
> acquiring the lock.
> 
> v2:
> - Fix the subject prefix
> - Stop the scheduler before returning from panfrost_job_timedout()
> - Call cancel_delayed_work_sync() after drm_sched_stop() to make sure
>    no timeout handlers are in flight when we reset the GPU (Steven Price)
> - Make sure we release the reset lock before restarting the
>    schedulers (Steven Price)
> 
> Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
> Fixes: f3ba91228e8e ("drm/panfrost: Add initial panfrost driver")
> Cc: <stable@vger.kernel.org>

LGTM!

Reviewed-by: Steven Price <steven.price@arm.com>

> ---
>   drivers/gpu/drm/panfrost/panfrost_job.c | 64 +++++++++++++++++++++----
>   1 file changed, 55 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c
> index 30e7b7196dab..6e4bfb938fab 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_job.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c
> @@ -25,7 +25,8 @@
>   
>   struct panfrost_queue_state {
>   	struct drm_gpu_scheduler sched;
> -
> +	bool stopped;
> +	struct mutex lock;
>   	u64 fence_context;
>   	u64 emit_seqno;
>   };
> @@ -369,6 +370,24 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
>   	job_write(pfdev, JOB_INT_MASK, irq_mask);
>   }
>   
> +static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
> +				    struct drm_sched_job *bad)
> +{
> +	bool stopped = false;
> +
> +	mutex_lock(&queue->lock);
> +	if (!queue->stopped) {
> +		drm_sched_stop(&queue->sched, bad);
> +		if (bad)
> +			drm_sched_increase_karma(bad);
> +		queue->stopped = true;
> +		stopped = true;
> +	}
> +	mutex_unlock(&queue->lock);
> +
> +	return stopped;
> +}
> +
>   static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   {
>   	struct panfrost_job *job = to_panfrost_job(sched_job);
> @@ -392,19 +411,41 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   		job_read(pfdev, JS_TAIL_LO(js)),
>   		sched_job);
>   
> +	/* Scheduler is already stopped, nothing to do. */
> +	if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
> +		return;
> +
>   	if (!mutex_trylock(&pfdev->reset_lock))
>   		return;
>   
> +	mutex_lock(&pfdev->sched_lock);
>   	for (i = 0; i < NUM_JOB_SLOTS; i++) {
>   		struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;
>   
> -		drm_sched_stop(sched, sched_job);
> -		if (js != i)
> -			/* Ensure any timeouts on other slots have finished */
> +		/*
> +		 * If the queue is still active, make sure we wait for any
> +		 * pending timeouts.
> +		 */
> +		if (!pfdev->js->queue[i].stopped)
>   			cancel_delayed_work_sync(&sched->work_tdr);
> -	}
>   
> -	drm_sched_increase_karma(sched_job);
> +		/*
> +		 * If the scheduler was not already stopped, there's a tiny
> +		 * chance a timeout has expired just before we stopped it, and
> +		 * drm_sched_stop() does not flush pending works. Let's flush
> +		 * them now so the timeout handler doesn't get called in the
> +		 * middle of a reset.
> +		 */
> +		if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
> +			cancel_delayed_work_sync(&sched->work_tdr);
> +
> +		/*
> +		 * Now that we cancelled the pending timeouts, we can safely
> +		 * reset the stopped state.
> +		 */
> +		pfdev->js->queue[i].stopped = false;
> +	}
> +	mutex_unlock(&pfdev->sched_lock);
>   
>   	spin_lock_irqsave(&pfdev->js->job_lock, flags);
>   	for (i = 0; i < NUM_JOB_SLOTS; i++) {
> @@ -421,11 +462,11 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>   	for (i = 0; i < NUM_JOB_SLOTS; i++)
>   		drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
>   
> +	mutex_unlock(&pfdev->reset_lock);
> +
>   	/* restart scheduler after GPU is usable again */
>   	for (i = 0; i < NUM_JOB_SLOTS; i++)
>   		drm_sched_start(&pfdev->js->queue[i].sched, true);
> -
> -	mutex_unlock(&pfdev->reset_lock);
>   }
>   
>   static const struct drm_sched_backend_ops panfrost_sched_ops = {
> @@ -558,6 +599,7 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
>   	int ret, i;
>   
>   	for (i = 0; i < NUM_JOB_SLOTS; i++) {
> +		mutex_init(&js->queue[i].lock);
>   		sched = &js->queue[i].sched;
>   		ret = drm_sched_entity_init(&panfrost_priv->sched_entity[i],
>   					    DRM_SCHED_PRIORITY_NORMAL, &sched,
> @@ -570,10 +612,14 @@ int panfrost_job_open(struct panfrost_file_priv *panfrost_priv)
>   
>   void panfrost_job_close(struct panfrost_file_priv *panfrost_priv)
>   {
> +	struct panfrost_device *pfdev = panfrost_priv->pfdev;
> +	struct panfrost_job_slot *js = pfdev->js;
>   	int i;
>   
> -	for (i = 0; i < NUM_JOB_SLOTS; i++)
> +	for (i = 0; i < NUM_JOB_SLOTS; i++) {
>   		drm_sched_entity_destroy(&panfrost_priv->sched_entity[i]);
> +		mutex_destroy(&js->queue[i].lock);
> +	}
>   }
>   
>   int panfrost_job_is_idle(struct panfrost_device *pfdev)
> 


* Re: [PATCH v2] drm/panfrost: Fix job timeout handling
From: Boris Brezillon @ 2020-10-02 12:06 UTC
  To: Rob Herring, Tomeu Vizoso, Alyssa Rosenzweig, Robin Murphy
  Cc: Steven Price, dri-devel, stable

On Fri,  2 Oct 2020 09:10:32 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> @@ -392,19 +411,41 @@ static void panfrost_job_timedout(struct drm_sched_job *sched_job)
>  		job_read(pfdev, JS_TAIL_LO(js)),
>  		sched_job);
>  
> +	/* Scheduler is already stopped, nothing to do. */
> +	if (!panfrost_scheduler_stop(&pfdev->js->queue[js], sched_job))
> +		return;
> +
>  	if (!mutex_trylock(&pfdev->reset_lock))
>  		return;
>  
> +	mutex_lock(&pfdev->sched_lock);

Oops, sched_lock shouldn't be acquired here, I switched to a per-queue
lock instead.
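
One possible shape for that change, reusing the queue->lock this patch
already introduces (a sketch of a hypothetical v3, not the actual
follow-up patch):

	for (i = 0; i < NUM_JOB_SLOTS; i++) {
		struct drm_gpu_scheduler *sched = &pfdev->js->queue[i].sched;

		/* panfrost_scheduler_stop() already serializes on
		 * queue->lock, so no device-wide sched_lock is needed.
		 */
		if (panfrost_scheduler_stop(&pfdev->js->queue[i], NULL))
			cancel_delayed_work_sync(&sched->work_tdr);

		pfdev->js->queue[i].stopped = false;
	}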


end of thread

Thread overview: 3 messages
2020-10-02  7:10 [PATCH v2] drm/panfrost: Fix job timeout handling Boris Brezillon
2020-10-02 11:08 ` Steven Price
2020-10-02 12:06 ` Boris Brezillon