* [PATCH] drm/sched: fix the bug of time out calculation(v2)
@ 2021-08-25  4:14 Monk Liu
  2021-08-25  6:31 ` Christian König
  2021-08-25 12:51 ` Alex Deucher
  0 siblings, 2 replies; 12+ messages in thread
From: Monk Liu @ 2021-08-25  4:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Monk Liu

the original logic is wrong in that the timeout is not re-triggered
after the previous job signals, which leads to all jobs in the same
scheduler sharing the timeout timer started for the very first job in
that scheduler.

we should modify the timer every time a previous job signals.

v2:
further clean up the logic, and cancel the TDR timer if the signaled job
is the last one in its scheduler.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a9536..8c102ac 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
 	struct drm_gpu_scheduler *sched = s_job->sched;
 
 	spin_lock(&sched->job_list_lock);
-	list_add_tail(&s_job->list, &sched->pending_list);
-	drm_sched_start_timeout(sched);
+	if (list_empty(&sched->pending_list)) {
+		list_add_tail(&s_job->list, &sched->pending_list);
+		drm_sched_start_timeout(sched);
+	} else {
+		/* the old jobs in pending list are not finished yet
+		 * no need to restart TDR timer here, it is already
+		 * handled by drm_sched_get_cleanup_job
+		 */
+		list_add_tail(&s_job->list, &sched->pending_list);
+	}
+
 	spin_unlock(&sched->job_list_lock);
 }
 
@@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
 		/* remove job from pending_list */
 		list_del_init(&job->list);
+
 		/* make the scheduled timestamp more accurate */
 		next = list_first_entry_or_null(&sched->pending_list,
 						typeof(*next), list);
-		if (next)
+		if (next) {
+			/* if we still have job in pending list we need modify the TDR timer */
+			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
 			next->s_fence->scheduled.timestamp =
 				job->s_fence->finished.timestamp;
+		} else {
+			/* cancel the TDR timer if no job in pending list */
+			cancel_delayed_work(&sched->work_tdr);
+		}
 
 	} else {
 		job = NULL;
-		/* queue timeout for next job */
-		drm_sched_start_timeout(sched);
 	}
 
 	spin_unlock(&sched->job_list_lock);
@@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
 					  (entity = drm_sched_select_entity(sched))) ||
 					 kthread_should_stop());
 
-		if (cleanup_job) {
+		if (cleanup_job)
 			sched->ops->free_job(cleanup_job);
-			/* queue timeout for next job */
-			drm_sched_start_timeout(sched);
-		}
 
 		if (!entity)
 			continue;
-- 
2.7.4



* Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25  4:14 [PATCH] drm/sched: fix the bug of time out calculation(v2) Monk Liu
@ 2021-08-25  6:31 ` Christian König
  2021-08-25 11:55   ` Liu, Monk
  2021-08-25 12:51 ` Alex Deucher
  1 sibling, 1 reply; 12+ messages in thread
From: Christian König @ 2021-08-25  6:31 UTC (permalink / raw)
  To: Monk Liu, amd-gfx

Well NAK to that approach. First of all, your bug analysis is incorrect.

The timeout started by queue_delayed_work() in drm_sched_start_timeout() 
is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().

So you must have something else going on here.

Then please don't use mod_delayed_work(), instead always cancel it and 
restart it.

Regards,
Christian.
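
The two re-arm patterns being debated can be written out as a short,
illustrative sketch against the stock workqueue API. Only sched->work_tdr
and sched->timeout are real drm_gpu_scheduler fields; the helper names and
the use of system_wq are assumptions for illustration, not code from
sched_main.c or from the posted patch.

/* Illustrative sketch only -- not part of the patch or of sched_main.c. */

/* what the posted patch does: unconditionally (re)arm the TDR timer;
 * mod_delayed_work() re-arms even if the work is already pending */
static void rearm_tdr_with_mod(struct drm_gpu_scheduler *sched)
{
	mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
}

/* what Christian asks for: explicit cancel followed by a fresh queue;
 * queue_delayed_work() by itself is a no-op while the work is pending */
static void rearm_tdr_with_cancel_queue(struct drm_gpu_scheduler *sched)
{
	cancel_delayed_work(&sched->work_tdr);
	queue_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
}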

Am 25.08.21 um 06:14 schrieb Monk Liu:
> the original logic is wrong that the timeout will not be retriggerd
> after the previous job siganled, and that lead to the scenario that all
> jobs in the same scheduler shares the same timeout timer from the very
> begining job in this scheduler which is wrong.
>
> we should modify the timer everytime a previous job signaled.
>
> v2:
> further cleanup the logic, and do the TDR timer cancelling if the signaled job
> is the last one in its scheduler.
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>   1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index a2a9536..8c102ac 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   	struct drm_gpu_scheduler *sched = s_job->sched;
>   
>   	spin_lock(&sched->job_list_lock);
> -	list_add_tail(&s_job->list, &sched->pending_list);
> -	drm_sched_start_timeout(sched);
> +	if (list_empty(&sched->pending_list)) {
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +		drm_sched_start_timeout(sched);
> +	} else {
> +		/* the old jobs in pending list are not finished yet
> +		 * no need to restart TDR timer here, it is already
> +		 * handled by drm_sched_get_cleanup_job
> +		 */
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +	}
> +
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>   	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>   		/* remove job from pending_list */
>   		list_del_init(&job->list);
> +
>   		/* make the scheduled timestamp more accurate */
>   		next = list_first_entry_or_null(&sched->pending_list,
>   						typeof(*next), list);
> -		if (next)
> +		if (next) {
> +			/* if we still have job in pending list we need modify the TDR timer */
> +			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>   			next->s_fence->scheduled.timestamp =
>   				job->s_fence->finished.timestamp;
> +		} else {
> +			/* cancel the TDR timer if no job in pending list */
> +			cancel_delayed_work(&sched->work_tdr);
> +		}
>   
>   	} else {
>   		job = NULL;
> -		/* queue timeout for next job */
> -		drm_sched_start_timeout(sched);
>   	}
>   
>   	spin_unlock(&sched->job_list_lock);
> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>   					  (entity = drm_sched_select_entity(sched))) ||
>   					 kthread_should_stop());
>   
> -		if (cleanup_job) {
> +		if (cleanup_job)
>   			sched->ops->free_job(cleanup_job);
> -			/* queue timeout for next job */
> -			drm_sched_start_timeout(sched);
> -		}
>   
>   		if (!entity)
>   			continue;



* RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25  6:31 ` Christian König
@ 2021-08-25 11:55   ` Liu, Monk
  2021-08-25 12:01     ` Liu, Monk
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Monk @ 2021-08-25 11:55 UTC (permalink / raw)
  To: Christian König, amd-gfx

[AMD Official Use Only]

>>The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().

No, that's wrong. Consider the case where we enter cleanup_job() without a timeout on this sched (we just keep submitting new jobs to it):
the work_tdr is cancelled at the top, then we fetch the heading job; assume that job is not signaled, so we fall through to the "queue timeout for next job" path and drm_sched_start_timeout() is called. The heading job's TO timer is therefore re-triggered and starts counting from zero again, which is totally wrong.

With my patch the timer is only re-triggered after the previous job has really signaled.

Can you be more specific about which part is incorrect?

Thanks 
------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, August 25, 2021 2:32 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

Well NAK to that approach. First of all your bug analyses is incorrect.

The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().

So you must have something else going on here.

Then please don't use mod_delayed_work(), instead always cancel it and restart it.

Regards,
Christian.

Am 25.08.21 um 06:14 schrieb Monk Liu:
> the original logic is wrong that the timeout will not be retriggerd 
> after the previous job siganled, and that lead to the scenario that 
> all jobs in the same scheduler shares the same timeout timer from the 
> very begining job in this scheduler which is wrong.
>
> we should modify the timer everytime a previous job signaled.
>
> v2:
> further cleanup the logic, and do the TDR timer cancelling if the 
> signaled job is the last one in its scheduler.
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>   1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> b/drivers/gpu/drm/scheduler/sched_main.c
> index a2a9536..8c102ac 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   	struct drm_gpu_scheduler *sched = s_job->sched;
>   
>   	spin_lock(&sched->job_list_lock);
> -	list_add_tail(&s_job->list, &sched->pending_list);
> -	drm_sched_start_timeout(sched);
> +	if (list_empty(&sched->pending_list)) {
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +		drm_sched_start_timeout(sched);
> +	} else {
> +		/* the old jobs in pending list are not finished yet
> +		 * no need to restart TDR timer here, it is already
> +		 * handled by drm_sched_get_cleanup_job
> +		 */
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +	}
> +
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>   	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>   		/* remove job from pending_list */
>   		list_del_init(&job->list);
> +
>   		/* make the scheduled timestamp more accurate */
>   		next = list_first_entry_or_null(&sched->pending_list,
>   						typeof(*next), list);
> -		if (next)
> +		if (next) {
> +			/* if we still have job in pending list we need modify the TDR timer */
> +			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>   			next->s_fence->scheduled.timestamp =
>   				job->s_fence->finished.timestamp;
> +		} else {
> +			/* cancel the TDR timer if no job in pending list */
> +			cancel_delayed_work(&sched->work_tdr);
> +		}
>   
>   	} else {
>   		job = NULL;
> -		/* queue timeout for next job */
> -		drm_sched_start_timeout(sched);
>   	}
>   
>   	spin_unlock(&sched->job_list_lock);
> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>   					  (entity = drm_sched_select_entity(sched))) ||
>   					 kthread_should_stop());
>   
> -		if (cleanup_job) {
> +		if (cleanup_job)
>   			sched->ops->free_job(cleanup_job);
> -			/* queue timeout for next job */
> -			drm_sched_start_timeout(sched);
> -		}
>   
>   		if (!entity)
>   			continue;


* RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25 11:55   ` Liu, Monk
@ 2021-08-25 12:01     ` Liu, Monk
  2021-08-25 12:11       ` Christian König
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Monk @ 2021-08-25 12:01 UTC (permalink / raw)
  To: Christian König, amd-gfx

[AMD Official Use Only]

I think we should remove the cancel_delayed_work() at the beginning of cleanup_job().

Because with my patch the mod_delayed_work() in cleanup_job() already does its duty to re-trigger the TO timer accordingly.

Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Liu, Monk 
Sent: Wednesday, August 25, 2021 7:55 PM
To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)

[AMD Official Use Only]

>>The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().

No that's wrong, see that when we are in cleanup_job(), assume we do not have timeout on this sched (we are just keep submitting new jobs to this sched), Then the work_tdr is cancelled, and then we get the heading job, and let's assume the job is not signaled, then we run to the "queue timeout for next job" thus drm_sched_start_timeout() is called, so this heading job's TO timer is actually retriggered ... which is totally wrong.

With my patch the timer is already retriggered after previous JOB really signaled.

Can you be more specific on the incorrect part ?

Thanks
------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com>
Sent: Wednesday, August 25, 2021 2:32 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

Well NAK to that approach. First of all your bug analyses is incorrect.

The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().

So you must have something else going on here.

Then please don't use mod_delayed_work(), instead always cancel it and restart it.

Regards,
Christian.

Am 25.08.21 um 06:14 schrieb Monk Liu:
> the original logic is wrong that the timeout will not be retriggerd 
> after the previous job siganled, and that lead to the scenario that 
> all jobs in the same scheduler shares the same timeout timer from the 
> very begining job in this scheduler which is wrong.
>
> we should modify the timer everytime a previous job signaled.
>
> v2:
> further cleanup the logic, and do the TDR timer cancelling if the 
> signaled job is the last one in its scheduler.
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>   1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> b/drivers/gpu/drm/scheduler/sched_main.c
> index a2a9536..8c102ac 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   	struct drm_gpu_scheduler *sched = s_job->sched;
>   
>   	spin_lock(&sched->job_list_lock);
> -	list_add_tail(&s_job->list, &sched->pending_list);
> -	drm_sched_start_timeout(sched);
> +	if (list_empty(&sched->pending_list)) {
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +		drm_sched_start_timeout(sched);
> +	} else {
> +		/* the old jobs in pending list are not finished yet
> +		 * no need to restart TDR timer here, it is already
> +		 * handled by drm_sched_get_cleanup_job
> +		 */
> +		list_add_tail(&s_job->list, &sched->pending_list);
> +	}
> +
>   	spin_unlock(&sched->job_list_lock);
>   }
>   
> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>   	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>   		/* remove job from pending_list */
>   		list_del_init(&job->list);
> +
>   		/* make the scheduled timestamp more accurate */
>   		next = list_first_entry_or_null(&sched->pending_list,
>   						typeof(*next), list);
> -		if (next)
> +		if (next) {
> +			/* if we still have job in pending list we need modify the TDR timer */
> +			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>   			next->s_fence->scheduled.timestamp =
>   				job->s_fence->finished.timestamp;
> +		} else {
> +			/* cancel the TDR timer if no job in pending list */
> +			cancel_delayed_work(&sched->work_tdr);
> +		}
>   
>   	} else {
>   		job = NULL;
> -		/* queue timeout for next job */
> -		drm_sched_start_timeout(sched);
>   	}
>   
>   	spin_unlock(&sched->job_list_lock);
> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>   					  (entity = drm_sched_select_entity(sched))) ||
>   					 kthread_should_stop());
>   
> -		if (cleanup_job) {
> +		if (cleanup_job)
>   			sched->ops->free_job(cleanup_job);
> -			/* queue timeout for next job */
> -			drm_sched_start_timeout(sched);
> -		}
>   
>   		if (!entity)
>   			continue;


* Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25 12:01     ` Liu, Monk
@ 2021-08-25 12:11       ` Christian König
  2021-08-25 18:20         ` Andrey Grodzovsky
  2021-08-26  1:53         ` Liu, Monk
  0 siblings, 2 replies; 12+ messages in thread
From: Christian König @ 2021-08-25 12:11 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx

No, this would break that logic here.

See, drm_sched_start_timeout() can be called multiple times; this is 
intentional and very important!

The logic in queue_delayed_work() makes sure that the timer is only 
started once and then never again.

All we need to take care of is to cancel_delayed_work() when we know 
that the job is completed.

This here works as intended as far as I can see and if you start to use 
mod_delayed_work() you actually break it.

Regards,
Christian.
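
To make the "only started once" point concrete: drm_sched_start_timeout()
boils down to roughly the sketch below (paraphrased, not the exact upstream
code; the workqueue it actually queues on may differ). Because
queue_delayed_work() does nothing when the work is already pending, calling
the helper repeatedly never re-arms a running timer.

static void drm_sched_start_timeout_sketch(struct drm_gpu_scheduler *sched)
{
	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !list_empty(&sched->pending_list))
		/* no-op if work_tdr is already pending */
		queue_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
}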

Am 25.08.21 um 14:01 schrieb Liu, Monk:
> [AMD Official Use Only]
>
> I think we should remove the cancel_delayed_work() in the beginning of the cleanup_job().
>
> Because by my patch the "mode_delayed_work" in cleanup_job is already doing its duty to retrigger the TO timer accordingly
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Liu, Monk
> Sent: Wednesday, August 25, 2021 7:55 PM
> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>; amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
>
> [AMD Official Use Only]
>
>>> The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().
> No that's wrong, see that when we are in cleanup_job(), assume we do not have timeout on this sched (we are just keep submitting new jobs to this sched), Then the work_tdr is cancelled, and then we get the heading job, and let's assume the job is not signaled, then we run to the "queue timeout for next job" thus drm_sched_start_timeout() is called, so this heading job's TO timer is actually retriggered ... which is totally wrong.
>
> With my patch the timer is already retriggered after previous JOB really signaled.
>
> Can you be more specific on the incorrect part ?
>
> Thanks
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, August 25, 2021 2:32 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
>
> Well NAK to that approach. First of all your bug analyses is incorrect.
>
> The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().
>
> So you must have something else going on here.
>
> Then please don't use mod_delayed_work(), instead always cancel it and restart it.
>
> Regards,
> Christian.
>
> Am 25.08.21 um 06:14 schrieb Monk Liu:
>> the original logic is wrong that the timeout will not be retriggerd
>> after the previous job siganled, and that lead to the scenario that
>> all jobs in the same scheduler shares the same timeout timer from the
>> very begining job in this scheduler which is wrong.
>>
>> we should modify the timer everytime a previous job signaled.
>>
>> v2:
>> further cleanup the logic, and do the TDR timer cancelling if the
>> signaled job is the last one in its scheduler.
>>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>>    1 file changed, 20 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index a2a9536..8c102ac 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>    	struct drm_gpu_scheduler *sched = s_job->sched;
>>    
>>    	spin_lock(&sched->job_list_lock);
>> -	list_add_tail(&s_job->list, &sched->pending_list);
>> -	drm_sched_start_timeout(sched);
>> +	if (list_empty(&sched->pending_list)) {
>> +		list_add_tail(&s_job->list, &sched->pending_list);
>> +		drm_sched_start_timeout(sched);
>> +	} else {
>> +		/* the old jobs in pending list are not finished yet
>> +		 * no need to restart TDR timer here, it is already
>> +		 * handled by drm_sched_get_cleanup_job
>> +		 */
>> +		list_add_tail(&s_job->list, &sched->pending_list);
>> +	}
>> +
>>    	spin_unlock(&sched->job_list_lock);
>>    }
>>    
>> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>>    	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>    		/* remove job from pending_list */
>>    		list_del_init(&job->list);
>> +
>>    		/* make the scheduled timestamp more accurate */
>>    		next = list_first_entry_or_null(&sched->pending_list,
>>    						typeof(*next), list);
>> -		if (next)
>> +		if (next) {
>> +			/* if we still have job in pending list we need modify the TDR timer */
>> +			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>>    			next->s_fence->scheduled.timestamp =
>>    				job->s_fence->finished.timestamp;
>> +		} else {
>> +			/* cancel the TDR timer if no job in pending list */
>> +			cancel_delayed_work(&sched->work_tdr);
>> +		}
>>    
>>    	} else {
>>    		job = NULL;
>> -		/* queue timeout for next job */
>> -		drm_sched_start_timeout(sched);
>>    	}
>>    
>>    	spin_unlock(&sched->job_list_lock);
>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>    					  (entity = drm_sched_select_entity(sched))) ||
>>    					 kthread_should_stop());
>>    
>> -		if (cleanup_job) {
>> +		if (cleanup_job)
>>    			sched->ops->free_job(cleanup_job);
>> -			/* queue timeout for next job */
>> -			drm_sched_start_timeout(sched);
>> -		}
>>    
>>    		if (!entity)
>>    			continue;



* Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25  4:14 [PATCH] drm/sched: fix the bug of time out calculation(v2) Monk Liu
  2021-08-25  6:31 ` Christian König
@ 2021-08-25 12:51 ` Alex Deucher
  1 sibling, 0 replies; 12+ messages in thread
From: Alex Deucher @ 2021-08-25 12:51 UTC (permalink / raw)
  To: Monk Liu; +Cc: amd-gfx list

Please cc dri-devel on all scheduler patches.  It's core functionality.

Alex

On Wed, Aug 25, 2021 at 12:14 AM Monk Liu <Monk.Liu@amd.com> wrote:
>
> the original logic is wrong that the timeout will not be retriggerd
> after the previous job siganled, and that lead to the scenario that all
> jobs in the same scheduler shares the same timeout timer from the very
> begining job in this scheduler which is wrong.
>
> we should modify the timer everytime a previous job signaled.
>
> v2:
> further cleanup the logic, and do the TDR timer cancelling if the signaled job
> is the last one in its scheduler.
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>  1 file changed, 20 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index a2a9536..8c102ac 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>         struct drm_gpu_scheduler *sched = s_job->sched;
>
>         spin_lock(&sched->job_list_lock);
> -       list_add_tail(&s_job->list, &sched->pending_list);
> -       drm_sched_start_timeout(sched);
> +       if (list_empty(&sched->pending_list)) {
> +               list_add_tail(&s_job->list, &sched->pending_list);
> +               drm_sched_start_timeout(sched);
> +       } else {
> +               /* the old jobs in pending list are not finished yet
> +                * no need to restart TDR timer here, it is already
> +                * handled by drm_sched_get_cleanup_job
> +                */
> +               list_add_tail(&s_job->list, &sched->pending_list);
> +       }
> +
>         spin_unlock(&sched->job_list_lock);
>  }
>
> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>         if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>                 /* remove job from pending_list */
>                 list_del_init(&job->list);
> +
>                 /* make the scheduled timestamp more accurate */
>                 next = list_first_entry_or_null(&sched->pending_list,
>                                                 typeof(*next), list);
> -               if (next)
> +               if (next) {
> +                       /* if we still have job in pending list we need modify the TDR timer */
> +                       mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>                         next->s_fence->scheduled.timestamp =
>                                 job->s_fence->finished.timestamp;
> +               } else {
> +                       /* cancel the TDR timer if no job in pending list */
> +                       cancel_delayed_work(&sched->work_tdr);
> +               }
>
>         } else {
>                 job = NULL;
> -               /* queue timeout for next job */
> -               drm_sched_start_timeout(sched);
>         }
>
>         spin_unlock(&sched->job_list_lock);
> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>                                           (entity = drm_sched_select_entity(sched))) ||
>                                          kthread_should_stop());
>
> -               if (cleanup_job) {
> +               if (cleanup_job)
>                         sched->ops->free_job(cleanup_job);
> -                       /* queue timeout for next job */
> -                       drm_sched_start_timeout(sched);
> -               }
>
>                 if (!entity)
>                         continue;
> --
> 2.7.4
>


* Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25 12:11       ` Christian König
@ 2021-08-25 18:20         ` Andrey Grodzovsky
  2021-08-26  2:31           ` Liu, Monk
  2021-08-26  1:53         ` Liu, Monk
  1 sibling, 1 reply; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-08-25 18:20 UTC (permalink / raw)
  To: Christian König, Liu, Monk, amd-gfx, dri-devel


On 2021-08-25 8:11 a.m., Christian König wrote:
> No, this would break that logic here.
>
> See drm_sched_start_timeout() can be called multiple times, this is 
> intentional and very important!
>
> The logic in queue_delayed_work() makes sure that the timer is only 
> started once and then never again.
>
> All we need to take care of is to cancel_delayed_work() when we know 
> that the job is completed.


Seems to me you can only do that when the pending list is empty; otherwise
you risk cancelling a legitimate new timer that was started by the next job,
or not restarting the timer at all because your timer was still pending when
the next job tried to start it again (the common case).
For a non-empty pending list you have to adjust the currently active TDR
timer from your job's TTL to the TTL of the job after you, or just restart
it as Monk does here, which prolongs the timeout more than required but is
still OK I guess.

What about returning to the old scheme of one timer/sched_work per job, so
each job has its own timer, we don't share it, and everything is precise for
each job? Using the locking scheme we already have today, the actual TDR
handler will execute only once while all the others arising from the guilty
job's hang will be rejected (for amdgpu; for other drivers it probably
requires the same locking, or we can move this to the scheduler layer).

Andrey
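
A very rough sketch of what such a per-job timer could look like is below.
It is purely hypothetical: the wrapper struct, helper names and use of
system_wq are invented for illustration, and the shared TDR handler plus the
serialization Andrey mentions are not shown.

struct per_job_tdr_example {
	struct drm_sched_job base;
	struct delayed_work  tdr_work;	/* one timer per job; must be
					 * INIT_DELAYED_WORK()'d with a
					 * driver TDR handler */
};

/* armed when the job is pushed to the hardware */
static void example_arm_tdr(struct per_job_tdr_example *job,
			    struct drm_gpu_scheduler *sched)
{
	queue_delayed_work(system_wq, &job->tdr_work, sched->timeout);
}

/* disarmed when the job's finished fence signals */
static void example_disarm_tdr(struct per_job_tdr_example *job)
{
	cancel_delayed_work(&job->tdr_work);
}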


>
> This here works as intended as far as I can see and if you start to 
> use mod_delayed_work() you actually break it.
>
> Regards,
> Christian.
>
> Am 25.08.21 um 14:01 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>> I think we should remove the cancel_delayed_work() in the beginning 
>> of the cleanup_job().
>>
>> Because by my patch the "mode_delayed_work" in cleanup_job is already 
>> doing its duty to retrigger the TO timer accordingly
>>
>> Thanks
>>
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Liu, Monk
>> Sent: Wednesday, August 25, 2021 7:55 PM
>> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>; 
>> amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
>>
>> [AMD Official Use Only]
>>
>>>> The timeout started by queue_delayed_work() in 
>>>> drm_sched_start_timeout() is paired with the cancel_delayed_work() 
>>>> in drm_sched_get_cleanup_job().
>> No that's wrong, see that when we are in cleanup_job(), assume we do 
>> not have timeout on this sched (we are just keep submitting new jobs 
>> to this sched), Then the work_tdr is cancelled, and then we get the 
>> heading job, and let's assume the job is not signaled, then we run to 
>> the "queue timeout for next job" thus drm_sched_start_timeout() is 
>> called, so this heading job's TO timer is actually retriggered ... 
>> which is totally wrong.
>>
>> With my patch the timer is already retriggered after previous JOB 
>> really signaled.
>>
>> Can you be more specific on the incorrect part ?
>>
>> Thanks
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Wednesday, August 25, 2021 2:32 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
>>
>> Well NAK to that approach. First of all your bug analyses is incorrect.
>>
>> The timeout started by queue_delayed_work() in 
>> drm_sched_start_timeout() is paired with the cancel_delayed_work() in 
>> drm_sched_get_cleanup_job().
>>
>> So you must have something else going on here.
>>
>> Then please don't use mod_delayed_work(), instead always cancel it 
>> and restart it.
>>
>> Regards,
>> Christian.
>>
>> Am 25.08.21 um 06:14 schrieb Monk Liu:
>>> the original logic is wrong that the timeout will not be retriggerd
>>> after the previous job siganled, and that lead to the scenario that
>>> all jobs in the same scheduler shares the same timeout timer from the
>>> very begining job in this scheduler which is wrong.
>>>
>>> we should modify the timer everytime a previous job signaled.
>>>
>>> v2:
>>> further cleanup the logic, and do the TDR timer cancelling if the
>>> signaled job is the last one in its scheduler.
>>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> ---
>>>    drivers/gpu/drm/scheduler/sched_main.c | 29 
>>> ++++++++++++++++++++---------
>>>    1 file changed, 20 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> index a2a9536..8c102ac 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct 
>>> drm_sched_job *s_job)
>>>        struct drm_gpu_scheduler *sched = s_job->sched;
>>>           spin_lock(&sched->job_list_lock);
>>> -    list_add_tail(&s_job->list, &sched->pending_list);
>>> -    drm_sched_start_timeout(sched);
>>> +    if (list_empty(&sched->pending_list)) {
>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>> +        drm_sched_start_timeout(sched);
>>> +    } else {
>>> +        /* the old jobs in pending list are not finished yet
>>> +         * no need to restart TDR timer here, it is already
>>> +         * handled by drm_sched_get_cleanup_job
>>> +         */
>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>> +    }
>>> +
>>>        spin_unlock(&sched->job_list_lock);
>>>    }
>>>    @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct 
>>> drm_gpu_scheduler *sched)
>>>        if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>>            /* remove job from pending_list */
>>>            list_del_init(&job->list);
>>> +
>>>            /* make the scheduled timestamp more accurate */
>>>            next = list_first_entry_or_null(&sched->pending_list,
>>>                            typeof(*next), list);
>>> -        if (next)
>>> +        if (next) {
>>> +            /* if we still have job in pending list we need modify 
>>> the TDR timer */
>>> +            mod_delayed_work(system_wq, &sched->work_tdr, 
>>> sched->timeout);
>>>                next->s_fence->scheduled.timestamp =
>>>                    job->s_fence->finished.timestamp;
>>> +        } else {
>>> +            /* cancel the TDR timer if no job in pending list */
>>> +            cancel_delayed_work(&sched->work_tdr);
>>> +        }
>>>           } else {
>>>            job = NULL;
>>> -        /* queue timeout for next job */
>>> -        drm_sched_start_timeout(sched);
>>>        }
>>>           spin_unlock(&sched->job_list_lock);
>>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>>                          (entity = drm_sched_select_entity(sched))) ||
>>>                         kthread_should_stop());
>>>    -        if (cleanup_job) {
>>> +        if (cleanup_job)
>>>                sched->ops->free_job(cleanup_job);
>>> -            /* queue timeout for next job */
>>> -            drm_sched_start_timeout(sched);
>>> -        }
>>>               if (!entity)
>>>                continue;
>


* RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25 12:11       ` Christian König
  2021-08-25 18:20         ` Andrey Grodzovsky
@ 2021-08-26  1:53         ` Liu, Monk
  1 sibling, 0 replies; 12+ messages in thread
From: Liu, Monk @ 2021-08-26  1:53 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: DRI Development

[AMD Official Use Only]

>> All we need to take care of is to cancel_delayed_work() when we know that the job is completed.

That's why I want to remove the cancel_delayed_work() at the beginning of cleanup_job(): at that moment we don't know yet whether
a job has completed (the sched could have been woken up by a new submission instead of a signaled job) until we fetch the job and confirm that it has signaled.



static struct drm_sched_job *
drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *job, *next;

	/*
	 * Don't destroy jobs while the timeout worker is running  OR thread
	 * is being parked and hence assumed to not touch pending_list
	 */
	if ((sched->timeout != MAX_SCHEDULE_TIMEOUT &&
	    !cancel_delayed_work(&sched->work_tdr)) || // if the job has not timed out, cancelling here is wrong while the job is still running
	    kthread_should_park())
		return NULL;

	spin_lock(&sched->job_list_lock);

	job = list_first_entry_or_null(&sched->pending_list,
				       struct drm_sched_job, list);

	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
		/* remove job from pending_list */
		list_del_init(&job->list);
		/* make the scheduled timestamp more accurate */
		next = list_first_entry_or_null(&sched->pending_list,
						typeof(*next), list);
		if (next)
			next->s_fence->scheduled.timestamp =
				job->s_fence->finished.timestamp;

	} else {
		job = NULL;
		/* queue timeout for next job */
		drm_sched_start_timeout(sched); // if the job is not signaled, the timer is re-armed here and counting restarts from zero, which is wrong
	}

	spin_unlock(&sched->job_list_lock);

	return job;
}



>> This here works as intended as far as I can see and if you start to use mod_delayed_work() you actually break it.
Only the point where we find the heading job signaled and there is a next job is the moment we should cancel the work_tdr for this scheduler, and of course
queue a new work_tdr since the "next" job has already started on the HW... that's why I use mod_delayed_work(). But I can change it to a "cancel and queue" approach if you have concerns.
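
For concreteness, the "cancel and queue" variant mentioned above would
replace the mod_delayed_work() call in the hunk with something like the
sketch below (untested illustration, reusing the same system_wq / work_tdr /
timeout used by the posted patch):

		if (next) {
			/* explicit re-arm: drop the pending TDR work and
			 * queue a fresh one for the next job */
			cancel_delayed_work(&sched->work_tdr);
			queue_delayed_work(system_wq, &sched->work_tdr,
					   sched->timeout);
			next->s_fence->scheduled.timestamp =
				job->s_fence->finished.timestamp;
		} else {
			/* no job left on the pending list */
			cancel_delayed_work(&sched->work_tdr);
		}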


Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Christian König <ckoenig.leichtzumerken@gmail.com> 
Sent: Wednesday, August 25, 2021 8:11 PM
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)

No, this would break that logic here.

See drm_sched_start_timeout() can be called multiple times, this is intentional and very important!

The logic in queue_delayed_work() makes sure that the timer is only started once and then never again.

All we need to take care of is to cancel_delayed_work() when we know that the job is completed.

This here works as intended as far as I can see and if you start to use
mod_delayed_work() you actually break it.

Regards,
Christian.

Am 25.08.21 um 14:01 schrieb Liu, Monk:
> [AMD Official Use Only]
>
> I think we should remove the cancel_delayed_work() in the beginning of the cleanup_job().
>
> Because by my patch the "mode_delayed_work" in cleanup_job is already 
> doing its duty to retrigger the TO timer accordingly
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Liu, Monk
> Sent: Wednesday, August 25, 2021 7:55 PM
> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>; 
> amd-gfx@lists.freedesktop.org
> Subject: RE: [PATCH] drm/sched: fix the bug of time out 
> calculation(v2)
>
> [AMD Official Use Only]
>
>>> The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().
> No that's wrong, see that when we are in cleanup_job(), assume we do not have timeout on this sched (we are just keep submitting new jobs to this sched), Then the work_tdr is cancelled, and then we get the heading job, and let's assume the job is not signaled, then we run to the "queue timeout for next job" thus drm_sched_start_timeout() is called, so this heading job's TO timer is actually retriggered ... which is totally wrong.
>
> With my patch the timer is already retriggered after previous JOB really signaled.
>
> Can you be more specific on the incorrect part ?
>
> Thanks
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken@gmail.com>
> Sent: Wednesday, August 25, 2021 2:32 PM
> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: [PATCH] drm/sched: fix the bug of time out 
> calculation(v2)
>
> Well NAK to that approach. First of all your bug analyses is incorrect.
>
> The timeout started by queue_delayed_work() in drm_sched_start_timeout() is paired with the cancel_delayed_work() in drm_sched_get_cleanup_job().
>
> So you must have something else going on here.
>
> Then please don't use mod_delayed_work(), instead always cancel it and restart it.
>
> Regards,
> Christian.
>
> Am 25.08.21 um 06:14 schrieb Monk Liu:
>> the original logic is wrong that the timeout will not be retriggerd 
>> after the previous job siganled, and that lead to the scenario that 
>> all jobs in the same scheduler shares the same timeout timer from the 
>> very begining job in this scheduler which is wrong.
>>
>> we should modify the timer everytime a previous job signaled.
>>
>> v2:
>> further cleanup the logic, and do the TDR timer cancelling if the 
>> signaled job is the last one in its scheduler.
>>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> ---
>>    drivers/gpu/drm/scheduler/sched_main.c | 29 ++++++++++++++++++++---------
>>    1 file changed, 20 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index a2a9536..8c102ac 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>>    	struct drm_gpu_scheduler *sched = s_job->sched;
>>    
>>    	spin_lock(&sched->job_list_lock);
>> -	list_add_tail(&s_job->list, &sched->pending_list);
>> -	drm_sched_start_timeout(sched);
>> +	if (list_empty(&sched->pending_list)) {
>> +		list_add_tail(&s_job->list, &sched->pending_list);
>> +		drm_sched_start_timeout(sched);
>> +	} else {
>> +		/* the old jobs in pending list are not finished yet
>> +		 * no need to restart TDR timer here, it is already
>> +		 * handled by drm_sched_get_cleanup_job
>> +		 */
>> +		list_add_tail(&s_job->list, &sched->pending_list);
>> +	}
>> +
>>    	spin_unlock(&sched->job_list_lock);
>>    }
>>    
>> @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
>>    	if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>    		/* remove job from pending_list */
>>    		list_del_init(&job->list);
>> +
>>    		/* make the scheduled timestamp more accurate */
>>    		next = list_first_entry_or_null(&sched->pending_list,
>>    						typeof(*next), list);
>> -		if (next)
>> +		if (next) {
>> +			/* if we still have job in pending list we need modify the TDR timer */
>> +			mod_delayed_work(system_wq, &sched->work_tdr, sched->timeout);
>>    			next->s_fence->scheduled.timestamp =
>>    				job->s_fence->finished.timestamp;
>> +		} else {
>> +			/* cancel the TDR timer if no job in pending list */
>> +			cancel_delayed_work(&sched->work_tdr);
>> +		}
>>    
>>    	} else {
>>    		job = NULL;
>> -		/* queue timeout for next job */
>> -		drm_sched_start_timeout(sched);
>>    	}
>>    
>>    	spin_unlock(&sched->job_list_lock);
>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>    					  (entity = drm_sched_select_entity(sched))) ||
>>    					 kthread_should_stop());
>>    
>> -		if (cleanup_job) {
>> +		if (cleanup_job)
>>    			sched->ops->free_job(cleanup_job);
>> -			/* queue timeout for next job */
>> -			drm_sched_start_timeout(sched);
>> -		}
>>    
>>    		if (!entity)
>>    			continue;


* RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-25 18:20         ` Andrey Grodzovsky
@ 2021-08-26  2:31           ` Liu, Monk
  2021-08-26  3:05             ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Monk @ 2021-08-26  2:31 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Christian König, amd-gfx, dri-devel

[AMD Official Use Only]

Hi Andrey

I'm not quite sure if I read you correctly

>>Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).

I don't understand the sentence above. From my understanding, in the common case, if the timer is pending then the cancel_delayed_work() at the beginning will cancel it, and we then get to the "queue timeout for next job" line since the heading job is not signaled (consistent with the timer being pending), so the timer will be restarted (for the next job).

And that sequence is wrong to me, because we cancelled a pending timer and restarted it for a scheduler whose heading job is still running, so the whole count is repeated from zero and not accurate at all.

Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Thursday, August 26, 2021 2:20 AM
To: Christian König <ckoenig.leichtzumerken@gmail.com>; Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; dri-devel <dri-devel@lists.freedesktop.org>
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)


On 2021-08-25 8:11 a.m., Christian König wrote:
> No, this would break that logic here.
>
> See drm_sched_start_timeout() can be called multiple times, this is 
> intentional and very important!
>
> The logic in queue_delayed_work() makes sure that the timer is only 
> started once and then never again.
>
> All we need to take care of is to cancel_delayed_work() when we know 
> that the job is completed.


Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
For non empty pending list you have to adjust the currently active TDR's timer from your's job TTL to TTL to the next job after you or just restart it as Monk does it here which prolongs the timeout more then required but still ok i guess.

What about returning to the old scheme of timer sched_work per job so each job has it's own timer and we don't share it and everything is precise for each job, using the locking scheme we already have today the actual TDR handler will execute only once while all the other arising from the guilty job hang will be rejected (for amdgpu, for other drivers it probably requires same locking or we can move this to the scheduler layer)

Andrey


>
> This here works as intended as far as I can see and if you start to 
> use mod_delayed_work() you actually break it.
>
> Regards,
> Christian.
>
> Am 25.08.21 um 14:01 schrieb Liu, Monk:
>> [AMD Official Use Only]
>>
>> I think we should remove the cancel_delayed_work() in the beginning 
>> of the cleanup_job().
>>
>> Because by my patch the "mode_delayed_work" in cleanup_job is already 
>> doing its duty to retrigger the TO timer accordingly
>>
>> Thanks
>>
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Liu, Monk
>> Sent: Wednesday, August 25, 2021 7:55 PM
>> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>;
>> amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/sched: fix the bug of time out 
>> calculation(v2)
>>
>> [AMD Official Use Only]
>>
>>>> The timeout started by queue_delayed_work() in
>>>> drm_sched_start_timeout() is paired with the cancel_delayed_work() 
>>>> in drm_sched_get_cleanup_job().
>> No that's wrong, see that when we are in cleanup_job(), assume we do 
>> not have timeout on this sched (we are just keep submitting new jobs 
>> to this sched), Then the work_tdr is cancelled, and then we get the 
>> heading job, and let's assume the job is not signaled, then we run to 
>> the "queue timeout for next job" thus drm_sched_start_timeout() is 
>> called, so this heading job's TO timer is actually retriggered ...
>> which is totally wrong.
>>
>> With my patch the timer is already retriggered after previous JOB 
>> really signaled.
>>
>> Can you be more specific on the incorrect part ?
>>
>> Thanks
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>> Sent: Wednesday, August 25, 2021 2:32 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>> Subject: Re: [PATCH] drm/sched: fix the bug of time out 
>> calculation(v2)
>>
>> Well NAK to that approach. First of all your bug analyses is incorrect.
>>
>> The timeout started by queue_delayed_work() in
>> drm_sched_start_timeout() is paired with the cancel_delayed_work() in 
>> drm_sched_get_cleanup_job().
>>
>> So you must have something else going on here.
>>
>> Then please don't use mod_delayed_work(), instead always cancel it 
>> and restart it.
>>
>> Regards,
>> Christian.
>>
>> Am 25.08.21 um 06:14 schrieb Monk Liu:
>>> the original logic is wrong that the timeout will not be retriggerd 
>>> after the previous job siganled, and that lead to the scenario that 
>>> all jobs in the same scheduler shares the same timeout timer from 
>>> the very begining job in this scheduler which is wrong.
>>>
>>> we should modify the timer everytime a previous job signaled.
>>>
>>> v2:
>>> further cleanup the logic, and do the TDR timer cancelling if the 
>>> signaled job is the last one in its scheduler.
>>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> ---
>>>    drivers/gpu/drm/scheduler/sched_main.c | 29
>>> ++++++++++++++++++++---------
>>>    1 file changed, 20 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>> index a2a9536..8c102ac 100644
>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct 
>>> drm_sched_job *s_job)
>>>        struct drm_gpu_scheduler *sched = s_job->sched;
>>>           spin_lock(&sched->job_list_lock);
>>> -    list_add_tail(&s_job->list, &sched->pending_list);
>>> -    drm_sched_start_timeout(sched);
>>> +    if (list_empty(&sched->pending_list)) {
>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>> +        drm_sched_start_timeout(sched);
>>> +    } else {
>>> +        /* the old jobs in pending list are not finished yet
>>> +         * no need to restart TDR timer here, it is already
>>> +         * handled by drm_sched_get_cleanup_job
>>> +         */
>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>> +    }
>>> +
>>>        spin_unlock(&sched->job_list_lock);
>>>    }
>>>    @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct 
>>> drm_gpu_scheduler *sched)
>>>        if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>>            /* remove job from pending_list */
>>>            list_del_init(&job->list);
>>> +
>>>            /* make the scheduled timestamp more accurate */
>>>            next = list_first_entry_or_null(&sched->pending_list,
>>>                            typeof(*next), list);
>>> -        if (next)
>>> +        if (next) {
>>> +            /* if we still have job in pending list we need modify
>>> the TDR timer */
>>> +            mod_delayed_work(system_wq, &sched->work_tdr,
>>> sched->timeout);
>>>                next->s_fence->scheduled.timestamp =
>>>                    job->s_fence->finished.timestamp;
>>> +        } else {
>>> +            /* cancel the TDR timer if no job in pending list */
>>> +            cancel_delayed_work(&sched->work_tdr);
>>> +        }
>>>           } else {
>>>            job = NULL;
>>> -        /* queue timeout for next job */
>>> -        drm_sched_start_timeout(sched);
>>>        }
>>>           spin_unlock(&sched->job_list_lock);
>>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>>                          (entity = drm_sched_select_entity(sched))) 
>>> ||
>>>                         kthread_should_stop());
>>>    -        if (cleanup_job) {
>>> +        if (cleanup_job)
>>>                sched->ops->free_job(cleanup_job);
>>> -            /* queue timeout for next job */
>>> -            drm_sched_start_timeout(sched);
>>> -        }
>>>               if (!entity)
>>>                continue;
>


* Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-26  2:31           ` Liu, Monk
@ 2021-08-26  3:05             ` Andrey Grodzovsky
  2021-08-26  4:55               ` Liu, Monk
  0 siblings, 1 reply; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-08-26  3:05 UTC (permalink / raw)
  To: Liu, Monk, Christian König, amd-gfx, dri-devel


On 2021-08-25 10:31 p.m., Liu, Monk wrote:
> [AMD Official Use Only]
>
> Hi Andrey
>
> I'm not quite sure if I read you correctly
>
>>> Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
> I don't understand above sentence, from my understanding for the common case,  if the timer is pending, the cancel_delay_work in beginning will cancel it and then we will get to the line of "queue timeout for next job" since the heading job is not signaled (align with the timer is pending), then the timer will be restarted (for the next job)


Ignore it, I realized from looking that I missed the timer restart at the
end of drm_sched_get_cleanup_job, or the alternative one in drm_sched_main.


>
> And above sequence is actually wrong to me, because we cancelled a pending timer and restart the timer for the scheduler that its heading job is still running there, the whole counting is repeated from zero and inaccurate at all


But for the timer-pending case (the common case) your mod_delayed_work will
effectively do exactly the same thing if you don't use per-job TTLs: you mod
it to the sched->timeout value, which resets the pending timer to count from
0 again.

I just wonder why we stopped using per-job TDR timers in the first place?
Isn't the simplest way to get an accurate timeout for each job to actually
measure the timeout for each job separately?

Andrey


>   
>
> Thanks

>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Thursday, August 26, 2021 2:20 AM
> To: Christian König <ckoenig.leichtzumerken@gmail.com>; Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; dri-devel <dri-devel@lists.freedesktop.org>
> Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
>
>
> On 2021-08-25 8:11 a.m., Christian König wrote:
>> No, this would break that logic here.
>>
>> See drm_sched_start_timeout() can be called multiple times, this is
>> intentional and very important!
>>
>> The logic in queue_delayed_work() makes sure that the timer is only
>> started once and then never again.
>>
>> All we need to take care of is to cancel_delayed_work() when we know
>> that the job is completed.
>
> Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
> For non empty pending list you have to adjust the currently active TDR's timer from your's job TTL to TTL to the next job after you or just restart it as Monk does it here which prolongs the timeout more then required but still ok i guess.
>
> What about returning to the old scheme of timer sched_work per job so each job has it's own timer and we don't share it and everything is precise for each job, using the locking scheme we already have today the actual TDR handler will execute only once while all the other arising from the guilty job hang will be rejected (for amdgpu, for other drivers it probably requires same locking or we can move this to the scheduler layer)
>
> Andrey
>
>
>> This here works as intended as far as I can see and if you start to
>> use mod_delayed_work() you actually break it.
>>
>> Regards,
>> Christian.
>>
>> Am 25.08.21 um 14:01 schrieb Liu, Monk:
>>> [AMD Official Use Only]
>>>
>>> I think we should remove the cancel_delayed_work() in the beginning
>>> of the cleanup_job().
>>>
>>> Because by my patch the "mode_delayed_work" in cleanup_job is already
>>> doing its duty to retrigger the TO timer accordingly
>>>
>>> Thanks
>>>
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Liu, Monk
>>> Sent: Wednesday, August 25, 2021 7:55 PM
>>> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/sched: fix the bug of time out
>>> calculation(v2)
>>>
>>> [AMD Official Use Only]
>>>
>>>>> The timeout started by queue_delayed_work() in
>>>>> drm_sched_start_timeout() is paired with the cancel_delayed_work()
>>>>> in drm_sched_get_cleanup_job().
>>> No that's wrong, see that when we are in cleanup_job(), assume we do
>>> not have timeout on this sched (we are just keep submitting new jobs
>>> to this sched), Then the work_tdr is cancelled, and then we get the
>>> heading job, and let's assume the job is not signaled, then we run to
>>> the "queue timeout for next job" thus drm_sched_start_timeout() is
>>> called, so this heading job's TO timer is actually retriggered ...
>>> which is totally wrong.
>>>
>>> With my patch the timer is already retriggered after previous JOB
>>> really signaled.
>>>
>>> Can you be more specific on the incorrect part ?
>>>
>>> Thanks
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Wednesday, August 25, 2021 2:32 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/sched: fix the bug of time out
>>> calculation(v2)
>>>
>>> Well NAK to that approach. First of all your bug analyses is incorrect.
>>>
>>> The timeout started by queue_delayed_work() in
>>> drm_sched_start_timeout() is paired with the cancel_delayed_work() in
>>> drm_sched_get_cleanup_job().
>>>
>>> So you must have something else going on here.
>>>
>>> Then please don't use mod_delayed_work(), instead always cancel it
>>> and restart it.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 25.08.21 um 06:14 schrieb Monk Liu:
>>>> the original logic is wrong that the timeout will not be retriggerd
>>>> after the previous job siganled, and that lead to the scenario that
>>>> all jobs in the same scheduler shares the same timeout timer from
>>>> the very begining job in this scheduler which is wrong.
>>>>
>>>> we should modify the timer everytime a previous job signaled.
>>>>
>>>> v2:
>>>> further cleanup the logic, and do the TDR timer cancelling if the
>>>> signaled job is the last one in its scheduler.
>>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/scheduler/sched_main.c | 29
>>>> ++++++++++++++++++++---------
>>>>     1 file changed, 20 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> index a2a9536..8c102ac 100644
>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct
>>>> drm_sched_job *s_job)
>>>>         struct drm_gpu_scheduler *sched = s_job->sched;
>>>>            spin_lock(&sched->job_list_lock);
>>>> -    list_add_tail(&s_job->list, &sched->pending_list);
>>>> -    drm_sched_start_timeout(sched);
>>>> +    if (list_empty(&sched->pending_list)) {
>>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>>> +        drm_sched_start_timeout(sched);
>>>> +    } else {
>>>> +        /* the old jobs in pending list are not finished yet
>>>> +         * no need to restart TDR timer here, it is already
>>>> +         * handled by drm_sched_get_cleanup_job
>>>> +         */
>>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>>> +    }
>>>> +
>>>>         spin_unlock(&sched->job_list_lock);
>>>>     }
>>>>     @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct
>>>> drm_gpu_scheduler *sched)
>>>>         if (job && dma_fence_is_signaled(&job->s_fence->finished)) {
>>>>             /* remove job from pending_list */
>>>>             list_del_init(&job->list);
>>>> +
>>>>             /* make the scheduled timestamp more accurate */
>>>>             next = list_first_entry_or_null(&sched->pending_list,
>>>>                             typeof(*next), list);
>>>> -        if (next)
>>>> +        if (next) {
>>>> +            /* if we still have job in pending list we need modify
>>>> the TDR timer */
>>>> +            mod_delayed_work(system_wq, &sched->work_tdr,
>>>> sched->timeout);
>>>>                 next->s_fence->scheduled.timestamp =
>>>>                     job->s_fence->finished.timestamp;
>>>> +        } else {
>>>> +            /* cancel the TDR timer if no job in pending list */
>>>> +            cancel_delayed_work(&sched->work_tdr);
>>>> +        }
>>>>            } else {
>>>>             job = NULL;
>>>> -        /* queue timeout for next job */
>>>> -        drm_sched_start_timeout(sched);
>>>>         }
>>>>            spin_unlock(&sched->job_list_lock);
>>>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>>>                           (entity = drm_sched_select_entity(sched)))
>>>> ||
>>>>                          kthread_should_stop());
>>>>     -        if (cleanup_job) {
>>>> +        if (cleanup_job)
>>>>                 sched->ops->free_job(cleanup_job);
>>>> -            /* queue timeout for next job */
>>>> -            drm_sched_start_timeout(sched);
>>>> -        }
>>>>                if (!entity)
>>>>                 continue;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-26  3:05             ` Andrey Grodzovsky
@ 2021-08-26  4:55               ` Liu, Monk
  2021-08-26  5:44                 ` Andrey Grodzovsky
  0 siblings, 1 reply; 12+ messages in thread
From: Liu, Monk @ 2021-08-26  4:55 UTC (permalink / raw)
  To: Grodzovsky, Andrey, Christian König, amd-gfx, dri-devel

[AMD Official Use Only]

>> But for the timer-pending case (the common case) your mod_delayed_work will effectively do exactly the same thing if you don't use per-job TTLs - you mod it to the sched->timeout value, which resets the pending timer to count from 0 again.

My patch will only modify the timer (restart it, actually) when the heading job is signaled, which means on the HW ring the next job is just about to start processing.
If the job is not signaled (your common case) the timer is still not touched at all ...

>> I just wonder why we stopped using per-job TDR timers in the first place? Isn't the simplest way to get accurate timeouts to actually measure the timeout for each job separately?

I'm not sure if Christian can recall something, but I believe it was due to some limitations we found (or some race issue like two jobs on the same scheduler timing out at the same time, which is probable if they are scheduled to the ring in almost the same timeframe).
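
If the per-job scheme were ever brought back, that "two jobs time out at
the same time" race could in principle be handled by letting the handlers
race for a single reset token, along the lines of the hypothetical sketch
below (my_dev, in_recovery and reset_lock are illustrative names, not
existing amdgpu or scheduler code):

#include <linux/mutex.h>
#include <linux/atomic.h>

struct my_dev {
	/* assumed to be initialized with mutex_init()/atomic_set() at device init */
	struct mutex reset_lock;
	atomic_t in_recovery;
};

static void my_job_timedout_serialized(struct my_dev *dev)
{
	/* whichever handler flips the flag first performs the recovery,
	 * the other concurrent timeout handlers simply bail out */
	if (atomic_cmpxchg(&dev->in_recovery, 0, 1) != 0)
		return;

	mutex_lock(&dev->reset_lock);
	/* driver-specific hang detection and reset would go here */
	mutex_unlock(&dev->reset_lock);

	atomic_set(&dev->in_recovery, 0);
}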

Anyway, I have a v3 of the patch; please take a look, it seems to work for me.

Thanks 

------------------------------------------
Monk Liu | Cloud-GPU Core team
------------------------------------------

-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com> 
Sent: Thursday, August 26, 2021 11:05 AM
To: Liu, Monk <Monk.Liu@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; amd-gfx@lists.freedesktop.org; dri-devel <dri-devel@lists.freedesktop.org>
Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)


On 2021-08-25 10:31 p.m., Liu, Monk wrote:
> [AMD Official Use Only]
>
> Hi Andrey
>
> I'm not quite sure if I read you correctly
>
>>> Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
> I don't understand above sentence, from my understanding for the 
> common case,  if the timer is pending, the cancel_delay_work in 
> beginning will cancel it and then we will get to the line of "queue 
> timeout for next job" since the heading job is not signaled (align 
> with the timer is pending), then the timer will be restarted (for the 
> next job)


Ignore it, i realized from looking that i missed the timer restart in then end of drm_sched_get_cleanup_job or the alternative one in drm_sched_main


>
> And above sequence is actually wrong to me, because we cancelled a 
> pending timer and restart the timer for the scheduler that its heading 
> job is still running there, the whole counting is repeated from zero 
> and inaccurate at all


But  for timer pending case (common case) your mod_delayed_work will effectively do exactly the same if you don't use per job TTLs - you mod it to  sched->timeout value which resets the pending timer to again count from 0.

I just wonder why we stopped using per job TDR timers in the first place ? Isn't the simplest way to count accurate timeouts for each job is to actually measure the timeouts for each job separately ?

Andrey


>   
>
> Thanks

>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Thursday, August 26, 2021 2:20 AM
> To: Christian König <ckoenig.leichtzumerken@gmail.com>; Liu, Monk 
> <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; dri-devel 
> <dri-devel@lists.freedesktop.org>
> Subject: Re: [PATCH] drm/sched: fix the bug of time out 
> calculation(v2)
>
>
> On 2021-08-25 8:11 a.m., Christian König wrote:
>> No, this would break that logic here.
>>
>> See drm_sched_start_timeout() can be called multiple times, this is 
>> intentional and very important!
>>
>> The logic in queue_delayed_work() makes sure that the timer is only 
>> started once and then never again.
>>
>> All we need to take care of is to cancel_delayed_work() when we know 
>> that the job is completed.
>
> Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
> For non empty pending list you have to adjust the currently active TDR's timer from your's job TTL to TTL to the next job after you or just restart it as Monk does it here which prolongs the timeout more then required but still ok i guess.
>
> What about returning to the old scheme of timer sched_work per job so 
> each job has it's own timer and we don't share it and everything is 
> precise for each job, using the locking scheme we already have today 
> the actual TDR handler will execute only once while all the other 
> arising from the guilty job hang will be rejected (for amdgpu, for 
> other drivers it probably requires same locking or we can move this to 
> the scheduler layer)
>
> Andrey
>
>
>> This here works as intended as far as I can see and if you start to 
>> use mod_delayed_work() you actually break it.
>>
>> Regards,
>> Christian.
>>
>> Am 25.08.21 um 14:01 schrieb Liu, Monk:
>>> [AMD Official Use Only]
>>>
>>> I think we should remove the cancel_delayed_work() in the beginning 
>>> of the cleanup_job().
>>>
>>> Because by my patch the "mode_delayed_work" in cleanup_job is 
>>> already doing its duty to retrigger the TO timer accordingly
>>>
>>> Thanks
>>>
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Liu, Monk
>>> Sent: Wednesday, August 25, 2021 7:55 PM
>>> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>;
>>> amd-gfx@lists.freedesktop.org
>>> Subject: RE: [PATCH] drm/sched: fix the bug of time out
>>> calculation(v2)
>>>
>>> [AMD Official Use Only]
>>>
>>>>> The timeout started by queue_delayed_work() in
>>>>> drm_sched_start_timeout() is paired with the cancel_delayed_work() 
>>>>> in drm_sched_get_cleanup_job().
>>> No that's wrong, see that when we are in cleanup_job(), assume we do 
>>> not have timeout on this sched (we are just keep submitting new jobs 
>>> to this sched), Then the work_tdr is cancelled, and then we get the 
>>> heading job, and let's assume the job is not signaled, then we run 
>>> to the "queue timeout for next job" thus drm_sched_start_timeout() 
>>> is called, so this heading job's TO timer is actually retriggered ...
>>> which is totally wrong.
>>>
>>> With my patch the timer is already retriggered after previous JOB 
>>> really signaled.
>>>
>>> Can you be more specific on the incorrect part ?
>>>
>>> Thanks
>>> ------------------------------------------
>>> Monk Liu | Cloud-GPU Core team
>>> ------------------------------------------
>>>
>>> -----Original Message-----
>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>> Sent: Wednesday, August 25, 2021 2:32 PM
>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/sched: fix the bug of time out
>>> calculation(v2)
>>>
>>> Well NAK to that approach. First of all your bug analyses is incorrect.
>>>
>>> The timeout started by queue_delayed_work() in
>>> drm_sched_start_timeout() is paired with the cancel_delayed_work() 
>>> in drm_sched_get_cleanup_job().
>>>
>>> So you must have something else going on here.
>>>
>>> Then please don't use mod_delayed_work(), instead always cancel it 
>>> and restart it.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 25.08.21 um 06:14 schrieb Monk Liu:
>>>> the original logic is wrong that the timeout will not be retriggerd 
>>>> after the previous job siganled, and that lead to the scenario that 
>>>> all jobs in the same scheduler shares the same timeout timer from 
>>>> the very begining job in this scheduler which is wrong.
>>>>
>>>> we should modify the timer everytime a previous job signaled.
>>>>
>>>> v2:
>>>> further cleanup the logic, and do the TDR timer cancelling if the 
>>>> signaled job is the last one in its scheduler.
>>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> ---
>>>>     drivers/gpu/drm/scheduler/sched_main.c | 29
>>>> ++++++++++++++++++++---------
>>>>     1 file changed, 20 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>> index a2a9536..8c102ac 100644
>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct 
>>>> drm_sched_job *s_job)
>>>>         struct drm_gpu_scheduler *sched = s_job->sched;
>>>>            spin_lock(&sched->job_list_lock);
>>>> -    list_add_tail(&s_job->list, &sched->pending_list);
>>>> -    drm_sched_start_timeout(sched);
>>>> +    if (list_empty(&sched->pending_list)) {
>>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>>> +        drm_sched_start_timeout(sched);
>>>> +    } else {
>>>> +        /* the old jobs in pending list are not finished yet
>>>> +         * no need to restart TDR timer here, it is already
>>>> +         * handled by drm_sched_get_cleanup_job
>>>> +         */
>>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>>> +    }
>>>> +
>>>>         spin_unlock(&sched->job_list_lock);
>>>>     }
>>>>     @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct 
>>>> drm_gpu_scheduler *sched)
>>>>         if (job && dma_fence_is_signaled(&job->s_fence->finished)) 
>>>> {
>>>>             /* remove job from pending_list */
>>>>             list_del_init(&job->list);
>>>> +
>>>>             /* make the scheduled timestamp more accurate */
>>>>             next = list_first_entry_or_null(&sched->pending_list,
>>>>                             typeof(*next), list);
>>>> -        if (next)
>>>> +        if (next) {
>>>> +            /* if we still have job in pending list we need modify
>>>> the TDR timer */
>>>> +            mod_delayed_work(system_wq, &sched->work_tdr,
>>>> sched->timeout);
>>>>                 next->s_fence->scheduled.timestamp =
>>>>                     job->s_fence->finished.timestamp;
>>>> +        } else {
>>>> +            /* cancel the TDR timer if no job in pending list */
>>>> +            cancel_delayed_work(&sched->work_tdr);
>>>> +        }
>>>>            } else {
>>>>             job = NULL;
>>>> -        /* queue timeout for next job */
>>>> -        drm_sched_start_timeout(sched);
>>>>         }
>>>>            spin_unlock(&sched->job_list_lock);
>>>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>>>                           (entity = 
>>>> drm_sched_select_entity(sched)))
>>>> ||
>>>>                          kthread_should_stop());
>>>>     -        if (cleanup_job) {
>>>> +        if (cleanup_job)
>>>>                 sched->ops->free_job(cleanup_job);
>>>> -            /* queue timeout for next job */
>>>> -            drm_sched_start_timeout(sched);
>>>> -        }
>>>>                if (!entity)
>>>>                 continue;

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
  2021-08-26  4:55               ` Liu, Monk
@ 2021-08-26  5:44                 ` Andrey Grodzovsky
  0 siblings, 0 replies; 12+ messages in thread
From: Andrey Grodzovsky @ 2021-08-26  5:44 UTC (permalink / raw)
  To: Liu, Monk, Christian König, amd-gfx, dri-devel


On 2021-08-26 12:55 a.m., Liu, Monk wrote:
> [AMD Official Use Only]
>
>>> But for the timer-pending case (the common case) your mod_delayed_work will effectively do exactly the same thing if you don't use per-job TTLs - you mod it to the sched->timeout value, which resets the pending timer to count from 0 again.
> My patch will only modify the timer (restart it, actually) when the heading job is signaled, which means on the HW ring the next job is just about to start processing.

Not sure this is always true; see this specific test we added long ago:
https://gitlab.freedesktop.org/mesa/drm/-/commit/bc21168fa924d3fc4a000492e861f50a1a135b25
AFAIK a ring doesn't have strict serialization of processing jobs one
after another, especially when 2 jobs are scheduled from different
contexts like in the example above, which means that in this case the
second job might be well into execution for some time when the first
finishes and restarts the TDR timer from scratch.
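
One way to avoid granting the next job a full fresh timeout when the head
job signals would be to re-arm the shared timer only for that job's
remaining budget, using its scheduled-fence timestamp as an approximation
of when it started executing. A hypothetical helper (not part of the patch
or of the scheduler API) could look like this:

#include <linux/ktime.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void my_rearm_for_next(struct delayed_work *work_tdr,
			      long timeout_jiffies, ktime_t next_scheduled)
{
	s64 elapsed_ms = ktime_ms_delta(ktime_get(), next_scheduled);
	long remaining = timeout_jiffies - (long)msecs_to_jiffies(elapsed_ms);

	if (remaining < 0)
		remaining = 0;          /* the next job already used up its budget */

	mod_delayed_work(system_wq, work_tdr, remaining);
}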


> If the job is not signaled (your common case) the timer is still not touched at all ...
>
>>> I just wonder why we stopped using per-job TDR timers in the first place? Isn't the simplest way to get accurate timeouts to actually measure the timeout for each job separately?
> I'm not sure if Christian can recall something, but I believe it was due to some limitations we found (or some race issue like two jobs on the same scheduler timing out at the same time, which is probable if they are scheduled to the ring in almost the same timeframe).
>
> Anyway, I have a v3 of the patch; please take a look, it seems to work for me.


Will take a look tomorrow

Andrey


>   
>
> Thanks
>
> ------------------------------------------
> Monk Liu | Cloud-GPU Core team
> ------------------------------------------
>
> -----Original Message-----
> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
> Sent: Thursday, August 26, 2021 11:05 AM
> To: Liu, Monk <Monk.Liu@amd.com>; Christian König <ckoenig.leichtzumerken@gmail.com>; amd-gfx@lists.freedesktop.org; dri-devel <dri-devel@lists.freedesktop.org>
> Subject: Re: [PATCH] drm/sched: fix the bug of time out calculation(v2)
>
>
> On 2021-08-25 10:31 p.m., Liu, Monk wrote:
>> [AMD Official Use Only]
>>
>> Hi Andrey
>>
>> I'm not quite sure if I read you correctly
>>
>>>> Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
>> I don't understand above sentence, from my understanding for the
>> common case,  if the timer is pending, the cancel_delay_work in
>> beginning will cancel it and then we will get to the line of "queue
>> timeout for next job" since the heading job is not signaled (align
>> with the timer is pending), then the timer will be restarted (for the
>> next job)
>
> Ignore it, i realized from looking that i missed the timer restart in then end of drm_sched_get_cleanup_job or the alternative one in drm_sched_main
>
>
>> And above sequence is actually wrong to me, because we cancelled a
>> pending timer and restart the timer for the scheduler that its heading
>> job is still running there, the whole counting is repeated from zero
>> and inaccurate at all
>
> But  for timer pending case (common case) your mod_delayed_work will effectively do exactly the same if you don't use per job TTLs - you mod it to  sched->timeout value which resets the pending timer to again count from 0.
>
> I just wonder why we stopped using per job TDR timers in the first place ? Isn't the simplest way to count accurate timeouts for each job is to actually measure the timeouts for each job separately ?
>
> Andrey
>
>
>>    
>>
>> Thanks
>> ------------------------------------------
>> Monk Liu | Cloud-GPU Core team
>> ------------------------------------------
>>
>> -----Original Message-----
>> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>
>> Sent: Thursday, August 26, 2021 2:20 AM
>> To: Christian König <ckoenig.leichtzumerken@gmail.com>; Liu, Monk
>> <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org; dri-devel
>> <dri-devel@lists.freedesktop.org>
>> Subject: Re: [PATCH] drm/sched: fix the bug of time out
>> calculation(v2)
>>
>>
>> On 2021-08-25 8:11 a.m., Christian König wrote:
>>> No, this would break that logic here.
>>>
>>> See drm_sched_start_timeout() can be called multiple times, this is
>>> intentional and very important!
>>>
>>> The logic in queue_delayed_work() makes sure that the timer is only
>>> started once and then never again.
>>>
>>> All we need to take care of is to cancel_delayed_work() when we know
>>> that the job is completed.
>> Seems to me you can only do it for empty pending list otherwise you risk cancelling a legit new timer that was started by the next job or not restarting timer at all since your timer was still pending when next job tried to start it again (the common case).
>> For non empty pending list you have to adjust the currently active TDR's timer from your's job TTL to TTL to the next job after you or just restart it as Monk does it here which prolongs the timeout more then required but still ok i guess.
>>
>> What about returning to the old scheme of timer sched_work per job so
>> each job has it's own timer and we don't share it and everything is
>> precise for each job, using the locking scheme we already have today
>> the actual TDR handler will execute only once while all the other
>> arising from the guilty job hang will be rejected (for amdgpu, for
>> other drivers it probably requires same locking or we can move this to
>> the scheduler layer)
>>
>> Andrey
>>
>>
>>> This here works as intended as far as I can see and if you start to
>>> use mod_delayed_work() you actually break it.
>>>
>>> Regards,
>>> Christian.
>>>
>>> Am 25.08.21 um 14:01 schrieb Liu, Monk:
>>>> [AMD Official Use Only]
>>>>
>>>> I think we should remove the cancel_delayed_work() in the beginning
>>>> of the cleanup_job().
>>>>
>>>> Because by my patch the "mode_delayed_work" in cleanup_job is
>>>> already doing its duty to retrigger the TO timer accordingly
>>>>
>>>> Thanks
>>>>
>>>> ------------------------------------------
>>>> Monk Liu | Cloud-GPU Core team
>>>> ------------------------------------------
>>>>
>>>> -----Original Message-----
>>>> From: Liu, Monk
>>>> Sent: Wednesday, August 25, 2021 7:55 PM
>>>> To: 'Christian König' <ckoenig.leichtzumerken@gmail.com>;
>>>> amd-gfx@lists.freedesktop.org
>>>> Subject: RE: [PATCH] drm/sched: fix the bug of time out
>>>> calculation(v2)
>>>>
>>>> [AMD Official Use Only]
>>>>
>>>>>> The timeout started by queue_delayed_work() in
>>>>>> drm_sched_start_timeout() is paired with the cancel_delayed_work()
>>>>>> in drm_sched_get_cleanup_job().
>>>> No that's wrong, see that when we are in cleanup_job(), assume we do
>>>> not have timeout on this sched (we are just keep submitting new jobs
>>>> to this sched), Then the work_tdr is cancelled, and then we get the
>>>> heading job, and let's assume the job is not signaled, then we run
>>>> to the "queue timeout for next job" thus drm_sched_start_timeout()
>>>> is called, so this heading job's TO timer is actually retriggered ...
>>>> which is totally wrong.
>>>>
>>>> With my patch the timer is already retriggered after previous JOB
>>>> really signaled.
>>>>
>>>> Can you be more specific on the incorrect part ?
>>>>
>>>> Thanks
>>>> ------------------------------------------
>>>> Monk Liu | Cloud-GPU Core team
>>>> ------------------------------------------
>>>>
>>>> -----Original Message-----
>>>> From: Christian König <ckoenig.leichtzumerken@gmail.com>
>>>> Sent: Wednesday, August 25, 2021 2:32 PM
>>>> To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
>>>> Subject: Re: [PATCH] drm/sched: fix the bug of time out
>>>> calculation(v2)
>>>>
>>>> Well NAK to that approach. First of all your bug analyses is incorrect.
>>>>
>>>> The timeout started by queue_delayed_work() in
>>>> drm_sched_start_timeout() is paired with the cancel_delayed_work()
>>>> in drm_sched_get_cleanup_job().
>>>>
>>>> So you must have something else going on here.
>>>>
>>>> Then please don't use mod_delayed_work(), instead always cancel it
>>>> and restart it.
>>>>
>>>> Regards,
>>>> Christian.
>>>>
>>>> Am 25.08.21 um 06:14 schrieb Monk Liu:
>>>>> the original logic is wrong that the timeout will not be retriggerd
>>>>> after the previous job siganled, and that lead to the scenario that
>>>>> all jobs in the same scheduler shares the same timeout timer from
>>>>> the very begining job in this scheduler which is wrong.
>>>>>
>>>>> we should modify the timer everytime a previous job signaled.
>>>>>
>>>>> v2:
>>>>> further cleanup the logic, and do the TDR timer cancelling if the
>>>>> signaled job is the last one in its scheduler.
>>>>>
>>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>>> ---
>>>>>      drivers/gpu/drm/scheduler/sched_main.c | 29
>>>>> ++++++++++++++++++++---------
>>>>>      1 file changed, 20 insertions(+), 9 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> index a2a9536..8c102ac 100644
>>>>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>>>>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>>>>> @@ -305,8 +305,17 @@ static void drm_sched_job_begin(struct
>>>>> drm_sched_job *s_job)
>>>>>          struct drm_gpu_scheduler *sched = s_job->sched;
>>>>>             spin_lock(&sched->job_list_lock);
>>>>> -    list_add_tail(&s_job->list, &sched->pending_list);
>>>>> -    drm_sched_start_timeout(sched);
>>>>> +    if (list_empty(&sched->pending_list)) {
>>>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>>>> +        drm_sched_start_timeout(sched);
>>>>> +    } else {
>>>>> +        /* the old jobs in pending list are not finished yet
>>>>> +         * no need to restart TDR timer here, it is already
>>>>> +         * handled by drm_sched_get_cleanup_job
>>>>> +         */
>>>>> +        list_add_tail(&s_job->list, &sched->pending_list);
>>>>> +    }
>>>>> +
>>>>>          spin_unlock(&sched->job_list_lock);
>>>>>      }
>>>>>      @@ -693,17 +702,22 @@ drm_sched_get_cleanup_job(struct
>>>>> drm_gpu_scheduler *sched)
>>>>>          if (job && dma_fence_is_signaled(&job->s_fence->finished))
>>>>> {
>>>>>              /* remove job from pending_list */
>>>>>              list_del_init(&job->list);
>>>>> +
>>>>>              /* make the scheduled timestamp more accurate */
>>>>>              next = list_first_entry_or_null(&sched->pending_list,
>>>>>                              typeof(*next), list);
>>>>> -        if (next)
>>>>> +        if (next) {
>>>>> +            /* if we still have job in pending list we need modify
>>>>> the TDR timer */
>>>>> +            mod_delayed_work(system_wq, &sched->work_tdr,
>>>>> sched->timeout);
>>>>>                  next->s_fence->scheduled.timestamp =
>>>>>                      job->s_fence->finished.timestamp;
>>>>> +        } else {
>>>>> +            /* cancel the TDR timer if no job in pending list */
>>>>> +            cancel_delayed_work(&sched->work_tdr);
>>>>> +        }
>>>>>             } else {
>>>>>              job = NULL;
>>>>> -        /* queue timeout for next job */
>>>>> -        drm_sched_start_timeout(sched);
>>>>>          }
>>>>>             spin_unlock(&sched->job_list_lock);
>>>>> @@ -791,11 +805,8 @@ static int drm_sched_main(void *param)
>>>>>                            (entity =
>>>>> drm_sched_select_entity(sched)))
>>>>> ||
>>>>>                           kthread_should_stop());
>>>>>      -        if (cleanup_job) {
>>>>> +        if (cleanup_job)
>>>>>                  sched->ops->free_job(cleanup_job);
>>>>> -            /* queue timeout for next job */
>>>>> -            drm_sched_start_timeout(sched);
>>>>> -        }
>>>>>                 if (!entity)
>>>>>                  continue;

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2021-08-26  5:44 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-25  4:14 [PATCH] drm/sched: fix the bug of time out calculation(v2) Monk Liu
2021-08-25  6:31 ` Christian König
2021-08-25 11:55   ` Liu, Monk
2021-08-25 12:01     ` Liu, Monk
2021-08-25 12:11       ` Christian König
2021-08-25 18:20         ` Andrey Grodzovsky
2021-08-26  2:31           ` Liu, Monk
2021-08-26  3:05             ` Andrey Grodzovsky
2021-08-26  4:55               ` Liu, Monk
2021-08-26  5:44                 ` Andrey Grodzovsky
2021-08-26  1:53         ` Liu, Monk
2021-08-25 12:51 ` Alex Deucher
