From: Lucas Stach <l.stach@pengutronix.de>
To: "Christian König" <christian.koenig@amd.com>,
	"Alex Deucher" <alexander.deucher@amd.com>,
	"Andrey Grodzovsky" <andrey.grodzovsky@amd.com>
Cc: kernel@pengutronix.de, dri-devel@lists.freedesktop.org,
	patchwork-lst@pengutronix.de
Subject: [PATCH] drm/scheduler: fix inconsistent locking of job_list_lock
Date: Mon, 20 Jan 2020 11:51:19 +0100
Message-ID: <20200120105119.10237-1-l.stach@pengutronix.de>

1db8c142b6c5 (drm/scheduler: Add drm_sched_suspend/resume_timeout()) made
the job_list_lock IRQ safe, as the suspend/resume calls were expected to
be called from IRQ context. This usage never materialized upstream.
Instead amdgpu started locking the job_list_lock in an IRQ-unsafe way in
amdgpu_ib_preempt_mark_partial_job() and amdgpu_ib_preempt_job_recovery(),
which could lead to a deadlock if anyone actually started calling the
drm_sched_suspend/resume_timeout functions from IRQ context.
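
To illustrate, mixing the two lock flavors opens up the classic spinlock
self-deadlock (hypothetical trace, never observed in practice, since no
current user calls the suspend/resume paths from IRQ context):

  CPU N, process context                CPU N, IRQ context
  ----------------------                ------------------
  amdgpu_ib_preempt_job_recovery()
    spin_lock(&sched->job_list_lock)
                               <IRQ>
                                        drm_sched_resume_timeout()
                                          spin_lock_irqsave(&sched->job_list_lock, ...)
                                          /* spins forever: the lock is
                                             held by the very context
                                             this IRQ interrupted */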

As no current user needs the locking to be IRQ safe, the local IRQ
disable/enable is pure overhead. Fix the inconsistent locking by
switching all uses of job_list_lock to the IRQ-unsafe locking
primitives.
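
Concretely, the change is from the IRQ-safe to the plain spinlock
primitives throughout (illustrative sketch, mirroring the diff below):

	/* before: safe against the lock also being taken from IRQ context */
	spin_lock_irqsave(&sched->job_list_lock, flags);
	/* ... manipulate sched->ring_mirror_list ... */
	spin_unlock_irqrestore(&sched->job_list_lock, flags);

	/* after: sufficient, as all current users run in process context
	 * (workqueue, scheduler kthread, amdgpu debugfs) */
	spin_lock(&sched->job_list_lock);
	/* ... manipulate sched->ring_mirror_list ... */
	spin_unlock(&sched->job_list_lock);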

Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
---
 drivers/gpu/drm/scheduler/sched_main.c | 38 ++++++++++----------------
 1 file changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 3c57e84222ca..d1b24444dd99 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -220,8 +220,7 @@ EXPORT_SYMBOL(drm_sched_fault);
  *
  * Suspend the delayed work timeout for the scheduler. This is done by
  * modifying the delayed work timeout to an arbitrary large value,
- * MAX_SCHEDULE_TIMEOUT in this case. Note that this function can be
- * called from an IRQ context.
+ * MAX_SCHEDULE_TIMEOUT in this case.
  *
  * Returns the timeout remaining
  *
@@ -250,43 +249,39 @@ EXPORT_SYMBOL(drm_sched_suspend_timeout);
  * @sched: scheduler instance for which to resume the timeout
  * @remaining: remaining timeout
  *
- * Resume the delayed work timeout for the scheduler. Note that
- * this function can be called from an IRQ context.
+ * Resume the delayed work timeout for the scheduler.
  */
 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 		unsigned long remaining)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&sched->job_list_lock, flags);
+	spin_lock(&sched->job_list_lock);
 
 	if (list_empty(&sched->ring_mirror_list))
 		cancel_delayed_work(&sched->work_tdr);
 	else
 		mod_delayed_work(system_wq, &sched->work_tdr, remaining);
 
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+	spin_unlock(&sched->job_list_lock);
 }
 EXPORT_SYMBOL(drm_sched_resume_timeout);
 
 static void drm_sched_job_begin(struct drm_sched_job *s_job)
 {
 	struct drm_gpu_scheduler *sched = s_job->sched;
-	unsigned long flags;
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
+	spin_lock(&sched->job_list_lock);
 	list_add_tail(&s_job->node, &sched->ring_mirror_list);
 	drm_sched_start_timeout(sched);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+	spin_unlock(&sched->job_list_lock);
 }
 
 static void drm_sched_job_timedout(struct work_struct *work)
 {
 	struct drm_gpu_scheduler *sched;
 	struct drm_sched_job *job;
-	unsigned long flags;
 
 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
+
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
 				       struct drm_sched_job, node);
 
@@ -303,9 +298,9 @@ static void drm_sched_job_timedout(struct work_struct *work)
 		}
 	}
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
+	spin_lock(&sched->job_list_lock);
 	drm_sched_start_timeout(sched);
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+	spin_unlock(&sched->job_list_lock);
 }
 
  /**
@@ -368,7 +363,6 @@ EXPORT_SYMBOL(drm_sched_increase_karma);
 void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 {
 	struct drm_sched_job *s_job, *tmp;
-	unsigned long flags;
 
 	kthread_park(sched->thread);
 
@@ -388,9 +382,9 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
 			 * remove job from ring_mirror_list.
 			 * Locking here is for concurrent resume timeout
 			 */
-			spin_lock_irqsave(&sched->job_list_lock, flags);
+			spin_lock(&sched->job_list_lock);
 			list_del_init(&s_job->node);
-			spin_unlock_irqrestore(&sched->job_list_lock, flags);
+			spin_unlock(&sched->job_list_lock);
 
 			/*
 			 * Wait for job's HW fence callback to finish using s_job
@@ -433,7 +427,6 @@ EXPORT_SYMBOL(drm_sched_stop);
 void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 {
 	struct drm_sched_job *s_job, *tmp;
-	unsigned long flags;
 	int r;
 
 	/*
@@ -462,9 +455,9 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
 	}
 
 	if (full_recovery) {
-		spin_lock_irqsave(&sched->job_list_lock, flags);
+		spin_lock(&sched->job_list_lock);
 		drm_sched_start_timeout(sched);
-		spin_unlock_irqrestore(&sched->job_list_lock, flags);
+		spin_unlock(&sched->job_list_lock);
 	}
 
 	kthread_unpark(sched->thread);
@@ -648,7 +641,6 @@ static struct drm_sched_job *
 drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 {
 	struct drm_sched_job *job;
-	unsigned long flags;
 
 	/*
 	 * Don't destroy jobs while the timeout worker is running  OR thread
@@ -659,7 +651,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 	    __kthread_should_park(sched->thread))
 		return NULL;
 
-	spin_lock_irqsave(&sched->job_list_lock, flags);
+	spin_lock(&sched->job_list_lock);
 
 	job = list_first_entry_or_null(&sched->ring_mirror_list,
 				       struct drm_sched_job, node);
@@ -673,7 +665,7 @@ drm_sched_get_cleanup_job(struct drm_gpu_scheduler *sched)
 		drm_sched_start_timeout(sched);
 	}
 
-	spin_unlock_irqrestore(&sched->job_list_lock, flags);
+	spin_unlock(&sched->job_list_lock);
 
 	return job;
 }
-- 
2.20.1
