All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andrey Grodzovsky <Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
Cc: Andrey Grodzovsky
	<Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>,
	christian.koenig-5C7GfCeVMHo@public.gmane.org
Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
Date: Fri, 20 Oct 2017 09:32:07 -0400	[thread overview]
Message-ID: <1508506327-1084-3-git-send-email-Andrey.Grodzovsky@amd.com> (raw)
In-Reply-To: <1508506327-1084-1-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>

Switch from kfifo to SPSC queue.

Bug:
Kfifo is limited at size, during GPU reset it would fill up to limit
and the pushing thread (producer) would wait for the scheduler worker to
consume the items in the fifo while holding reservation lock
on a BO. The gpu reset thread on the other hand blocks the scheduler
during reset. Before it unblocks the sceduler it might want
to recover VRAM and so will try to reserve the same BO the producer
thread is already holding creating a deadlock.

Fix:
Switch from kfifo to SPSC queue which is unlimited in size.

Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
 3 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
index 8bd3810..86838a8 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
@@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
 			   __entry->id = sched_job->id;
 			   __entry->fence = &sched_job->s_fence->finished;
 			   __entry->name = sched_job->sched->name;
-			   __entry->job_count = kfifo_len(
-				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
+			   __entry->job_count = spsc_queue_count(
+				   &sched_job->s_entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
 				   &sched_job->sched->hw_rq_count);
 			   ),
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 1bbbce2..0c9cdc0 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -28,9 +28,14 @@
 #include <drm/drmP.h>
 #include "gpu_scheduler.h"
 
+#include "spsc_queue.h"
+
 #define CREATE_TRACE_POINTS
 #include "gpu_sched_trace.h"
 
+#define to_amd_sched_job(sched_job)		\
+		container_of((sched_job), struct amd_sched_job, queue_node)
+
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
 static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
 static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
@@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 			  struct amd_sched_rq *rq,
 			  uint32_t jobs)
 {
-	int r;
-
 	if (!(sched && entity && rq))
 		return -EINVAL;
 
@@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 
 	spin_lock_init(&entity->rq_lock);
 	spin_lock_init(&entity->queue_lock);
-	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
-	if (r)
-		return r;
+	spsc_queue_init(&entity->job_queue);
 
 	atomic_set(&entity->fence_seq, 0);
 	entity->fence_context = dma_fence_context_alloc(2);
@@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
 static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
 {
 	rmb();
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return true;
 
 	return false;
@@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
  */
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
 {
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return false;
 
 	if (ACCESS_ONCE(entity->dependency))
@@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 		 */
 		kthread_park(sched->thread);
 		kthread_unpark(sched->thread);
-		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
+		while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
 			struct amd_sched_fence *s_fence = job->s_fence;
 			amd_sched_fence_scheduled(s_fence);
 			dma_fence_set_error(&s_fence->finished, -ESRCH);
@@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 			dma_fence_put(&s_fence->finished);
 			sched->ops->free_job(job);
 		}
-
 	}
-	kfifo_free(&entity->job_queue);
 }
 
 static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
 }
 
 static struct amd_sched_job *
-amd_sched_entity_peek_job(struct amd_sched_entity *entity)
+amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 {
 	struct amd_gpu_scheduler *sched = entity->sched;
-	struct amd_sched_job *sched_job;
+	struct amd_sched_job *sched_job = to_amd_sched_job(
+						spsc_queue_peek(&entity->job_queue));
 
-	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
+	if (!sched_job)
 		return NULL;
 
 	while ((entity->dependency = sched->ops->dependency(sched_job)))
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	sched_job->s_entity = NULL;
+	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
 
@@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
  *
  * Returns true if we could submit the job.
  */
-static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
+static void amd_sched_entity_in(struct amd_sched_job *sched_job)
 {
 	struct amd_gpu_scheduler *sched = sched_job->sched;
 	struct amd_sched_entity *entity = sched_job->s_entity;
-	bool added, first = false;
+	bool first = false;
 
 	spin_lock(&entity->queue_lock);
-	added = kfifo_in(&entity->job_queue, &sched_job,
-			sizeof(sched_job)) == sizeof(sched_job);
-
-	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
-		first = true;
+	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	spin_unlock(&entity->queue_lock);
 
@@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
 		spin_unlock(&entity->rq_lock);
 		amd_sched_wakeup(sched);
 	}
-	return added;
 }
 
 /* job_finish is called after hw fence signaled
@@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
  */
 void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
 {
-	struct amd_sched_entity *entity = sched_job->s_entity;
-
 	trace_amd_sched_job(sched_job);
-	wait_event(entity->sched->job_scheduled,
-		   amd_sched_entity_in(sched_job));
+	amd_sched_entity_in(sched_job);
 }
 
 /* init a sched_job with basic field */
@@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
 {
 	struct sched_param sparam = {.sched_priority = 1};
 	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
-	int r, count;
+	int r;
 
 	sched_setscheduler(current, SCHED_FIFO, &sparam);
 
@@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
 		if (!entity)
 			continue;
 
-		sched_job = amd_sched_entity_peek_job(entity);
+		sched_job = amd_sched_entity_pop_job(entity);
 		if (!sched_job)
 			continue;
 
@@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
-		count = kfifo_out(&entity->job_queue, &sched_job,
-				sizeof(sched_job));
-		WARN_ON(count != sizeof(sched_job));
 		wake_up(&sched->job_scheduled);
 	}
 	return 0;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 3f75b45..8844ce7 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -26,6 +26,7 @@
 
 #include <linux/kfifo.h>
 #include <linux/dma-fence.h>
+#include "spsc_queue.h"
 
 struct amd_gpu_scheduler;
 struct amd_sched_rq;
@@ -56,7 +57,7 @@ struct amd_sched_entity {
 	struct amd_gpu_scheduler	*sched;
 
 	spinlock_t			queue_lock;
-	struct kfifo                    job_queue;
+	struct spsc_queue	job_queue;
 
 	atomic_t			fence_seq;
 	uint64_t                        fence_context;
@@ -87,6 +88,7 @@ struct amd_sched_fence {
 };
 
 struct amd_sched_job {
+	struct spsc_node queue_node;
 	struct amd_gpu_scheduler        *sched;
 	struct amd_sched_entity         *s_entity;
 	struct amd_sched_fence          *s_fence;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

  parent reply	other threads:[~2017-10-20 13:32 UTC|newest]

Thread overview: 20+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-10-20 13:32 [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled Andrey Grodzovsky
     [not found] ` <1508506327-1084-1-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
2017-10-20 13:32   ` [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler Andrey Grodzovsky
     [not found]     ` <1508506327-1084-2-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
2017-10-23  4:06       ` Liu, Monk
     [not found]         ` <BLUPR12MB04499B9F742E2CF76139F6BC84460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-10-23 10:50           ` Andrey Grodzovsky
2017-10-23 12:38       ` Christian König
2017-10-20 13:32   ` Andrey Grodzovsky [this message]
     [not found]     ` <1508506327-1084-3-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
2017-10-23  3:03       ` [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset Liu, Monk
     [not found]         ` <BLUPR12MB04491AF9EAD2623B5EA9D7B784460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-10-23  7:08           ` Christian König
     [not found]             ` <8153c0a9-8509-c2c3-c53e-e25f6f7e83f1-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-23  7:24               ` Liu, Monk
     [not found]                 ` <BLUPR12MB04490DF904B056086D164BDD84460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-10-23  7:30                   ` Christian König
     [not found]                     ` <a9ed61a0-6e0b-6093-5c0b-4459cfbb2282-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-23  7:38                       ` Liu, Monk
     [not found]                         ` <BLUPR12MB0449E8F1F121B277B598430684460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
2017-10-23  8:46                           ` Liu, Monk
2017-10-23 12:42       ` Christian König
2017-10-20 15:59   ` [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled Andres Rodriguez
     [not found]     ` <ea764df5-6c80-e1cd-a4cd-562c86c7d553-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-20 16:19       ` Andrey Grodzovsky
     [not found]         ` <ecb8a9d2-ba94-c7a4-f5d7-156b061245d8-5C7GfCeVMHo@public.gmane.org>
2017-10-20 16:26           ` Andres Rodriguez
     [not found]             ` <86853e29-07c1-700b-8aa2-2183fbc64d2f-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-20 16:51               ` Andrey Grodzovsky
     [not found]                 ` <184ba414-981a-02eb-1a6f-7bd878b4f3b1-5C7GfCeVMHo@public.gmane.org>
2017-10-20 18:24                   ` Christian König
     [not found]                     ` <79fd05fa-711c-c454-36a1-54b31adda225-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-10-20 19:31                       ` Andres Rodriguez
2017-10-23 12:36   ` Christian König

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1508506327-1084-3-git-send-email-Andrey.Grodzovsky@amd.com \
    --to=andrey.grodzovsky-5c7gfcevmho@public.gmane.org \
    --cc=amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org \
    --cc=christian.koenig-5C7GfCeVMHo@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.