* [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
@ 2017-10-20 13:32 Andrey Grodzovsky
From: Andrey Grodzovsky @ 2017-10-20 13:32 UTC (permalink / raw)
  To: amd-gfx@lists.freedesktop.org
  Cc: Andrey Grodzovsky, christian.koenig@amd.com

Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the allocated
amdgpu_ctx (and the entity inside it) had already been deallocated in
amdgpu_cs_parser_fini.

Fix: Save the job's priority at its creation instead of reading it from
s_entity later on.

Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  5 ++---
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 32 ++++++++++++---------------
 4 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index f7fceb6..a760b6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1192,8 +1192,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	job->uf_sequence = seq;
 
 	amdgpu_job_free_resources(job);
-	amdgpu_ring_priority_get(job->ring,
-				 amd_sched_get_job_priority(&job->base));
+	amdgpu_ring_priority_get(job->ring, job->base.s_priority);
 
 	trace_amdgpu_cs_ioctl(job);
 	amd_sched_entity_push_job(&job->base);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 0cfc68d..a58e3c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -104,7 +104,7 @@ static void amdgpu_job_free_cb(struct amd_sched_job *s_job)
 {
 	struct amdgpu_job *job = container_of(s_job, struct amdgpu_job, base);
 
-	amdgpu_ring_priority_put(job->ring, amd_sched_get_job_priority(s_job));
+	amdgpu_ring_priority_put(job->ring, s_job->s_priority);
 	dma_fence_put(job->fence);
 	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->dep_sync);
@@ -141,8 +141,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct amdgpu_ring *ring,
 	job->fence_ctx = entity->fence_context;
 	*f = dma_fence_get(&job->base.s_fence->finished);
 	amdgpu_job_free_resources(job);
-	amdgpu_ring_priority_get(job->ring,
-				 amd_sched_get_job_priority(&job->base));
+	amdgpu_ring_priority_get(job->ring, job->base.s_priority);
 	amd_sched_entity_push_job(&job->base);
 
 	return 0;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index e4d3b4e..1bbbce2 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -529,6 +529,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
 {
 	job->sched = sched;
 	job->s_entity = entity;
+	job->s_priority = entity->rq - sched->sched_rq;
 	job->s_fence = amd_sched_fence_create(entity, owner);
 	if (!job->s_fence)
 		return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 52c8e54..3f75b45 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -30,6 +30,19 @@
 struct amd_gpu_scheduler;
 struct amd_sched_rq;
 
+enum amd_sched_priority {
+	AMD_SCHED_PRIORITY_MIN,
+	AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
+	AMD_SCHED_PRIORITY_NORMAL,
+	AMD_SCHED_PRIORITY_HIGH_SW,
+	AMD_SCHED_PRIORITY_HIGH_HW,
+	AMD_SCHED_PRIORITY_KERNEL,
+	AMD_SCHED_PRIORITY_MAX,
+	AMD_SCHED_PRIORITY_INVALID = -1,
+	AMD_SCHED_PRIORITY_UNSET = -2
+};
+
+
 /**
  * A scheduler entity is a wrapper around a job queue or a group
  * of other entities. Entities take turns emitting jobs from their
@@ -83,6 +96,7 @@ struct amd_sched_job {
 	struct delayed_work		work_tdr;
 	uint64_t			id;
 	atomic_t karma;
+	enum amd_sched_priority s_priority;
 };
 
 extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
@@ -114,18 +128,6 @@ struct amd_sched_backend_ops {
 	void (*free_job)(struct amd_sched_job *sched_job);
 };
 
-enum amd_sched_priority {
-	AMD_SCHED_PRIORITY_MIN,
-	AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
-	AMD_SCHED_PRIORITY_NORMAL,
-	AMD_SCHED_PRIORITY_HIGH_SW,
-	AMD_SCHED_PRIORITY_HIGH_HW,
-	AMD_SCHED_PRIORITY_KERNEL,
-	AMD_SCHED_PRIORITY_MAX,
-	AMD_SCHED_PRIORITY_INVALID = -1,
-	AMD_SCHED_PRIORITY_UNSET = -2
-};
-
 /**
  * One scheduler is implemented for each hardware ring
 */
@@ -176,10 +178,4 @@ bool amd_sched_dependency_optimized(struct dma_fence* fence,
 				    struct amd_sched_entity *entity);
 void amd_sched_job_kickout(struct amd_sched_job *s_job);
 
-static inline enum amd_sched_priority
-amd_sched_get_job_priority(struct amd_sched_job *job)
-{
-	return (job->s_entity->rq - job->sched->sched_rq);
-}
-
 #endif
-- 
2.7.4
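
A note on what the new job->base.s_priority actually holds: the scheduler
keeps one run queue per priority level in its sched_rq[] array, so the
pointer subtraction "entity->rq - sched->sched_rq" in amd_sched_job_init()
recovers the array index, which equals the enum amd_sched_priority value.
A standalone userspace model of the same arithmetic (an illustrative
sketch only - the names merely mirror the driver's, this is not driver
code):

#include <stdio.h>

/* Model of  job->s_priority = entity->rq - sched->sched_rq;  the run
 * queues live in an array indexed by priority, so subtracting the
 * array base from an element pointer yields the priority index. */
enum prio { PRIO_LOW, PRIO_NORMAL, PRIO_HIGH_SW, PRIO_HIGH_HW,
	    PRIO_KERNEL, PRIO_MAX };

struct rq { int nr_jobs; };
struct sched { struct rq sched_rq[PRIO_MAX]; };
struct entity { struct rq *rq; };

int main(void)
{
	struct sched s = { 0 };
	struct entity e = { .rq = &s.sched_rq[PRIO_HIGH_SW] };
	enum prio p = e.rq - s.sched_rq;	/* pointer subtraction */

	printf("saved priority = %d (PRIO_HIGH_SW = %d)\n", p, PRIO_HIGH_SW);
	return 0;
}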


* [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.
From: Andrey Grodzovsky @ 2017-10-20 13:32 UTC (permalink / raw)
  To: amd-gfx@lists.freedesktop.org
  Cc: Andrey Grodzovsky, christian.koenig@amd.com

It is intended to substitute for the bounded kfifo we are currently
using.

Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/scheduler/spsc_queue.h | 120 +++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/scheduler/spsc_queue.h

diff --git a/drivers/gpu/drm/amd/scheduler/spsc_queue.h b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
new file mode 100644
index 0000000..a3394f1
--- /dev/null
+++ b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef AMD_SCHEDULER_SPSC_QUEUE_H_
+#define AMD_SCHEDULER_SPSC_QUEUE_H_
+
+#include <linux/atomic.h>
+
+/** SPSC lockless queue */
+
+struct spsc_node {
+
+	/* Stores spsc_node* */
+	struct spsc_node *next;
+};
+
+struct spsc_queue {
+
+	 struct spsc_node *head;
+
+	/* atomic pointer to struct spsc_node* */
+	atomic_long_t tail;
+
+	atomic_t job_count;
+};
+
+static inline void spsc_queue_init(struct spsc_queue *queue)
+{
+	queue->head = NULL;
+	atomic_long_set(&queue->tail, (long)&queue->head);
+	atomic_set(&queue->job_count, 0);
+}
+
+static inline struct spsc_node *spsc_queue_peek(struct spsc_queue *queue)
+{
+	return queue->head;
+}
+
+static inline int spsc_queue_count(struct spsc_queue *queue) {
+	return atomic_read(&queue->job_count);
+}
+
+static inline bool spsc_queue_push(struct spsc_queue *queue, struct spsc_node *node)
+{
+	struct spsc_node **tail;
+
+	node->next = NULL;
+
+	preempt_disable();
+
+	tail = (struct spsc_node **)atomic_long_xchg(&queue->tail, (long)&node->next);
+	WRITE_ONCE(*tail, node);
+	atomic_inc(&queue->job_count);
+
+	/*
+	 * In case of first element verify new node will be visible to the consumer
+	 * thread when we ping the kernel thread that there is new work to do.
+	 */
+	smp_wmb();
+
+	preempt_enable();
+
+	return tail == &queue->head;
+}
+
+
+static inline struct spsc_node *spsc_queue_pop(struct spsc_queue *queue)
+{
+	struct spsc_node *next, *node;
+
+	/* Verify reading from memory and not the cache */
+	smp_rmb();
+
+	node = READ_ONCE(queue->head);
+
+	if (!node)
+		return NULL;
+
+	next = READ_ONCE(node->next);
+	WRITE_ONCE(queue->head, next);
+
+	if (unlikely(!next)) {
+		/* slowpath for the last element in the queue */
+
+		if (atomic_long_cmpxchg(&queue->tail,
+				(long)&node->next,(long) &queue->head) != (long)&node->next) {
+			/* Updating tail failed wait for new next to appear */
+			do {
+				smp_rmb();
+			}while (unlikely(!(queue->head = READ_ONCE(node->next))));
+		}
+	}
+
+	atomic_dec(&queue->job_count);
+	return node;
+}
+
+
+
+#endif /* AMD_SCHEDULER_SPSC_QUEUE_H_ */
-- 
2.7.4
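
For reference, the intended usage pattern looks roughly like the sketch
below (an illustration, not part of the patch; the consumer wakeup is a
placeholder). A job embeds a spsc_node, the single producer pushes that
node, and the single consumer pops it and recovers the job with
container_of():

#include <linux/kernel.h>

#include "spsc_queue.h"

struct my_job {
	struct spsc_node queue_node;	/* embedded queue linkage */
	int payload;
};

static struct spsc_queue queue;		/* spsc_queue_init(&queue) at setup */

/* producer side - exactly one thread may push */
static void my_push(struct my_job *job)
{
	/* push returns true if the queue was empty before this node */
	if (spsc_queue_push(&queue, &job->queue_node))
		wake_up_consumer();	/* placeholder for the real wakeup */
}

/* consumer side - exactly one thread may pop */
static struct my_job *my_pop(void)
{
	struct spsc_node *node = spsc_queue_pop(&queue);

	return node ? container_of(node, struct my_job, queue_node) : NULL;
}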


* [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
From: Andrey Grodzovsky @ 2017-10-20 13:32 UTC (permalink / raw)
  To: amd-gfx@lists.freedesktop.org
  Cc: Andrey Grodzovsky, christian.koenig@amd.com

Switch from kfifo to an SPSC queue.

Bug:
The kfifo is bounded in size; during a GPU reset it fills up to its
limit and the pushing thread (the producer) then waits for the
scheduler worker to consume the items in the fifo while holding the
reservation lock on a BO. The GPU reset thread, on the other hand,
blocks the scheduler during reset. Before it unblocks the scheduler it
might want to recover VRAM and so will try to reserve the same BO the
producer thread is already holding, creating a deadlock.

Fix:
Switch from the kfifo to an SPSC queue, which is unbounded in size.

Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
 3 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
index 8bd3810..86838a8 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
@@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
 			   __entry->id = sched_job->id;
 			   __entry->fence = &sched_job->s_fence->finished;
 			   __entry->name = sched_job->sched->name;
-			   __entry->job_count = kfifo_len(
-				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
+			   __entry->job_count = spsc_queue_count(
+				   &sched_job->s_entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
 				   &sched_job->sched->hw_rq_count);
 			   ),
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 1bbbce2..0c9cdc0 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -28,9 +28,14 @@
 #include <drm/drmP.h>
 #include "gpu_scheduler.h"
 
+#include "spsc_queue.h"
+
 #define CREATE_TRACE_POINTS
 #include "gpu_sched_trace.h"
 
+#define to_amd_sched_job(sched_job)		\
+		container_of((sched_job), struct amd_sched_job, queue_node)
+
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
 static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
 static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
@@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 			  struct amd_sched_rq *rq,
 			  uint32_t jobs)
 {
-	int r;
-
 	if (!(sched && entity && rq))
 		return -EINVAL;
 
@@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 
 	spin_lock_init(&entity->rq_lock);
 	spin_lock_init(&entity->queue_lock);
-	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
-	if (r)
-		return r;
+	spsc_queue_init(&entity->job_queue);
 
 	atomic_set(&entity->fence_seq, 0);
 	entity->fence_context = dma_fence_context_alloc(2);
@@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
 static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
 {
 	rmb();
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return true;
 
 	return false;
@@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
  */
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
 {
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return false;
 
 	if (ACCESS_ONCE(entity->dependency))
@@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 		 */
 		kthread_park(sched->thread);
 		kthread_unpark(sched->thread);
-		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
+		while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
 			struct amd_sched_fence *s_fence = job->s_fence;
 			amd_sched_fence_scheduled(s_fence);
 			dma_fence_set_error(&s_fence->finished, -ESRCH);
@@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 			dma_fence_put(&s_fence->finished);
 			sched->ops->free_job(job);
 		}
-
 	}
-	kfifo_free(&entity->job_queue);
 }
 
 static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
 }
 
 static struct amd_sched_job *
-amd_sched_entity_peek_job(struct amd_sched_entity *entity)
+amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 {
 	struct amd_gpu_scheduler *sched = entity->sched;
-	struct amd_sched_job *sched_job;
+	struct amd_sched_job *sched_job = to_amd_sched_job(
+						spsc_queue_peek(&entity->job_queue));
 
-	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
+	if (!sched_job)
 		return NULL;
 
 	while ((entity->dependency = sched->ops->dependency(sched_job)))
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	sched_job->s_entity = NULL;
+	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
 
@@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
  *
  * Returns true if we could submit the job.
  */
-static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
+static void amd_sched_entity_in(struct amd_sched_job *sched_job)
 {
 	struct amd_gpu_scheduler *sched = sched_job->sched;
 	struct amd_sched_entity *entity = sched_job->s_entity;
-	bool added, first = false;
+	bool first = false;
 
 	spin_lock(&entity->queue_lock);
-	added = kfifo_in(&entity->job_queue, &sched_job,
-			sizeof(sched_job)) == sizeof(sched_job);
-
-	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
-		first = true;
+	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	spin_unlock(&entity->queue_lock);
 
@@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
 		spin_unlock(&entity->rq_lock);
 		amd_sched_wakeup(sched);
 	}
-	return added;
 }
 
 /* job_finish is called after hw fence signaled
@@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
  */
 void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
 {
-	struct amd_sched_entity *entity = sched_job->s_entity;
-
 	trace_amd_sched_job(sched_job);
-	wait_event(entity->sched->job_scheduled,
-		   amd_sched_entity_in(sched_job));
+	amd_sched_entity_in(sched_job);
 }
 
 /* init a sched_job with basic field */
@@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
 {
 	struct sched_param sparam = {.sched_priority = 1};
 	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
-	int r, count;
+	int r;
 
 	sched_setscheduler(current, SCHED_FIFO, &sparam);
 
@@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
 		if (!entity)
 			continue;
 
-		sched_job = amd_sched_entity_peek_job(entity);
+		sched_job = amd_sched_entity_pop_job(entity);
 		if (!sched_job)
 			continue;
 
@@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
-		count = kfifo_out(&entity->job_queue, &sched_job,
-				sizeof(sched_job));
-		WARN_ON(count != sizeof(sched_job));
 		wake_up(&sched->job_scheduled);
 	}
 	return 0;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 3f75b45..8844ce7 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -26,6 +26,7 @@
 
 #include <linux/kfifo.h>
 #include <linux/dma-fence.h>
+#include "spsc_queue.h"
 
 struct amd_gpu_scheduler;
 struct amd_sched_rq;
@@ -56,7 +57,7 @@ struct amd_sched_entity {
 	struct amd_gpu_scheduler	*sched;
 
 	spinlock_t			queue_lock;
-	struct kfifo                    job_queue;
+	struct spsc_queue	job_queue;
 
 	atomic_t			fence_seq;
 	uint64_t                        fence_context;
@@ -87,6 +88,7 @@ struct amd_sched_fence {
 };
 
 struct amd_sched_job {
+	struct spsc_node queue_node;
 	struct amd_gpu_scheduler        *sched;
 	struct amd_sched_entity         *s_entity;
 	struct amd_sched_fence          *s_fence;
-- 
2.7.4
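
To make the cycle concrete: the producer holds the BO reservation while
waiting for fifo space, only the scheduler worker drains the fifo, the
worker is parked by the reset thread, and the reset thread needs the
very BO the producer holds. A toy userspace model of that cycle (an
illustration only - pthread primitives stand in for the BO reservation
and the bounded fifo, this is not driver code):

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t bo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t fifo_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t fifo_has_space = PTHREAD_COND_INITIALIZER;
static int fifo_space;	/* 0: the bounded fifo is already full */

static void *producer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&bo_lock);	/* holds the BO reservation... */
	pthread_mutex_lock(&fifo_lock);
	while (fifo_space == 0)		/* ...while waiting for fifo space */
		pthread_cond_wait(&fifo_has_space, &fifo_lock);
	pthread_mutex_unlock(&fifo_lock);
	pthread_mutex_unlock(&bo_lock);
	return NULL;
}

static void *reset_thread(void *arg)
{
	struct timespec to;

	(void)arg;
	clock_gettime(CLOCK_REALTIME, &to);
	to.tv_sec += 2;
	/* the parked worker never signals fifo_has_space, and reset
	 * wants the BO before it would ever unpark the worker */
	if (pthread_mutex_timedlock(&bo_lock, &to))
		printf("deadlock: BO held by producer, fifo never drained\n");
	else
		pthread_mutex_unlock(&bo_lock);
	return NULL;
}

int main(void)
{
	pthread_t p, r;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&r, NULL, reset_thread, NULL);
	pthread_join(r, NULL);
	pthread_cancel(p);	/* the producer is stuck for good */
	pthread_join(p, NULL);
	return 0;
}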


* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
From: Andres Rodriguez @ 2017-10-20 15:59 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx@lists.freedesktop.org
  Cc: christian.koenig@amd.com



On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the allocated
> amdgpu_ctx (and the entity inside it) had already been deallocated in
> amdgpu_cs_parser_fini.
> 
> Fix: Save the job's priority at its creation instead of reading it from
> s_entity later on.

I'm not sure if this is the correct approach for a fix.

Keeping s_entity as a dangling pointer could result in similar bugs 
being reintroduced. For example, there would still be a race condition 
between amdgpu_cs_parser_fini() and amdgpu_job_dependency().

Instead, it might be better for the job to grab a reference to the 
context during job_init(), and put that reference on job free.
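
Illustratively, that could look something like the hypothetical sketch
below (job->ctx and the ctx_get()/ctx_put() helpers are made-up names,
not existing driver code):

/* Hypothetical sketch: pin the context for the whole job lifetime. */
int job_init(struct sched_job *job, struct ctx *ctx)
{
	job->ctx = ctx_get(ctx);	/* take a reference at creation */
	/* ... */
	return 0;
}

static void job_free_cb(struct sched_job *job)
{
	/* ... */
	ctx_put(job->ctx);		/* drop it only when the job is freed */
}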

Regards,
Andres


* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
From: Andrey Grodzovsky @ 2017-10-20 16:19 UTC (permalink / raw)
  To: Andres Rodriguez, amd-gfx@lists.freedesktop.org
  Cc: christian.koenig@amd.com



On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>
>
> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the allocated
>> amdgpu_ctx (and the entity inside it) had already been deallocated in
>> amdgpu_cs_parser_fini.
>>
>> Fix: Save the job's priority at its creation instead of reading it from
>> s_entity later on.
>
> I'm not sure if this is the correct approach for a fix.
>
> Keeping s_entity as a dangling pointer could result in similar bugs 
> being reintroduced. For example, there would still be a race condition 
> between amdgpu_cs_parser_fini() and amdgpu_job_dependency().

The .dependency hook is called in only one place, amd_sched_entity_pop_job; 
amdgpu_cs_parser_fini will wait (from amd_sched_entity_fini) for 
wake_up(&sched->job_scheduled) from amd_sched_main, so I don't see a race 
here.

>
>
> Instead, it might be better for the job to grab a reference to the 
> context during job_init(), and put that reference on job free.

Originally that was my thinking too, but I consulted Christian and he 
advised, quote: "it's not the best idea since the problem is that when 
we terminate a process we need to make sure that all resources are 
released, or at least not held for much longer. When we keep the ctx 
alive with the job we need to also keep the VM alive, and that means we 
need to keep all the VM page tables alive".

Thanks,
Andrey

>
> Regards,
> Andres

* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
From: Andres Rodriguez @ 2017-10-20 16:26 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx@lists.freedesktop.org
  Cc: christian.koenig@amd.com



On 2017-10-20 12:19 PM, Andrey Grodzovsky wrote:
> 
> 
> On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>>
>>
>> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the allocated
>>> amdgpu_ctx (and the entity inside it) had already been deallocated in
>>> amdgpu_cs_parser_fini.
>>>
>>> Fix: Save the job's priority at its creation instead of reading it from
>>> s_entity later on.
>>
>> I'm not sure if this is the correct approach for a fix.
>>
>> Keeping s_entity as a dangling pointer could result in similar bugs 
>> being reintroduced. For example, there would still be a race condition 
>> between amdgpu_cs_parser_fini() and amdgpu_job_dependency().
> 
> The .dependency hook is called in only one place, amd_sched_entity_pop_job; 
> amdgpu_cs_parser_fini will wait (from amd_sched_entity_fini) for 
> wake_up(&sched->job_scheduled) from amd_sched_main, so I don't see a race 
> here.
> 
>>
>>
>> Instead, it might be better for the job to grab a reference to the 
>> context during job_init(), and put that reference on job free.
> 
> Originally that was my thinking too, but I consulted Christian and he 
> advised, quote: "it's not the best idea since the problem is that when 
> we terminate a process we need to make sure that all resources are 
> released, or at least not held for much longer. When we keep the ctx 
> alive with the job we need to also keep the VM alive, and that means 
> we need to keep all the VM page tables alive".
> 

That makes sense.

Since s_entity is tied to the context reference held by the parser, can 
you set it to NULL when you drop the context reference there?

At least that way we can easily detect misuse of s_entity after it 
enters a "possibly deleted" state.

Regards,
Andres

> Thanks,
> Andrey
> 
>>
>> Regards,
>> Andres

* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
From: Andrey Grodzovsky @ 2017-10-20 16:51 UTC (permalink / raw)
  To: Andres Rodriguez, amd-gfx@lists.freedesktop.org
  Cc: christian.koenig@amd.com



On 2017-10-20 12:26 PM, Andres Rodriguez wrote:
>
>
> On 2017-10-20 12:19 PM, Andrey Grodzovsky wrote:
>>
>>
>> On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>>>
>>>
>>> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>>>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the 
>>>> allocated amdgpu_ctx (and the entity inside it) had already been 
>>>> deallocated in amdgpu_cs_parser_fini.
>>>>
>>>> Fix: Save the job's priority at its creation instead of reading it 
>>>> from s_entity later on.
>>>
>>> I'm not sure if this is the correct approach for a fix.
>>>
>>> Keeping s_entity as a dangling pointer could result in similar 
>>> bugs being reintroduced. For example, there would still be a race 
>>> condition between amdgpu_cs_parser_fini() and amdgpu_job_dependency().
>>
>> The .dependency hook is called in only one place, 
>> amd_sched_entity_pop_job; amdgpu_cs_parser_fini will wait (from 
>> amd_sched_entity_fini) for wake_up(&sched->job_scheduled) from 
>> amd_sched_main, so I don't see a race here.
>>
>>>
>>>
>>> Instead, it might be better for the job to grab a reference to the 
>>> context during job_init(), and put that reference on job free.
>>
>> Originally that was my thinking too, but I consulted Christian and he 
>> advised, quote: "it's not the best idea since the problem is that when 
>> we terminate a process we need to make sure that all resources are 
>> released, or at least not held for much longer. When we keep the ctx 
>> alive with the job we need to also keep the VM alive, and that means 
>> we need to keep all the VM page tables alive".
>>
>
> That makes sense.
>
> Since s_entity is tied to the context reference held by the parser, 
> can you set it to NULL when you drop the context reference there?

I am not sure I understand - you want to set s_job->s_entity to NULL in 
amd_sched_entity_fini for each remaining job in the queue? But all the 
jobs remaining in the queue are destroyed there anyway.

Thanks,
Andrey

>
> At least that way we can easily detect misuse of s_entity after it 
> enters a "possibly deleted" state.
>
> Regards,
> Andres
>
>> Thanks,
>> Andrey
>>
>>>
>>> Regards,
>>> Andres

* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
From: Christian König @ 2017-10-20 18:24 UTC (permalink / raw)
  To: Andrey Grodzovsky, Andres Rodriguez,
	amd-gfx@lists.freedesktop.org
  Cc: christian.koenig@amd.com

On 20.10.2017 at 18:51, Andrey Grodzovsky wrote:
>
>
> On 2017-10-20 12:26 PM, Andres Rodriguez wrote:
>>
>>
>> On 2017-10-20 12:19 PM, Andrey Grodzovsky wrote:
>>>
>>>
>>> On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>>>>
>>>>
>>>> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>>>>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the 
>>>>> allocated amdgpu_ctx (and the entity inside it) had already been 
>>>>> deallocated in amdgpu_cs_parser_fini.
>>>>>
>>>>> Fix: Save the job's priority at its creation instead of reading it 
>>>>> from s_entity later on.
>>>>
>>>> I'm not sure if this is the correct approach for a fix.
>>>>
>>>> Keeping s_entity as a dangling pointer could result in similar 
>>>> bugs being reintroduced. For example, there would still be a race 
>>>> condition between amdgpu_cs_parser_fini() and amdgpu_job_dependency().
>>>
>>> The .dependency hook is called in only one place, 
>>> amd_sched_entity_pop_job; amdgpu_cs_parser_fini will wait (from 
>>> amd_sched_entity_fini) for wake_up(&sched->job_scheduled) from 
>>> amd_sched_main, so I don't see a race here.
>>>
>>>>
>>>>
>>>> Instead, it might be better for the job to grab a reference to the 
>>>> context during job_init(), and put that reference on job free.
>>>
>>> Originally that was my thinking too, but I consulted Christian and he 
>>> advised, quote: "it's not the best idea since the problem is that when 
>>> we terminate a process we need to make sure that all resources are 
>>> released, or at least not held for much longer. When we keep the ctx 
>>> alive with the job we need to also keep the VM alive, and that means 
>>> we need to keep all the VM page tables alive".
>>>
>>
>> That makes sense.
>>
>> Since s_entity is tied to the context reference held by the parser, 
>> can you set it to NULL when you drop the context reference there?
>
> I am not sure I understand - you want to set s_job->s_entity to NULL 
> in amd_sched_entity_fini for each remaining job in the queue? But all 
> the jobs remaining in the queue are destroyed there anyway.

I think what Andres means here is exactly what we planned to do anyway. 
Set job->s_entity to NULL as soon as we know that the entity is not used 
any more and might be released.

In the long term we should aim to make s_job->s_entity as well as 
job->vm superfluous. This way we could even push the remaining jobs onto 
a graveyard entity when we destroy one and time out.

Alternatively we could look into why wait_event_killable is sometimes 
not killable as the name says :)

Maybe we can get to a point where we can finally reboot the system 
cleanly even when the GPU is stuck.

Regards,
Christian.

>
> Thanks,
> Andrey
>
>>
>> At least that way we can easily detect misuse of s_entity after it 
>> enters a "possibly deleted" state.
>>
>> Regards,
>> Andres
>>
>>> Thanks,
>>> Andrey
>>>
>>>>
>>>> Regards,
>>>> Andres
>>>>
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  3 +--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  5 ++---
>>>>>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
>>>>>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 32 
>>>>> ++++++++++++---------------
>>>>>   4 files changed, 18 insertions(+), 23 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> index f7fceb6..a760b6e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>> @@ -1192,8 +1192,7 @@ static int amdgpu_cs_submit(struct 
>>>>> amdgpu_cs_parser *p,
>>>>>       job->uf_sequence = seq;
>>>>>         amdgpu_job_free_resources(job);
>>>>> -    amdgpu_ring_priority_get(job->ring,
>>>>> - amd_sched_get_job_priority(&job->base));
>>>>> +    amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>>         trace_amdgpu_cs_ioctl(job);
>>>>>       amd_sched_entity_push_job(&job->base);
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> index 0cfc68d..a58e3c5 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>> @@ -104,7 +104,7 @@ static void amdgpu_job_free_cb(struct 
>>>>> amd_sched_job *s_job)
>>>>>   {
>>>>>       struct amdgpu_job *job = container_of(s_job, struct 
>>>>> amdgpu_job, base);
>>>>>   -    amdgpu_ring_priority_put(job->ring, 
>>>>> amd_sched_get_job_priority(s_job));
>>>>> +    amdgpu_ring_priority_put(job->ring, s_job->s_priority);
>>>>>       dma_fence_put(job->fence);
>>>>>       amdgpu_sync_free(&job->sync);
>>>>>       amdgpu_sync_free(&job->dep_sync);
>>>>> @@ -141,8 +141,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, 
>>>>> struct amdgpu_ring *ring,
>>>>>       job->fence_ctx = entity->fence_context;
>>>>>       *f = dma_fence_get(&job->base.s_fence->finished);
>>>>>       amdgpu_job_free_resources(job);
>>>>> -    amdgpu_ring_priority_get(job->ring,
>>>>> - amd_sched_get_job_priority(&job->base));
>>>>> +    amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>>       amd_sched_entity_push_job(&job->base);
>>>>>         return 0;
>>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
>>>>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>> index e4d3b4e..1bbbce2 100644
>>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>> @@ -529,6 +529,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
>>>>>   {
>>>>>       job->sched = sched;
>>>>>       job->s_entity = entity;
>>>>> +    job->s_priority = entity->rq - sched->sched_rq;
>>>>>       job->s_fence = amd_sched_fence_create(entity, owner);
>>>>>       if (!job->s_fence)
>>>>>           return -ENOMEM;
>>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
>>>>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>> index 52c8e54..3f75b45 100644
>>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>> @@ -30,6 +30,19 @@
>>>>>   struct amd_gpu_scheduler;
>>>>>   struct amd_sched_rq;
>>>>>   +enum amd_sched_priority {
>>>>> +    AMD_SCHED_PRIORITY_MIN,
>>>>> +    AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>>> +    AMD_SCHED_PRIORITY_NORMAL,
>>>>> +    AMD_SCHED_PRIORITY_HIGH_SW,
>>>>> +    AMD_SCHED_PRIORITY_HIGH_HW,
>>>>> +    AMD_SCHED_PRIORITY_KERNEL,
>>>>> +    AMD_SCHED_PRIORITY_MAX,
>>>>> +    AMD_SCHED_PRIORITY_INVALID = -1,
>>>>> +    AMD_SCHED_PRIORITY_UNSET = -2
>>>>> +};
>>>>> +
>>>>> +
>>>>>   /**
>>>>>    * A scheduler entity is a wrapper around a job queue or a group
>>>>>    * of other entities. Entities take turns emitting jobs from their
>>>>> @@ -83,6 +96,7 @@ struct amd_sched_job {
>>>>>       struct delayed_work        work_tdr;
>>>>>       uint64_t            id;
>>>>>       atomic_t karma;
>>>>> +    enum amd_sched_priority s_priority;
>>>>>   };
>>>>>     extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
>>>>> @@ -114,18 +128,6 @@ struct amd_sched_backend_ops {
>>>>>       void (*free_job)(struct amd_sched_job *sched_job);
>>>>>   };
>>>>>   -enum amd_sched_priority {
>>>>> -    AMD_SCHED_PRIORITY_MIN,
>>>>> -    AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>>> -    AMD_SCHED_PRIORITY_NORMAL,
>>>>> -    AMD_SCHED_PRIORITY_HIGH_SW,
>>>>> -    AMD_SCHED_PRIORITY_HIGH_HW,
>>>>> -    AMD_SCHED_PRIORITY_KERNEL,
>>>>> -    AMD_SCHED_PRIORITY_MAX,
>>>>> -    AMD_SCHED_PRIORITY_INVALID = -1,
>>>>> -    AMD_SCHED_PRIORITY_UNSET = -2
>>>>> -};
>>>>> -
>>>>>   /**
>>>>>    * One scheduler is implemented for each hardware ring
>>>>>   */
>>>>> @@ -176,10 +178,4 @@ bool amd_sched_dependency_optimized(struct 
>>>>> dma_fence* fence,
>>>>>                       struct amd_sched_entity *entity);
>>>>>   void amd_sched_job_kickout(struct amd_sched_job *s_job);
>>>>>   -static inline enum amd_sched_priority
>>>>> -amd_sched_get_job_priority(struct amd_sched_job *job)
>>>>> -{
>>>>> -    return (job->s_entity->rq - job->sched->sched_rq);
>>>>> -}
>>>>> -
>>>>>   #endif
>>>>>
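
The job->s_priority = entity->rq - sched->sched_rq assignment in the hunk
above works because the scheduler keeps its run queues in an array indexed
by enum amd_sched_priority, so the pointer difference is exactly the rq's
index (the same arithmetic the removed amd_sched_get_job_priority() helper
did). A standalone C sketch of the idiom - the prio/rq names below are
invented for illustration, not taken from the driver:

#include <stdio.h>

/* Pointer subtraction between elements of the same array is scaled by
 * the element size, so it yields the array index directly. */
enum prio { PRIO_MIN, PRIO_NORMAL, PRIO_HIGH, PRIO_MAX };

struct rq { int placeholder; };

int main(void)
{
	struct rq sched_rq[PRIO_MAX];
	struct rq *entity_rq = &sched_rq[PRIO_HIGH];

	enum prio p = (enum prio)(entity_rq - sched_rq);
	printf("priority index = %d\n", p);	/* prints 2 */
	return 0;
}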


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
       [not found]                     ` <79fd05fa-711c-c454-36a1-54b31adda225-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-20 19:31                       ` Andres Rodriguez
  0 siblings, 0 replies; 20+ messages in thread
From: Andres Rodriguez @ 2017-10-20 19:31 UTC (permalink / raw)
  To: christian.koenig-5C7GfCeVMHo, Andrey Grodzovsky,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW



On 2017-10-20 02:24 PM, Christian König wrote:
> Am 20.10.2017 um 18:51 schrieb Andrey Grodzovsky:
>>
>>
>> On 2017-10-20 12:26 PM, Andres Rodriguez wrote:
>>>
>>>
>>> On 2017-10-20 12:19 PM, Andrey Grodzovsky wrote:
>>>>
>>>>
>>>> On 2017-10-20 11:59 AM, Andres Rodriguez wrote:
>>>>>
>>>>>
>>>>> On 2017-10-20 09:32 AM, Andrey Grodzovsky wrote:
>>>>>> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the 
>>>>>> allocated
>>>>>> amdgpu_ctx (and the entity inside it) were already deallocated from
>>>>>> amdgpu_cs_parser_fini.
>>>>>>
>>>>>> Fix: Save the job's priority on its creation instead of accessing
>>>>>> it from s_entity later on.
>>>>>
>>>>> I'm not sure if this is the correct approach for a fix.
>>>>>
>>>>> Keeping s_entity as a dangling pointer could result in similar 
>>>>> bugs being reintroduced. For example, there would still be a race 
>>>>> condition between amdgpu_cs_parser_fini() and amdgpu_job_dependency().
>>>>
>>>> The .dependency hook is called in only one place, 
>>>> amd_sched_entity_pop_job; amdgpu_cs_parser_fini will wait (from 
>>>> amd_sched_entity_fini) for wake_up(&sched->job_scheduled) from 
>>>> amd_sched_main, so I don't see a race here.
>>>>
>>>>>
>>>>>
>>>>> Instead, it might be better for the job to grab a reference to the 
>>>>> context during job_init(), and put that reference on job free.
>>>>
>>>> Originally that was my thinking too, but I consulted Christian and he 
>>>> advised against it, quote: "it's not the best idea since
>>>> the problem is that when we terminate a process we need to make sure 
>>>> that all resources are released, or at least not held for much 
>>>> longer. When we keep the ctx alive with the job we need to also keep 
>>>> the VM alive, and that means we need to keep all the VM page tables 
>>>> alive".
>>>>
>>>
>>> That makes sense.
>>>
>>> Since s_entity is tied to the context reference held by the parser, 
>>> can you set it to NULL when you drop the context reference there?
>>
>> I am not sure I understand - you want to set s_job->s_entity to NULL 
>> in amd_sched_entity_fini for each remaining job in the queue? But all 
>> the jobs remaining in the queue are destroyed there anyway.
> 
> I think what Andres means here is exactly what we planned to do anyway. 
> Set job->s_entity to NULL as soon as we know that the entity is not used 
> any more and might be released.

Yeah, this is what I would like to see. If you have already discussed it 
and have a plan to address it, then this patch looks good to me for 
static and dynamic priorities.
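
For concreteness, a minimal sketch of that idea (illustrative only:
peek() and dequeue() are hypothetical placeholders, only the s_entity
field is from the scheduler code in this thread) - clear the back
pointer the moment a job leaves the entity's queue, so a stale access
faults loudly instead of dereferencing a freed entity:

	struct amd_sched_job *job = peek(entity);	/* hypothetical helper */

	if (job) {
		job->s_entity = NULL;	/* entity may be freed after this point */
		dequeue(entity);	/* hypothetical helper */
	}

This is in fact what patch 3/3 later in this thread does in its pop path.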

Feel free to add:
Reviewed-by: Andres Rodriguez <andresx7@gmail.com>

> 
> In the long term we should aim towards making s_job->s_entity as well 
> as job->vm superfluous. This way we could even push remaining jobs onto a 
> graveyard entity when we destroy one and time out.
> 
> Alternatively we could look into why wait_event_killable is sometimes 
> not killable as the name says :)
> 
> Maybe we can get to a point where we can finally reboot the system 
> cleanly even when the GPU is stuck.
> 
> Regards,
> Christian.
> 
>>
>> Thanks,
>> Andrey
>>
>>>
>>> At least that way we can easily detect misuse of s_entity after it 
>>> enters a "possibly deleted" state.
>>>
>>> Regards,
>>> Andres
>>>
>>>> Thanks,
>>>> Andrey
>>>>
>>>>>
>>>>> Regards,
>>>>> Andres
>>>>>
>>>>>>
>>>>>> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        |  3 +--
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  5 ++---
>>>>>>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
>>>>>>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 32 
>>>>>> ++++++++++++---------------
>>>>>>   4 files changed, 18 insertions(+), 23 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>> index f7fceb6..a760b6e 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>>>>> @@ -1192,8 +1192,7 @@ static int amdgpu_cs_submit(struct 
>>>>>> amdgpu_cs_parser *p,
>>>>>>       job->uf_sequence = seq;
>>>>>>         amdgpu_job_free_resources(job);
>>>>>> -    amdgpu_ring_priority_get(job->ring,
>>>>>> - amd_sched_get_job_priority(&job->base));
>>>>>> +    amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>>>         trace_amdgpu_cs_ioctl(job);
>>>>>>       amd_sched_entity_push_job(&job->base);
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>> index 0cfc68d..a58e3c5 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>>>> @@ -104,7 +104,7 @@ static void amdgpu_job_free_cb(struct 
>>>>>> amd_sched_job *s_job)
>>>>>>   {
>>>>>>       struct amdgpu_job *job = container_of(s_job, struct 
>>>>>> amdgpu_job, base);
>>>>>>   -    amdgpu_ring_priority_put(job->ring, 
>>>>>> amd_sched_get_job_priority(s_job));
>>>>>> +    amdgpu_ring_priority_put(job->ring, s_job->s_priority);
>>>>>>       dma_fence_put(job->fence);
>>>>>>       amdgpu_sync_free(&job->sync);
>>>>>>       amdgpu_sync_free(&job->dep_sync);
>>>>>> @@ -141,8 +141,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, 
>>>>>> struct amdgpu_ring *ring,
>>>>>>       job->fence_ctx = entity->fence_context;
>>>>>>       *f = dma_fence_get(&job->base.s_fence->finished);
>>>>>>       amdgpu_job_free_resources(job);
>>>>>> -    amdgpu_ring_priority_get(job->ring,
>>>>>> - amd_sched_get_job_priority(&job->base));
>>>>>> +    amdgpu_ring_priority_get(job->ring, job->base.s_priority);
>>>>>>       amd_sched_entity_push_job(&job->base);
>>>>>>         return 0;
>>>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
>>>>>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>>> index e4d3b4e..1bbbce2 100644
>>>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>>>>>> @@ -529,6 +529,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
>>>>>>   {
>>>>>>       job->sched = sched;
>>>>>>       job->s_entity = entity;
>>>>>> +    job->s_priority = entity->rq - sched->sched_rq;
>>>>>>       job->s_fence = amd_sched_fence_create(entity, owner);
>>>>>>       if (!job->s_fence)
>>>>>>           return -ENOMEM;
>>>>>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
>>>>>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>>> index 52c8e54..3f75b45 100644
>>>>>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>>>>>> @@ -30,6 +30,19 @@
>>>>>>   struct amd_gpu_scheduler;
>>>>>>   struct amd_sched_rq;
>>>>>>   +enum amd_sched_priority {
>>>>>> +    AMD_SCHED_PRIORITY_MIN,
>>>>>> +    AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>>>> +    AMD_SCHED_PRIORITY_NORMAL,
>>>>>> +    AMD_SCHED_PRIORITY_HIGH_SW,
>>>>>> +    AMD_SCHED_PRIORITY_HIGH_HW,
>>>>>> +    AMD_SCHED_PRIORITY_KERNEL,
>>>>>> +    AMD_SCHED_PRIORITY_MAX,
>>>>>> +    AMD_SCHED_PRIORITY_INVALID = -1,
>>>>>> +    AMD_SCHED_PRIORITY_UNSET = -2
>>>>>> +};
>>>>>> +
>>>>>> +
>>>>>>   /**
>>>>>>    * A scheduler entity is a wrapper around a job queue or a group
>>>>>>    * of other entities. Entities take turns emitting jobs from their
>>>>>> @@ -83,6 +96,7 @@ struct amd_sched_job {
>>>>>>       struct delayed_work        work_tdr;
>>>>>>       uint64_t            id;
>>>>>>       atomic_t karma;
>>>>>> +    enum amd_sched_priority s_priority;
>>>>>>   };
>>>>>>     extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
>>>>>> @@ -114,18 +128,6 @@ struct amd_sched_backend_ops {
>>>>>>       void (*free_job)(struct amd_sched_job *sched_job);
>>>>>>   };
>>>>>>   -enum amd_sched_priority {
>>>>>> -    AMD_SCHED_PRIORITY_MIN,
>>>>>> -    AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
>>>>>> -    AMD_SCHED_PRIORITY_NORMAL,
>>>>>> -    AMD_SCHED_PRIORITY_HIGH_SW,
>>>>>> -    AMD_SCHED_PRIORITY_HIGH_HW,
>>>>>> -    AMD_SCHED_PRIORITY_KERNEL,
>>>>>> -    AMD_SCHED_PRIORITY_MAX,
>>>>>> -    AMD_SCHED_PRIORITY_INVALID = -1,
>>>>>> -    AMD_SCHED_PRIORITY_UNSET = -2
>>>>>> -};
>>>>>> -
>>>>>>   /**
>>>>>>    * One scheduler is implemented for each hardware ring
>>>>>>   */
>>>>>> @@ -176,10 +178,4 @@ bool amd_sched_dependency_optimized(struct 
>>>>>> dma_fence* fence,
>>>>>>                       struct amd_sched_entity *entity);
>>>>>>   void amd_sched_job_kickout(struct amd_sched_job *s_job);
>>>>>>   -static inline enum amd_sched_priority
>>>>>> -amd_sched_get_job_priority(struct amd_sched_job *job)
>>>>>> -{
>>>>>> -    return (job->s_entity->rq - job->sched->sched_rq);
>>>>>> -}
>>>>>> -
>>>>>>   #endif
>>>>>>
>>>>
>>
> 
> 
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]     ` <1508506327-1084-3-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-23  3:03       ` Liu, Monk
       [not found]         ` <BLUPR12MB04491AF9EAD2623B5EA9D7B784460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-10-23 12:42       ` Christian König
  1 sibling, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2017-10-23  3:03 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Koenig, Christian

Why not use a simpler way?

Like moving ttm_eu_fence_buffer_objects() to before amd_sched_entity_push_job()?
That could solve the deadlock from your description.

And the push order is already guaranteed by context->mutex (which is also a patch from you).


BR Monk

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Andrey Grodzovsky
Sent: 2017年10月20日 21:32
To: amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.

Switch from kfifo to SPSC queue.

Bug:
Kfifo is limited in size; during GPU reset it would fill up to its limit, and the pushing thread (producer) would wait for the scheduler worker to consume the items in the fifo while holding the reservation lock on a BO. The GPU reset thread, on the other hand, blocks the scheduler during reset. Before it unblocks the scheduler it might want to recover VRAM, and so will try to reserve the same BO the producer thread is already holding, creating a deadlock.

Fix:
Switch from kfifo to an SPSC queue, which is unlimited in size.
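
In sequence form (an illustrative sketch of the two threads, not part of the patch itself):

  pushing thread (producer)            GPU reset thread
  -------------------------            ----------------
  reserve BO                           block the scheduler worker
  kfifo full -> wait for the           recover VRAM -> try to reserve
  scheduler worker to drain it         the same BO -> wait for the
  ... worker is blocked by the         producer to release it
  reset thread, so wait forever        ... which never happens

With an unbounded queue the producer never has to wait, so the cycle cannot form.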

Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
 3 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
index 8bd3810..86838a8 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
@@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
 			   __entry->id = sched_job->id;
 			   __entry->fence = &sched_job->s_fence->finished;
 			   __entry->name = sched_job->sched->name;
-			   __entry->job_count = kfifo_len(
-				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
+			   __entry->job_count = spsc_queue_count(
+				   &sched_job->s_entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
 				   &sched_job->sched->hw_rq_count);
 			   ),
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 1bbbce2..0c9cdc0 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -28,9 +28,14 @@
 #include <drm/drmP.h>
 #include "gpu_scheduler.h"
 
+#include "spsc_queue.h"
+
 #define CREATE_TRACE_POINTS
 #include "gpu_sched_trace.h"
 
+#define to_amd_sched_job(sched_job)		\
+		container_of((sched_job), struct amd_sched_job, queue_node)
+
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
 static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
 static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
@@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 			  struct amd_sched_rq *rq,
 			  uint32_t jobs)
 {
-	int r;
-
 	if (!(sched && entity && rq))
 		return -EINVAL;
 
@@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
 
 	spin_lock_init(&entity->rq_lock);
 	spin_lock_init(&entity->queue_lock);
-	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
-	if (r)
-		return r;
+	spsc_queue_init(&entity->job_queue);
 
 	atomic_set(&entity->fence_seq, 0);
 	entity->fence_context = dma_fence_context_alloc(2);
@@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
 static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
 {
 	rmb();
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return true;
 
 	return false;
@@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
  */
 static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
 {
-	if (kfifo_is_empty(&entity->job_queue))
+	if (spsc_queue_peek(&entity->job_queue) == NULL)
 		return false;
 
 	if (ACCESS_ONCE(entity->dependency))
@@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 		 */
 		kthread_park(sched->thread);
 		kthread_unpark(sched->thread);
-		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
+		while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
 			struct amd_sched_fence *s_fence = job->s_fence;
 			amd_sched_fence_scheduled(s_fence);
 			dma_fence_set_error(&s_fence->finished, -ESRCH);
@@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
 			dma_fence_put(&s_fence->finished);
 			sched->ops->free_job(job);
 		}
-
 	}
-	kfifo_free(&entity->job_queue);
 }
 
 static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
 }
 
 static struct amd_sched_job *
-amd_sched_entity_peek_job(struct amd_sched_entity *entity)
+amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 {
 	struct amd_gpu_scheduler *sched = entity->sched;
-	struct amd_sched_job *sched_job;
+	struct amd_sched_job *sched_job = to_amd_sched_job(
+						spsc_queue_peek(&entity->job_queue));
 
-	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
+	if (!sched_job)
 		return NULL;
 
 	while ((entity->dependency = sched->ops->dependency(sched_job)))
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;
 
+	sched_job->s_entity = NULL;
+	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
 
@@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
  *
  * Returns true if we could submit the job.
  */
-static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
+static void amd_sched_entity_in(struct amd_sched_job *sched_job)
 {
 	struct amd_gpu_scheduler *sched = sched_job->sched;
 	struct amd_sched_entity *entity = sched_job->s_entity;
-	bool added, first = false;
+	bool first = false;
 
 	spin_lock(&entity->queue_lock);
-	added = kfifo_in(&entity->job_queue, &sched_job,
-			sizeof(sched_job)) == sizeof(sched_job);
-
-	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
-		first = true;
+	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
 
 	spin_unlock(&entity->queue_lock);
 
@@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
 		spin_unlock(&entity->rq_lock);
 		amd_sched_wakeup(sched);
 	}
-	return added;
 }
 
 /* job_finish is called after hw fence signaled
@@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
  */
 void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
 {
-	struct amd_sched_entity *entity = sched_job->s_entity;
-
 	trace_amd_sched_job(sched_job);
-	wait_event(entity->sched->job_scheduled,
-		   amd_sched_entity_in(sched_job));
+	amd_sched_entity_in(sched_job);
 }
 
 /* init a sched_job with basic field */
@@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
 {
 	struct sched_param sparam = {.sched_priority = 1};
 	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
-	int r, count;
+	int r;
 
 	sched_setscheduler(current, SCHED_FIFO, &sparam);
 
@@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
 		if (!entity)
 			continue;
 
-		sched_job = amd_sched_entity_peek_job(entity);
+		sched_job = amd_sched_entity_pop_job(entity);
 		if (!sched_job)
 			continue;
 
@@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 
-		count = kfifo_out(&entity->job_queue, &sched_job,
-				sizeof(sched_job));
-		WARN_ON(count != sizeof(sched_job));
 		wake_up(&sched->job_scheduled);
 	}
 	return 0;
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index 3f75b45..8844ce7 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -26,6 +26,7 @@
 
 #include <linux/kfifo.h>
 #include <linux/dma-fence.h>
+#include "spsc_queue.h"
 
 struct amd_gpu_scheduler;
 struct amd_sched_rq;
@@ -56,7 +57,7 @@ struct amd_sched_entity {
 	struct amd_gpu_scheduler	*sched;
 
 	spinlock_t			queue_lock;
-	struct kfifo                    job_queue;
+	struct spsc_queue	job_queue;
 
 	atomic_t			fence_seq;
 	uint64_t                        fence_context;
@@ -87,6 +88,7 @@ struct amd_sched_fence {
 };
 
 struct amd_sched_job {
+	struct spsc_node queue_node;
 	struct amd_gpu_scheduler        *sched;
 	struct amd_sched_entity         *s_entity;
 	struct amd_sched_fence          *s_fence;
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.
       [not found]     ` <1508506327-1084-2-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
@ 2017-10-23  4:06       ` Liu, Monk
       [not found]         ` <BLUPR12MB04499B9F742E2CF76139F6BC84460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  2017-10-23 12:38       ` Christian König
  1 sibling, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2017-10-23  4:06 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Grodzovsky, Andrey, Koenig, Christian

If the deadlock issue can be solved, I don't see why we should give up kfifo and switch to SPSC...

-----Original Message-----
From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Andrey Grodzovsky
Sent: 2017年10月20日 21:32
To: amd-gfx@lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.

It is intended to replace the bounded fifo we are currently using.

Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/scheduler/spsc_queue.h | 120 +++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/scheduler/spsc_queue.h

diff --git a/drivers/gpu/drm/amd/scheduler/spsc_queue.h b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
new file mode 100644
index 0000000..a3394f1
--- /dev/null
+++ b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef AMD_SCHEDULER_SPSC_QUEUE_H_
+#define AMD_SCHEDULER_SPSC_QUEUE_H_
+
+#include <linux/atomic.h>
+
+/** SPSC lockless queue */
+
+struct spsc_node {
+
+	/* Stores spsc_node* */
+	struct spsc_node *next;
+};
+
+struct spsc_queue {
+
+	 struct spsc_node *head;
+
+	/* atomic pointer to struct spsc_node* */
+	atomic_long_t tail;
+
+	atomic_t job_count;
+};
+
+static inline void spsc_queue_init(struct spsc_queue *queue)
+{
+	queue->head = NULL;
+	atomic_long_set(&queue->tail, (long)&queue->head);
+	atomic_set(&queue->job_count, 0);
+}
+
+static inline struct spsc_node *spsc_queue_peek(struct spsc_queue *queue)
+{
+	return queue->head;
+}
+
+static inline int spsc_queue_count(struct spsc_queue *queue)
+{
+	return atomic_read(&queue->job_count);
+}
+
+static inline bool spsc_queue_push(struct spsc_queue *queue, struct spsc_node *node)
+{
+	struct spsc_node **tail;
+
+	node->next = NULL;
+
+	preempt_disable();
+
+	tail = (struct spsc_node **)atomic_long_xchg(&queue->tail, (long)&node->next);
+	WRITE_ONCE(*tail, node);
+	atomic_inc(&queue->job_count);
+
+	/*
+	 * In the case of the first element, make sure the new node will be
+	 * visible to the consumer thread when we ping the kernel thread that
+	 * there is new work to do.
+	 */
+	smp_wmb();
+
+	preempt_enable();
+
+	return tail == &queue->head;
+}
+
+
+static inline struct spsc_node *spsc_queue_pop(struct spsc_queue *queue)
+{
+	struct spsc_node *next, *node;
+
+	/* Verify reading from memory and not the cache */
+	smp_rmb();
+
+	node = READ_ONCE(queue->head);
+
+	if (!node)
+		return NULL;
+
+	next = READ_ONCE(node->next);
+	WRITE_ONCE(queue->head, next);
+
+	if (unlikely(!next)) {
+		/* slowpath for the last element in the queue */
+
+		if (atomic_long_cmpxchg(&queue->tail,
+				(long)&node->next, (long)&queue->head) != (long)&node->next) {
+			/* Updating tail failed wait for new next to appear */
+			do {
+				smp_rmb();
+			} while (unlikely(!(queue->head = READ_ONCE(node->next))));
+		}
+	}
+
+	atomic_dec(&queue->job_count);
+	return node;
+}
+
+
+
+#endif /* AMD_SCHEDULER_SPSC_QUEUE_H_ */
--
2.7.4
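
For readers new to intrusive lockless queues, a minimal usage sketch of
the API above (my_job and its payload field are invented names; the real
producer/consumer pair is the pushing thread and the scheduler worker in
patch 3/3):

#include <linux/kernel.h>
#include "spsc_queue.h"

struct my_job {
	struct spsc_node node;	/* embedded linkage: no per-push allocation */
	int payload;
};

static struct spsc_queue queue;	/* spsc_queue_init(&queue) done once at setup */

/* single producer */
static void submit(struct my_job *job)
{
	if (spsc_queue_push(&queue, &job->node)) {
		/* push returned true: the queue was empty before, so the
		 * consumer may be idle and should be woken up here */
	}
}

/* single consumer */
static void drain(void)
{
	struct spsc_node *n;

	while ((n = spsc_queue_pop(&queue))) {
		struct my_job *job = container_of(n, struct my_job, node);
		/* process job->payload, then free the job */
	}
}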

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]         ` <BLUPR12MB04491AF9EAD2623B5EA9D7B784460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-10-23  7:08           ` Christian König
       [not found]             ` <8153c0a9-8509-c2c3-c53e-e25f6f7e83f1-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2017-10-23  7:08 UTC (permalink / raw)
  To: Liu, Monk, Grodzovsky, Andrey, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: Koenig, Christian

We discussed that as well; the problem is that this won't be sufficient.

We push to the kfifo not only during command submission, but also for VM 
updates and TTM buffer moves.

So we can still deadlock because of them.

Regards,
Christian.

Am 23.10.2017 um 05:03 schrieb Liu, Monk:
> Why not use a simpler way?
>
> Like moving ttm_eu_fence_buffer_objects() to before amd_sched_entity_push_job()?
> That could solve the deadlock from your description.
>
> And the push order is already guaranteed by context->mutex (which is also a patch from you).
>
>
> BR Monk
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf Of Andrey Grodzovsky
> Sent: 2017年10月20日 21:32
> To: amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
>
> Switch from kfifo to SPSC queue.
>
> Bug:
> Kfifo is limited in size; during GPU reset it would fill up to its limit, and the pushing thread (producer) would wait for the scheduler worker to consume the items in the fifo while holding the reservation lock on a BO. The GPU reset thread, on the other hand, blocks the scheduler during reset. Before it unblocks the scheduler it might want to recover VRAM, and so will try to reserve the same BO the producer thread is already holding, creating a deadlock.
>
> Fix:
> Switch from kfifo to SPSC queue which is unlimited in size.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
>   3 files changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> index 8bd3810..86838a8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
>   			   __entry->id = sched_job->id;
>   			   __entry->fence = &sched_job->s_fence->finished;
>   			   __entry->name = sched_job->sched->name;
> -			   __entry->job_count = kfifo_len(
> -				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
> +			   __entry->job_count = spsc_queue_count(
> +				   &sched_job->s_entity->job_queue);
>   			   __entry->hw_job_count = atomic_read(
>   				   &sched_job->sched->hw_rq_count);
>   			   ),
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 1bbbce2..0c9cdc0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -28,9 +28,14 @@
>   #include <drm/drmP.h>
>   #include "gpu_scheduler.h"
>   
> +#include "spsc_queue.h"
> +
>   #define CREATE_TRACE_POINTS
>   #include "gpu_sched_trace.h"
>   
> +#define to_amd_sched_job(sched_job)		\
> +		container_of((sched_job), struct amd_sched_job, queue_node)
> +
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);  static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);  static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>   			  struct amd_sched_rq *rq,
>   			  uint32_t jobs)
>   {
> -	int r;
> -
>   	if (!(sched && entity && rq))
>   		return -EINVAL;
>   
> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>   
>   	spin_lock_init(&entity->rq_lock);
>   	spin_lock_init(&entity->queue_lock);
> -	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
> -	if (r)
> -		return r;
> +	spsc_queue_init(&entity->job_queue);
>   
>   	atomic_set(&entity->fence_seq, 0);
>   	entity->fence_context = dma_fence_context_alloc(2); @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,  static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)  {
>   	rmb();
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>   		return true;
>   
>   	return false;
> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>    */
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)  {
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>   		return false;
>   
>   	if (ACCESS_ONCE(entity->dependency))
> @@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   		 */
>   		kthread_park(sched->thread);
>   		kthread_unpark(sched->thread);
> -		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
> +		while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue))))
> +{
>   			struct amd_sched_fence *s_fence = job->s_fence;
>   			amd_sched_fence_scheduled(s_fence);
>   			dma_fence_set_error(&s_fence->finished, -ESRCH); @@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   			dma_fence_put(&s_fence->finished);
>   			sched->ops->free_job(job);
>   		}
> -
>   	}
> -	kfifo_free(&entity->job_queue);
>   }
>   
>   static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb) @@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)  }
>   
>   static struct amd_sched_job *
> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   {
>   	struct amd_gpu_scheduler *sched = entity->sched;
> -	struct amd_sched_job *sched_job;
> +	struct amd_sched_job *sched_job = to_amd_sched_job(
> +						spsc_queue_peek(&entity->job_queue));
>   
> -	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
> +	if (!sched_job)
>   		return NULL;
>   
>   	while ((entity->dependency = sched->ops->dependency(sched_job)))
>   		if (amd_sched_entity_add_dependency_cb(entity))
>   			return NULL;
>   
> +	sched_job->s_entity = NULL;
> +	spsc_queue_pop(&entity->job_queue);
>   	return sched_job;
>   }
>   
> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
>    *
>    * Returns true if we could submit the job.
>    */
> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
>   {
>   	struct amd_gpu_scheduler *sched = sched_job->sched;
>   	struct amd_sched_entity *entity = sched_job->s_entity;
> -	bool added, first = false;
> +	bool first = false;
>   
>   	spin_lock(&entity->queue_lock);
> -	added = kfifo_in(&entity->job_queue, &sched_job,
> -			sizeof(sched_job)) == sizeof(sched_job);
> -
> -	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
> -		first = true;
> +	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>   
>   	spin_unlock(&entity->queue_lock);
>   
> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
>   		spin_unlock(&entity->rq_lock);
>   		amd_sched_wakeup(sched);
>   	}
> -	return added;
>   }
>   
>   /* job_finish is called after hw fence signaled @@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>    */
>   void amd_sched_entity_push_job(struct amd_sched_job *sched_job)  {
> -	struct amd_sched_entity *entity = sched_job->s_entity;
> -
>   	trace_amd_sched_job(sched_job);
> -	wait_event(entity->sched->job_scheduled,
> -		   amd_sched_entity_in(sched_job));
> +	amd_sched_entity_in(sched_job);
>   }
>   
>   /* init a sched_job with basic field */ @@ -611,7 +605,7 @@ static int amd_sched_main(void *param)  {
>   	struct sched_param sparam = {.sched_priority = 1};
>   	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
> -	int r, count;
> +	int r;
>   
>   	sched_setscheduler(current, SCHED_FIFO, &sparam);
>   
> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
>   		if (!entity)
>   			continue;
>   
> -		sched_job = amd_sched_entity_peek_job(entity);
> +		sched_job = amd_sched_entity_pop_job(entity);
>   		if (!sched_job)
>   			continue;
>   
> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   
> -		count = kfifo_out(&entity->job_queue, &sched_job,
> -				sizeof(sched_job));
> -		WARN_ON(count != sizeof(sched_job));
>   		wake_up(&sched->job_scheduled);
>   	}
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 3f75b45..8844ce7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -26,6 +26,7 @@
>   
>   #include <linux/kfifo.h>
>   #include <linux/dma-fence.h>
> +#include "spsc_queue.h"
>   
>   struct amd_gpu_scheduler;
>   struct amd_sched_rq;
> @@ -56,7 +57,7 @@ struct amd_sched_entity {
>   	struct amd_gpu_scheduler	*sched;
>   
>   	spinlock_t			queue_lock;
> -	struct kfifo                    job_queue;
> +	struct spsc_queue	job_queue;
>   
>   	atomic_t			fence_seq;
>   	uint64_t                        fence_context;
> @@ -87,6 +88,7 @@ struct amd_sched_fence {  };
>   
>   struct amd_sched_job {
> +	struct spsc_node queue_node;
>   	struct amd_gpu_scheduler        *sched;
>   	struct amd_sched_entity         *s_entity;
>   	struct amd_sched_fence          *s_fence;
> --
> 2.7.4
>


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]             ` <8153c0a9-8509-c2c3-c53e-e25f6f7e83f1-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-23  7:24               ` Liu, Monk
       [not found]                 ` <BLUPR12MB04490DF904B056086D164BDD84460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2017-10-23  7:24 UTC (permalink / raw)
  To: Koenig, Christian, Grodzovsky, Andrey,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

For VM updates the situation is the same as for command submission; the deadlock can also be solved by moving ttm_eu_fence_buffer_objects() ahead of push_job().
For TTM buffer moves, can we pardon the gpu_reset routine and wait till the TTM moves complete?

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 2017年10月23日 15:09
To: Liu, Monk <Monk.Liu@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.

We discussed that as well; the problem is that this won't be sufficient.

We push to the kfifo not only during command submission, but also for VM updates and TTM buffer moves.

So we can still deadlock because of them.

Regards,
Christian.

Am 23.10.2017 um 05:03 schrieb Liu, Monk:
> Why not use a simpler way?
>
> Like moving ttm_eu_fence_buffer_objects() to before amd_sched_entity_push_job()?
> That could solve the deadlock from your description.
>
> And the push order is already guaranteed by context->mutex (which is
> also a patch from you).
>
>
> BR Monk
>
> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf 
> Of Andrey Grodzovsky
> Sent: 2017年10月20日 21:32
> To: amd-gfx@lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian 
> <Christian.Koenig@amd.com>
> Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
>
> Switch from kfifo to SPSC queue.
>
> Bug:
> Kfifo is limited in size; during GPU reset it would fill up to its limit, and the pushing thread (producer) would wait for the scheduler worker to consume the items in the fifo while holding the reservation lock on a BO. The GPU reset thread, on the other hand, blocks the scheduler during reset. Before it unblocks the scheduler it might want to recover VRAM, and so will try to reserve the same BO the producer thread is already holding, creating a deadlock.
>
> Fix:
> Switch from kfifo to SPSC queue which is unlimited in size.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
>   3 files changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> index 8bd3810..86838a8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
>   			   __entry->id = sched_job->id;
>   			   __entry->fence = &sched_job->s_fence->finished;
>   			   __entry->name = sched_job->sched->name;
> -			   __entry->job_count = kfifo_len(
> -				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
> +			   __entry->job_count = spsc_queue_count(
> +				   &sched_job->s_entity->job_queue);
>   			   __entry->hw_job_count = atomic_read(
>   				   &sched_job->sched->hw_rq_count);
>   			   ),
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 1bbbce2..0c9cdc0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -28,9 +28,14 @@
>   #include <drm/drmP.h>
>   #include "gpu_scheduler.h"
>   
> +#include "spsc_queue.h"
> +
>   #define CREATE_TRACE_POINTS
>   #include "gpu_sched_trace.h"
>   
> +#define to_amd_sched_job(sched_job)		\
> +		container_of((sched_job), struct amd_sched_job, queue_node)
> +
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);  static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);  static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>   			  struct amd_sched_rq *rq,
>   			  uint32_t jobs)
>   {
> -	int r;
> -
>   	if (!(sched && entity && rq))
>   		return -EINVAL;
>   
> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler 
> *sched,
>   
>   	spin_lock_init(&entity->rq_lock);
>   	spin_lock_init(&entity->queue_lock);
> -	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
> -	if (r)
> -		return r;
> +	spsc_queue_init(&entity->job_queue);
>   
>   	atomic_set(&entity->fence_seq, 0);
>   	entity->fence_context = dma_fence_context_alloc(2); @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,  static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)  {
>   	rmb();
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>   		return true;
>   
>   	return false;
> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>    */
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)  {
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>   		return false;
>   
>   	if (ACCESS_ONCE(entity->dependency)) @@ -227,7 +228,7 @@ void 
> amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   		 */
>   		kthread_park(sched->thread);
>   		kthread_unpark(sched->thread);
> -		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
> +		while ((job = 
> +to_amd_sched_job(spsc_queue_pop(&entity->job_queue))))
> +{
>   			struct amd_sched_fence *s_fence = job->s_fence;
>   			amd_sched_fence_scheduled(s_fence);
>   			dma_fence_set_error(&s_fence->finished, -ESRCH); @@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   			dma_fence_put(&s_fence->finished);
>   			sched->ops->free_job(job);
>   		}
> -
>   	}
> -	kfifo_free(&entity->job_queue);
>   }
>   
>   static void amd_sched_entity_wakeup(struct dma_fence *f, struct 
> dma_fence_cb *cb) @@ -332,18 +331,21 @@ static bool 
> amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)  }
>   
>   static struct amd_sched_job *
> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   {
>   	struct amd_gpu_scheduler *sched = entity->sched;
> -	struct amd_sched_job *sched_job;
> +	struct amd_sched_job *sched_job = to_amd_sched_job(
> +						spsc_queue_peek(&entity->job_queue));
>   
> -	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
> +	if (!sched_job)
>   		return NULL;
>   
>   	while ((entity->dependency = sched->ops->dependency(sched_job)))
>   		if (amd_sched_entity_add_dependency_cb(entity))
>   			return NULL;
>   
> +	sched_job->s_entity = NULL;
> +	spsc_queue_pop(&entity->job_queue);
>   	return sched_job;
>   }
>   
> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
>    *
>    * Returns true if we could submit the job.
>    */
> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
>   {
>   	struct amd_gpu_scheduler *sched = sched_job->sched;
>   	struct amd_sched_entity *entity = sched_job->s_entity;
> -	bool added, first = false;
> +	bool first = false;
>   
>   	spin_lock(&entity->queue_lock);
> -	added = kfifo_in(&entity->job_queue, &sched_job,
> -			sizeof(sched_job)) == sizeof(sched_job);
> -
> -	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
> -		first = true;
> +	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>   
>   	spin_unlock(&entity->queue_lock);
>   
> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
>   		spin_unlock(&entity->rq_lock);
>   		amd_sched_wakeup(sched);
>   	}
> -	return added;
>   }
>   
>   /* job_finish is called after hw fence signaled @@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>    */
>   void amd_sched_entity_push_job(struct amd_sched_job *sched_job)  {
> -	struct amd_sched_entity *entity = sched_job->s_entity;
> -
>   	trace_amd_sched_job(sched_job);
> -	wait_event(entity->sched->job_scheduled,
> -		   amd_sched_entity_in(sched_job));
> +	amd_sched_entity_in(sched_job);
>   }
>   
>   /* init a sched_job with basic field */ @@ -611,7 +605,7 @@ static int amd_sched_main(void *param)  {
>   	struct sched_param sparam = {.sched_priority = 1};
>   	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
> -	int r, count;
> +	int r;
>   
>   	sched_setscheduler(current, SCHED_FIFO, &sparam);
>   
> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
>   		if (!entity)
>   			continue;
>   
> -		sched_job = amd_sched_entity_peek_job(entity);
> +		sched_job = amd_sched_entity_pop_job(entity);
>   		if (!sched_job)
>   			continue;
>   
> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   
> -		count = kfifo_out(&entity->job_queue, &sched_job,
> -				sizeof(sched_job));
> -		WARN_ON(count != sizeof(sched_job));
>   		wake_up(&sched->job_scheduled);
>   	}
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 3f75b45..8844ce7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -26,6 +26,7 @@
>   
>   #include <linux/kfifo.h>
>   #include <linux/dma-fence.h>
> +#include "spsc_queue.h"
>   
>   struct amd_gpu_scheduler;
>   struct amd_sched_rq;
> @@ -56,7 +57,7 @@ struct amd_sched_entity {
>   	struct amd_gpu_scheduler	*sched;
>   
>   	spinlock_t			queue_lock;
> -	struct kfifo                    job_queue;
> +	struct spsc_queue	job_queue;
>   
>   	atomic_t			fence_seq;
>   	uint64_t                        fence_context;
> @@ -87,6 +88,7 @@ struct amd_sched_fence {  };
>   
>   struct amd_sched_job {
> +	struct spsc_node queue_node;
>   	struct amd_gpu_scheduler        *sched;
>   	struct amd_sched_entity         *s_entity;
>   	struct amd_sched_fence          *s_fence;
> --
> 2.7.4
>


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]                 ` <BLUPR12MB04490DF904B056086D164BDD84460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-10-23  7:30                   ` Christian König
       [not found]                     ` <a9ed61a0-6e0b-6093-5c0b-4459cfbb2282-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Christian König @ 2017-10-23  7:30 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Grodzovsky, Andrey,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> For VM updates the situation is the same as for command submission; the deadlock can also be solved by moving ttm_eu_fence_buffer_objects() ahead of push_job().
That won't work. All VM updates must be completed while the reservation 
object lock is still held, otherwise you run into possible concurrent VM 
updates.

Keep in mind that you can have multiple contexts per VM, so that isn't 
covered by the per-context lock.

> For TTM buffer moves, can we pardon the gpu_reset routine and wait till the TTM moves complete?
No, TTM calls back into our driver to do the move. So we don't have an 
opportunity to drop the reservation lock.

Additionally, kfifo has some other drawbacks. For example, we waste 
around 20k of memory for each context on multimedia ring buffers which 
are mostly unused.

Regards,
Christian.

Am 23.10.2017 um 09:24 schrieb Liu, Monk:
> For VM updates the situation is the same as for command submission; the deadlock can also be solved by moving ttm_eu_fence_buffer_objects() ahead of push_job().
> For TTM buffer moves, can we pardon the gpu_reset routine and wait till the TTM moves complete?
>
> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
> Sent: 2017年10月23日 15:09
> To: Liu, Monk <Monk.Liu@amd.com>; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Koenig, Christian <Christian.Koenig@amd.com>
> Subject: Re: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
>
> We discussed that as well; the problem is that this won't be sufficient.
>
> We push to the kfifo not only during command submission, but also for VM updates and TTM buffer moves.
>
> So we can still deadlock because of them.
>
> Regards,
> Christian.
>
> Am 23.10.2017 um 05:03 schrieb Liu, Monk:
>> Why not use a simpler way?
>>
>> Like moving ttm_eu_fence_buffer_objects() to before amd_sched_entity_push_job()?
>> That could solve the deadlock from your description.
>>
>> And the push order is already guaranteed by context->mutex (which is
>> also a patch from you).
>>
>>
>> BR Monk
>>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>> Of Andrey Grodzovsky
>> Sent: 2017年10月20日 21:32
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Koenig, Christian
>> <Christian.Koenig@amd.com>
>> Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
>>
>> Switch from kfifo to SPSC queue.
>>
>> Bug:
>> Kfifo is limited in size; during GPU reset it would fill up to its limit, and the pushing thread (producer) would wait for the scheduler worker to consume the items in the fifo while holding the reservation lock on a BO. The GPU reset thread, on the other hand, blocks the scheduler during reset. Before it unblocks the scheduler it might want to recover VRAM, and so will try to reserve the same BO the producer thread is already holding, creating a deadlock.
>>
>> Fix:
>> Switch from kfifo to SPSC queue which is unlimited in size.
>>
>> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>> ---
>>    drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
>>    drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
>>    drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
>>    3 files changed, 26 insertions(+), 33 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
>> b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
>> index 8bd3810..86838a8 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
>> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
>>    			   __entry->id = sched_job->id;
>>    			   __entry->fence = &sched_job->s_fence->finished;
>>    			   __entry->name = sched_job->sched->name;
>> -			   __entry->job_count = kfifo_len(
>> -				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
>> +			   __entry->job_count = spsc_queue_count(
>> +				   &sched_job->s_entity->job_queue);
>>    			   __entry->hw_job_count = atomic_read(
>>    				   &sched_job->sched->hw_rq_count);
>>    			   ),
>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> index 1bbbce2..0c9cdc0 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
>> @@ -28,9 +28,14 @@
>>    #include <drm/drmP.h>
>>    #include "gpu_scheduler.h"
>>    
>> +#include "spsc_queue.h"
>> +
>>    #define CREATE_TRACE_POINTS
>>    #include "gpu_sched_trace.h"
>>    
>> +#define to_amd_sched_job(sched_job)		\
>> +		container_of((sched_job), struct amd_sched_job, queue_node)
>> +
>>    static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);  static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);  static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>>    			  struct amd_sched_rq *rq,
>>    			  uint32_t jobs)
>>    {
>> -	int r;
>> -
>>    	if (!(sched && entity && rq))
>>    		return -EINVAL;
>>    
>> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler
>> *sched,
>>    
>>    	spin_lock_init(&entity->rq_lock);
>>    	spin_lock_init(&entity->queue_lock);
>> -	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
>> -	if (r)
>> -		return r;
>> +	spsc_queue_init(&entity->job_queue);
>>    
>>    	atomic_set(&entity->fence_seq, 0);
>>    	entity->fence_context = dma_fence_context_alloc(2); @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,  static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)  {
>>    	rmb();
>> -	if (kfifo_is_empty(&entity->job_queue))
>> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>>    		return true;
>>    
>>    	return false;
>> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>>     */
>>    static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)  {
>> -	if (kfifo_is_empty(&entity->job_queue))
>> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>>    		return false;
>>    
>>    	if (ACCESS_ONCE(entity->dependency)) @@ -227,7 +228,7 @@ void
>> amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>>    		 */
>>    		kthread_park(sched->thread);
>>    		kthread_unpark(sched->thread);
>> -		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
>> +		while ((job =
>> +to_amd_sched_job(spsc_queue_pop(&entity->job_queue))))
>> +{
>>    			struct amd_sched_fence *s_fence = job->s_fence;
>>    			amd_sched_fence_scheduled(s_fence);
>>    			dma_fence_set_error(&s_fence->finished, -ESRCH); @@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>>    			dma_fence_put(&s_fence->finished);
>>    			sched->ops->free_job(job);
>>    		}
>> -
>>    	}
>> -	kfifo_free(&entity->job_queue);
>>    }
>>    
>>    static void amd_sched_entity_wakeup(struct dma_fence *f, struct
>> dma_fence_cb *cb) @@ -332,18 +331,21 @@ static bool
>> amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)  }
>>    
>>    static struct amd_sched_job *
>> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
>> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>>    {
>>    	struct amd_gpu_scheduler *sched = entity->sched;
>> -	struct amd_sched_job *sched_job;
>> +	struct amd_sched_job *sched_job = to_amd_sched_job(
>> +						spsc_queue_peek(&entity->job_queue));
>>    
>> -	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
>> +	if (!sched_job)
>>    		return NULL;
>>    
>>    	while ((entity->dependency = sched->ops->dependency(sched_job)))
>>    		if (amd_sched_entity_add_dependency_cb(entity))
>>    			return NULL;
>>    
>> +	sched_job->s_entity = NULL;
>> +	spsc_queue_pop(&entity->job_queue);
>>    	return sched_job;
>>    }
>>    
>> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
>>     *
>>     * Returns true if we could submit the job.
>>     */
>> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
>> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
>>    {
>>    	struct amd_gpu_scheduler *sched = sched_job->sched;
>>    	struct amd_sched_entity *entity = sched_job->s_entity;
>> -	bool added, first = false;
>> +	bool first = false;
>>    
>>    	spin_lock(&entity->queue_lock);
>> -	added = kfifo_in(&entity->job_queue, &sched_job,
>> -			sizeof(sched_job)) == sizeof(sched_job);
>> -
>> -	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
>> -		first = true;
>> +	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>>    
>>    	spin_unlock(&entity->queue_lock);
>>    
>> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
>>    		spin_unlock(&entity->rq_lock);
>>    		amd_sched_wakeup(sched);
>>    	}
>> -	return added;
>>    }
>>    
>>    /* job_finish is called after hw fence signaled @@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>>     */
>>    void amd_sched_entity_push_job(struct amd_sched_job *sched_job)  {
>> -	struct amd_sched_entity *entity = sched_job->s_entity;
>> -
>>    	trace_amd_sched_job(sched_job);
>> -	wait_event(entity->sched->job_scheduled,
>> -		   amd_sched_entity_in(sched_job));
>> +	amd_sched_entity_in(sched_job);
>>    }
>>    
>>    /* init a sched_job with basic field */
>> @@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
>>    {
>>    	struct sched_param sparam = {.sched_priority = 1};
>>    	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
>> -	int r, count;
>> +	int r;
>>    
>>    	sched_setscheduler(current, SCHED_FIFO, &sparam);
>>    
>> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
>>    		if (!entity)
>>    			continue;
>>    
>> -		sched_job = amd_sched_entity_peek_job(entity);
>> +		sched_job = amd_sched_entity_pop_job(entity);
>>    		if (!sched_job)
>>    			continue;
>>    
>> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
>>    			amd_sched_process_job(NULL, &s_fence->cb);
>>    		}
>>    
>> -		count = kfifo_out(&entity->job_queue, &sched_job,
>> -				sizeof(sched_job));
>> -		WARN_ON(count != sizeof(sched_job));
>>    		wake_up(&sched->job_scheduled);
>>    	}
>>    	return 0;
>> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> index 3f75b45..8844ce7 100644
>> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
>> @@ -26,6 +26,7 @@
>>    
>>    #include <linux/kfifo.h>
>>    #include <linux/dma-fence.h>
>> +#include "spsc_queue.h"
>>    
>>    struct amd_gpu_scheduler;
>>    struct amd_sched_rq;
>> @@ -56,7 +57,7 @@ struct amd_sched_entity {
>>    	struct amd_gpu_scheduler	*sched;
>>    
>>    	spinlock_t			queue_lock;
>> -	struct kfifo                    job_queue;
>> +	struct spsc_queue	job_queue;
>>    
>>    	atomic_t			fence_seq;
>>    	uint64_t                        fence_context;
>> @@ -87,6 +88,7 @@ struct amd_sched_fence {
>>    };
>>    
>>    struct amd_sched_job {
>> +	struct spsc_node queue_node;
>>    	struct amd_gpu_scheduler        *sched;
>>    	struct amd_sched_entity         *s_entity;
>>    	struct amd_sched_fence          *s_fence;
>> --
>> 2.7.4
>>


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]                     ` <a9ed61a0-6e0b-6093-5c0b-4459cfbb2282-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-10-23  7:38                       ` Liu, Monk
       [not found]                         ` <BLUPR12MB0449E8F1F121B277B598430684460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
  0 siblings, 1 reply; 20+ messages in thread
From: Liu, Monk @ 2017-10-23  7:38 UTC (permalink / raw)
  To: Koenig, Christian, Grodzovsky, Andrey,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Let me make this clear:
> That won't work. All VM updates must be completed while the reservation object lock is still held, otherwise you run into possible concurrent VM updates.

All VM updates are represented as fences which are hooked into the sync object, and after we push the job to the scheduler we immediately call ttm_eu_fence_buffer_objects().
That means the reservation object lock is *not* held anymore, right? But keep in mind that at this point the scheduler may not even have started working, so I don't believe
the statement "All VM updates must be completed while the reservation object lock is still held".

> No, TTM callbacks into our driver to do the move. So we don't have an opportunity to drop the reservation lock.

For the TTM callbacks, can you help illustrate how the deadlock is triggered?


BR Monk


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]                         ` <BLUPR12MB0449E8F1F121B277B598430684460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-10-23  8:46                           ` Liu, Monk
  0 siblings, 0 replies; 20+ messages in thread
From: Liu, Monk @ 2017-10-23  8:46 UTC (permalink / raw)
  To: Liu, Monk, Koenig, Christian, Grodzovsky, Andrey,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

I read the source again; it looks like I misunderstood your point, so let me state it clearly:

1. The cs_submit deadlock can be resolved by moving ttm_eu_fence_buffer_objects().
2. In the GEM_VA IOCTL, the reservation lock is held from the very beginning, and vm_update_directories() needs that lock held until all related jobs are submitted to the VM entity. So if the VM entity's queue is full, the deadlock can hit, and we cannot move ttm_eu_backoff_reservation() to before push_job() in job_submit() due to concurrency concerns.
3. For TTM buffer moves, e.g. ttm_bo_moves(), TTM takes the reservation lock on the BO from the swap_lru list and then calls into the amdgpu TTM code, so the deadlock can also hit in job_submit() when the kfifo is full.

You guys are right, the second and third deadlocks are still there (a minimal userspace sketch of the lock cycle follows below).

BR Monk
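
The cycle described above can be reproduced outside the kernel. The following is a minimal, self-contained userspace analogue (all names are invented for the illustration; this is not driver code): a mutex stands in for the BO reservation lock, a bounded counter for the kfifo, and an absent consumer for the scheduler thread parked during reset. Built with "cc -pthread", the program intentionally hangs, which is exactly the deadlock.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define QUEUE_CAP 2	/* bounded, like the kfifo */

static pthread_mutex_t bo_resv = PTHREAD_MUTEX_INITIALIZER; /* "reservation lock" */
static pthread_mutex_t q_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t q_not_full = PTHREAD_COND_INITIALIZER;
static int q_len;

/* The consumer (the scheduler thread) is parked during reset, so nothing
 * ever drains the queue. */
static void push_job(void)
{
	pthread_mutex_lock(&q_lock);
	while (q_len == QUEUE_CAP)	/* producer blocks on a full queue */
		pthread_cond_wait(&q_not_full, &q_lock);
	q_len++;
	pthread_mutex_unlock(&q_lock);
}

static void *producer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&bo_resv);	/* holds the BO across submission */
	for (int i = 0; i <= QUEUE_CAP; i++)
		push_job();		/* the last push blocks forever */
	pthread_mutex_unlock(&bo_resv);
	return NULL;
}

static void *gpu_reset(void *arg)
{
	(void)arg;
	sleep(1);
	pthread_mutex_lock(&bo_resv);	/* wants the same BO to recover VRAM */
	printf("reset done\n");		/* never reached */
	pthread_mutex_unlock(&bo_resv);
	return NULL;
}

int main(void)
{
	pthread_t p, r;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&r, NULL, gpu_reset, NULL);
	pthread_join(p, NULL);	/* hangs: producer waits on the queue,
				 * reset waits on the reservation lock */
	pthread_join(r, NULL);
	return 0;
}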

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.
       [not found]         ` <BLUPR12MB04499B9F742E2CF76139F6BC84460-7LeqcoF/hwpTIQvHjXdJlwdYzm3356FpvxpqHgZTriW3zl9H0oFU5g@public.gmane.org>
@ 2017-10-23 10:50           ` Andrey Grodzovsky
  0 siblings, 0 replies; 20+ messages in thread
From: Andrey Grodzovsky @ 2017-10-23 10:50 UTC (permalink / raw)
  To: Liu, Monk, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Koenig, Christian



On 2017-10-23 12:06 AM, Liu, Monk wrote:
> If the deadlock issue could be solved I don't see why we give up kfifo and switch to SPSC ......

The deadlock is solved because we no longer block waiting for the
consumer to dequeue items from the queue - which can only be
achieved with an unbounded container.

Thanks,
Andrey
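
Concretely, this is the producer-side change from patch 3/3 (reassembled from the diff in this thread): the old path could sleep in wait_event() until the kfifo drained; the new path returns immediately, since spsc_queue_push() cannot fail.

/* Before: bounded kfifo - the producer may sleep until the consumer
 * drains the fifo. This is the thread that can deadlock against the
 * GPU reset path. */
void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
{
	struct amd_sched_entity *entity = sched_job->s_entity;

	trace_amd_sched_job(sched_job);
	wait_event(entity->sched->job_scheduled,
		   amd_sched_entity_in(sched_job));
}

/* After: unbounded SPSC queue - the push always succeeds, so the
 * producer never sleeps while holding a reservation lock. */
void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
{
	trace_amd_sched_job(sched_job);
	amd_sched_entity_in(sched_job);
}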


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled.
       [not found] ` <1508506327-1084-1-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
                     ` (2 preceding siblings ...)
  2017-10-20 15:59   ` [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled Andres Rodriguez
@ 2017-10-23 12:36   ` Christian König
  3 siblings, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-23 12:36 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: christian.koenig-5C7GfCeVMHo

On 20.10.2017 at 15:32, Andrey Grodzovsky wrote:
> Bug: amdgpu_job_free_cb was accessing s_job->s_entity when the allocated
> amdgpu_ctx (and the entity inside it) were already deallocated from
> amdgpu_cs_parser_fini.
>
> Fix: Save job's priority at its creation instead of accessing it from
> s_entity later on.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>

Reviewed-by: Christian König <christian.koenig@amd.com>


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler.
       [not found]     ` <1508506327-1084-2-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2017-10-23  4:06       ` Liu, Monk
@ 2017-10-23 12:38       ` Christian König
  1 sibling, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-23 12:38 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: christian.koenig-5C7GfCeVMHo

On 20.10.2017 at 15:32, Andrey Grodzovsky wrote:
> It is intended to substitute the bounded fifo we are currently
> using.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>

Two style nitpicks below; with those fixed the patch is Reviewed-by:
Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/scheduler/spsc_queue.h | 120 +++++++++++++++++++++++++++++
>   1 file changed, 120 insertions(+)
>   create mode 100644 drivers/gpu/drm/amd/scheduler/spsc_queue.h
>
> diff --git a/drivers/gpu/drm/amd/scheduler/spsc_queue.h b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
> new file mode 100644
> index 0000000..a3394f1
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/scheduler/spsc_queue.h
> @@ -0,0 +1,120 @@
> +/*
> + * Copyright 2017 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef AMD_SCHEDULER_SPSC_QUEUE_H_
> +#define AMD_SCHEDULER_SPSC_QUEUE_H_
> +
> +#include <linux/atomic.h>
> +
> +/** SPSC lockless queue */
> +
> +struct spsc_node {
> +
> +	/* Stores spsc_node* */
> +	struct spsc_node *next;
> +};
> +
> +struct spsc_queue {
> +
> +	 struct spsc_node *head;
> +
> +	/* atomic pointer to struct spsc_node* */
> +	atomic_long_t tail;
> +
> +	atomic_t job_count;
> +};
> +
> +static inline void spsc_queue_init(struct spsc_queue *queue)
> +{
> +	queue->head = NULL;
> +	atomic_long_set(&queue->tail, (long)&queue->head);
> +	atomic_set(&queue->job_count, 0);
> +}
> +
> +static inline struct spsc_node *spsc_queue_peek(struct spsc_queue *queue)
> +{
> +	return queue->head;
> +}
> +
> +static inline int spsc_queue_count(struct spsc_queue *queue) {

The "{" should be on a new line.

> +	return atomic_read(&queue->job_count);
> +}
> +
> +static inline bool spsc_queue_push(struct spsc_queue *queue, struct spsc_node *node)
> +{
> +	struct spsc_node **tail;
> +
> +	node->next = NULL;
> +
> +	preempt_disable();
> +
> +	tail = (struct spsc_node **)atomic_long_xchg(&queue->tail, (long)&node->next);
> +	WRITE_ONCE(*tail, node);
> +	atomic_inc(&queue->job_count);
> +
> +	/*
> +	 * In case of first element verify new node will be visible to the consumer
> +	 * thread when we ping the kernel thread that there is new work to do.
> +	 */
> +	smp_wmb();
> +
> +	preempt_enable();
> +
> +	return tail == &queue->head;
> +}
> +
> +
> +static inline struct spsc_node *spsc_queue_pop(struct spsc_queue *queue)
> +{
> +	struct spsc_node *next, *node;
> +
> +	/* Verify reading from memory and not the cache */
> +	smp_rmb();
> +
> +	node = READ_ONCE(queue->head);
> +
> +	if (!node)
> +		return NULL;
> +
> +	next = READ_ONCE(node->next);
> +	WRITE_ONCE(queue->head, next);
> +
> +	if (unlikely(!next)) {
> +		/* slowpath for the last element in the queue */
> +
> +		if (atomic_long_cmpxchg(&queue->tail,
> +				(long)&node->next,(long) &queue->head) != (long)&node->next) {
> +			/* Updating tail failed wait for new next to appear */
> +			do {
> +				smp_rmb();
> +			}while (unlikely(!(queue->head = READ_ONCE(node->next))));

Missing space between } and while.
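
That is:

	do {
		smp_rmb();
	} while (unlikely(!(queue->head = READ_ONCE(node->next))));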

Regards,
Christian.

> +		}
> +	}
> +
> +	atomic_dec(&queue->job_count);
> +	return node;
> +}
> +
> +
> +
> +#endif /* AMD_SCHEDULER_SPSC_QUEUE_H_ */
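
For reference, a minimal sketch of how a producer/consumer pair would use this queue (the struct and function names here are invented for the example; only the spsc_* API comes from the patch above):

#include <linux/kernel.h>	/* container_of() */
#include "spsc_queue.h"

struct my_job {
	struct spsc_node queue_node;	/* embedded queue linkage */
	int payload;
};

/* Producer side: the push never fails; the return value says whether the
 * queue just became non-empty, i.e. whether the consumer needs a wake up. */
static bool my_push(struct spsc_queue *queue, struct my_job *job)
{
	return spsc_queue_push(queue, &job->queue_node);
}

/* Consumer side: the pop returns NULL on an empty queue. */
static struct my_job *my_pop(struct spsc_queue *queue)
{
	struct spsc_node *node = spsc_queue_pop(queue);

	return node ? container_of(node, struct my_job, queue_node) : NULL;
}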


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
       [not found]     ` <1508506327-1084-3-git-send-email-Andrey.Grodzovsky-5C7GfCeVMHo@public.gmane.org>
  2017-10-23  3:03       ` Liu, Monk
@ 2017-10-23 12:42       ` Christian König
  1 sibling, 0 replies; 20+ messages in thread
From: Christian König @ 2017-10-23 12:42 UTC (permalink / raw)
  To: Andrey Grodzovsky, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW
  Cc: christian.koenig-5C7GfCeVMHo

On 20.10.2017 at 15:32, Andrey Grodzovsky wrote:
> Switch from kfifo to SPSC queue.
>
> Bug:
> Kfifo is limited in size; during GPU reset it would fill up to its limit
> and the pushing thread (producer) would wait for the scheduler worker to
> consume the items in the fifo while holding the reservation lock
> on a BO. The gpu reset thread on the other hand blocks the scheduler
> during reset. Before it unblocks the scheduler it might want
> to recover VRAM and so will try to reserve the same BO the producer
> thread is already holding, creating a deadlock.
>
> Fix:
> Switch from kfifo to SPSC queue which is unlimited in size.
>
> Signed-off-by: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 ++++++++++---------------
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
>   3 files changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> index 8bd3810..86838a8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
>   			   __entry->id = sched_job->id;
>   			   __entry->fence = &sched_job->s_fence->finished;
>   			   __entry->name = sched_job->sched->name;
> -			   __entry->job_count = kfifo_len(
> -				   &sched_job->s_entity->job_queue) / sizeof(sched_job);
> +			   __entry->job_count = spsc_queue_count(
> +				   &sched_job->s_entity->job_queue);
>   			   __entry->hw_job_count = atomic_read(
>   				   &sched_job->sched->hw_rq_count);
>   			   ),
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 1bbbce2..0c9cdc0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -28,9 +28,14 @@
>   #include <drm/drmP.h>
>   #include "gpu_scheduler.h"
>   
> +#include "spsc_queue.h"
> +
>   #define CREATE_TRACE_POINTS
>   #include "gpu_sched_trace.h"
>   
> +#define to_amd_sched_job(sched_job)		\
> +		container_of((sched_job), struct amd_sched_job, queue_node)
> +
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);
>   static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);
>   static void amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
> @@ -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>   			  struct amd_sched_rq *rq,
>   			  uint32_t jobs)
>   {
> -	int r;
> -
>   	if (!(sched && entity && rq))
>   		return -EINVAL;
>   
> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>   
>   	spin_lock_init(&entity->rq_lock);
>   	spin_lock_init(&entity->queue_lock);
> -	r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
> -	if (r)
> -		return r;
> +	spsc_queue_init(&entity->job_queue);
>   
>   	atomic_set(&entity->fence_seq, 0);
>   	entity->fence_context = dma_fence_context_alloc(2);
> @@ -170,7 +171,7 @@ static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,
>   static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>   {
>   	rmb();
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>   		return true;
>   
>   	return false;
> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)
>    */
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
>   {
> -	if (kfifo_is_empty(&entity->job_queue))
> +	if (spsc_queue_peek(&entity->job_queue) == NULL)
>   		return false;
>   
>   	if (ACCESS_ONCE(entity->dependency))
> @@ -227,7 +228,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   		 */
>   		kthread_park(sched->thread);
>   		kthread_unpark(sched->thread);
> -		while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
> +		while ((job = to_amd_sched_job(spsc_queue_pop(&entity->job_queue)))) {
>   			struct amd_sched_fence *s_fence = job->s_fence;
>   			amd_sched_fence_scheduled(s_fence);
>   			dma_fence_set_error(&s_fence->finished, -ESRCH);
> @@ -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>   			dma_fence_put(&s_fence->finished);
>   			sched->ops->free_job(job);
>   		}
> -
>   	}
> -	kfifo_free(&entity->job_queue);
>   }
>   
>   static void amd_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
> @@ -332,18 +331,21 @@ static bool amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)
>   }
>   
>   static struct amd_sched_job *
> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   {
>   	struct amd_gpu_scheduler *sched = entity->sched;
> -	struct amd_sched_job *sched_job;
> +	struct amd_sched_job *sched_job = to_amd_sched_job(
> +						spsc_queue_peek(&entity->job_queue));
>   
> -	if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
> +	if (!sched_job)
>   		return NULL;
>   
>   	while ((entity->dependency = sched->ops->dependency(sched_job)))
>   		if (amd_sched_entity_add_dependency_cb(entity))
>   			return NULL;
>   
> +	sched_job->s_entity = NULL;
> +	spsc_queue_pop(&entity->job_queue);
>   	return sched_job;
>   }
>   
> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity *entity)
>    *
>    * Returns true if we could submit the job.
>    */
> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
>   {
>   	struct amd_gpu_scheduler *sched = sched_job->sched;
>   	struct amd_sched_entity *entity = sched_job->s_entity;
> -	bool added, first = false;
> +	bool first = false;
>   
>   	spin_lock(&entity->queue_lock);
> -	added = kfifo_in(&entity->job_queue, &sched_job,
> -			sizeof(sched_job)) == sizeof(sched_job);
> -
> -	if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
> -		first = true;
> +	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>   
>   	spin_unlock(&entity->queue_lock);
>   
> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
>   		spin_unlock(&entity->rq_lock);
>   		amd_sched_wakeup(sched);
>   	}
> -	return added;
>   }
>   
>   /* job_finish is called after hw fence signaled
> @@ -514,11 +511,8 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>    */
>   void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
>   {
> -	struct amd_sched_entity *entity = sched_job->s_entity;
> -
>   	trace_amd_sched_job(sched_job);
> -	wait_event(entity->sched->job_scheduled,
> -		   amd_sched_entity_in(sched_job));
> +	amd_sched_entity_in(sched_job);

To save a few lines of code, amd_sched_entity_in() can now be completely
merged into this function (a sketch of the merged version follows below).

Apart from that the patch looks really good to me and is Reviewed-by: 
Christian König <christian.koenig@amd.com>.

Ping me internally for new work if you now have time again.

Thanks,
Christian.
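
A minimal sketch of that merge (the body of the if (first) branch is partly outside this diff; amd_sched_rq_add_entity() is assumed from context and not shown in the patch):

void amd_sched_entity_push_job(struct amd_sched_job *sched_job)
{
	struct amd_sched_entity *entity = sched_job->s_entity;
	bool first;

	trace_amd_sched_job(sched_job);

	spin_lock(&entity->queue_lock);
	first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
	spin_unlock(&entity->queue_lock);

	/* first job wakes up the scheduler */
	if (first) {
		spin_lock(&entity->rq_lock);
		amd_sched_rq_add_entity(entity->rq, entity);	/* assumed helper */
		spin_unlock(&entity->rq_lock);
		amd_sched_wakeup(sched_job->sched);
	}
}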

>   }
>   
>   /* init a sched_job with basic field */
> @@ -611,7 +605,7 @@ static int amd_sched_main(void *param)
>   {
>   	struct sched_param sparam = {.sched_priority = 1};
>   	struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
> -	int r, count;
> +	int r;
>   
>   	sched_setscheduler(current, SCHED_FIFO, &sparam);
>   
> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
>   		if (!entity)
>   			continue;
>   
> -		sched_job = amd_sched_entity_peek_job(entity);
> +		sched_job = amd_sched_entity_pop_job(entity);
>   		if (!sched_job)
>   			continue;
>   
> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
>   			amd_sched_process_job(NULL, &s_fence->cb);
>   		}
>   
> -		count = kfifo_out(&entity->job_queue, &sched_job,
> -				sizeof(sched_job));
> -		WARN_ON(count != sizeof(sched_job));
>   		wake_up(&sched->job_scheduled);
>   	}
>   	return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 3f75b45..8844ce7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -26,6 +26,7 @@
>   
>   #include <linux/kfifo.h>
>   #include <linux/dma-fence.h>
> +#include "spsc_queue.h"
>   
>   struct amd_gpu_scheduler;
>   struct amd_sched_rq;
> @@ -56,7 +57,7 @@ struct amd_sched_entity {
>   	struct amd_gpu_scheduler	*sched;
>   
>   	spinlock_t			queue_lock;
> -	struct kfifo                    job_queue;
> +	struct spsc_queue	job_queue;
>   
>   	atomic_t			fence_seq;
>   	uint64_t                        fence_context;
> @@ -87,6 +88,7 @@ struct amd_sched_fence {
>   };
>   
>   struct amd_sched_job {
> +	struct spsc_node queue_node;
>   	struct amd_gpu_scheduler        *sched;
>   	struct amd_sched_entity         *s_entity;
>   	struct amd_sched_fence          *s_fence;


Thread overview: 20+ messages
2017-10-20 13:32 [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled Andrey Grodzovsky
2017-10-20 13:32   ` [PATCH 2/3] drm/amdgpu: Add SPSC queue to scheduler Andrey Grodzovsky
2017-10-23  4:06       ` Liu, Monk
2017-10-23 10:50           ` Andrey Grodzovsky
2017-10-23 12:38       ` Christian König
2017-10-20 13:32   ` [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset Andrey Grodzovsky
2017-10-23  3:03       ` Liu, Monk
2017-10-23  7:08           ` Christian König
2017-10-23  7:24               ` Liu, Monk
2017-10-23  7:30                   ` Christian König
2017-10-23  7:38                       ` Liu, Monk
2017-10-23  8:46                           ` Liu, Monk
2017-10-23 12:42       ` Christian König
2017-10-20 15:59   ` [PATCH 1/3] drm/amdgpu: Avoid accessing job->entity after the job is scheduled Andres Rodriguez
2017-10-20 16:19       ` Andrey Grodzovsky
2017-10-20 16:26           ` Andres Rodriguez
2017-10-20 16:51               ` Andrey Grodzovsky
2017-10-20 18:24                   ` Christian König
2017-10-20 19:31                       ` Andres Rodriguez
2017-10-23 12:36   ` Christian König
