All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-07 10:08 ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-07 10:08 UTC (permalink / raw)
  To: LKML
  Cc: DRI Development, Intel Graphics Development, Daniel Vetter,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Peter Zijlstra, Tejun Heo, Kees Cook, Thomas Gleixner,
	Shaohua Li, Andrew Morton, Jens Axboe, Greg Kroah-Hartman,
	Jonathan Corbet, Oleg Nesterov, Daniel Vetter

Since -rc1 we're hitting a bunch of lockdep splats using the new
cross-release stuff around the 2 kthread completions. In all cases
they are because totally independent uses of kthread are mixed up by
lockdep into the same locking class, creating artificial deadlocks.

Fix this by converting kthread code in the same way as e.g.
alloc_workqueue already works: Use macros for the public api so we can
have a callsite specific lockdep key, then pass that through the
entire callchain. Due to the many entry points this is slightly
tedious.

Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Marta Lofstedt <marta.lofstedt@intel.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Oleg Nesterov <oleg@redhat.com>
References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 include/linux/kthread.h | 48 +++++++++++++++++++++++++++++-----
 kernel/kthread.c        | 68 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c1961761311d..7a9463f0be5c 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -6,10 +6,12 @@
 #include <linux/sched.h>
 #include <linux/cgroup.h>
 
-__printf(4, 5)
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+__printf(6, 7)
+struct task_struct *_kthread_create_on_node(int (*threadfn)(void *data),
 					   void *data,
 					   int node,
+					   struct lock_class_key *exited_key,
+					   struct lock_class_key *parked_key,
 					   const char namefmt[], ...);
 
 /**
@@ -25,12 +27,27 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
  */
 #define kthread_create(threadfn, data, namefmt, arg...) \
 	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
+#define kthread_create_on_node(threadfn, data, node, namefmt, arg...)	\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_on_node(threadfn, data, node, &__exited_key,	\
+			       &__parked_key, namefmt, ##arg);		\
+})
 
 
-struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+struct task_struct *_kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data,
 					  unsigned int cpu,
+					  struct lock_class_key *exited_key,
+					  struct lock_class_key *parked_key,
 					  const char *namefmt);
+#define kthread_create_on_cpu(threadfn, data, cpu, namefmt)		\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_on_cpu(threadfn, data, cpu, &__exited_key,\
+			       &__parked_key, namefmt);			\
+})
+
 
 /**
  * kthread_run - create and wake a thread.
@@ -171,13 +188,30 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
 
 int kthread_worker_fn(void *worker_ptr);
 
-__printf(2, 3)
+__printf(4, 5)
 struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...);
+_kthread_create_worker(unsigned int flags,
+		       struct lock_class_key *exited_key,
+		       struct lock_class_key *parked_key,
+		       const char namefmt[], ...);
+#define kthread_create_worker(flags, namefmt...)				\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_worker(flags, &__exited_key, &__parked_key,	\
+			       ##namefmt);				\
+})
 
-__printf(3, 4) struct kthread_worker *
-kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+__printf(5, 6) struct kthread_worker *
+_kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+		       struct lock_class_key *exited_key,
+		       struct lock_class_key *parked_key,
 			     const char namefmt[], ...);
+#define kthread_create_worker_on_cpu(cpu, flags, namefmt...)	\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_worker_on_cpu(cpu, flags, &__exited_key, &__parked_key,\
+			       ##namefmt);				\
+})
 
 bool kthread_queue_work(struct kthread_worker *worker,
 			struct kthread_work *work);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index cd50e99202b0..5b73e3da0d2e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -32,6 +32,7 @@ struct kthread_create_info
 	int (*threadfn)(void *data);
 	void *data;
 	int node;
+	struct lock_class_key *exited_key, *parked_key;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -221,8 +222,15 @@ static int kthread(void *_create)
 	}
 
 	self->data = data;
-	init_completion(&self->exited);
-	init_completion(&self->parked);
+	lockdep_init_map_crosslock(&self->exited.map.map,
+			"(kthread completion)->exited",
+			create->exited_key, 0);
+	init_completion_map(&self->exited, &self->exited.map.map);
+	lockdep_init_map_crosslock(&self->parked.map.map,
+			"(kthread completion)->parked",
+			create->parked_key, 0);
 +	init_completion_map(&self->parked, &self->parked.map.map);
+
 	current->vfork_done = &self->exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -272,9 +280,11 @@ static void create_kthread(struct kthread_create_info *create)
 	}
 }
 
-static __printf(4, 0)
+static __printf(6, 0)
 struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 						    void *data, int node,
+						    struct lock_class_key *exited_key,
+						    struct lock_class_key *parked_key,
 						    const char namefmt[],
 						    va_list args)
 {
@@ -289,6 +299,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->data = data;
 	create->node = node;
 	create->done = &done;
+	create->exited_key = exited_key;
+	create->parked_key = parked_key;
 
 	spin_lock(&kthread_create_lock);
 	list_add_tail(&create->list, &kthread_create_list);
@@ -353,21 +365,24 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
  */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data, int node,
-					   const char namefmt[],
-					   ...)
+struct task_struct *_kthread_create_on_node(int (*threadfn)(void *data),
+					    void *data, int node,
+					    struct lock_class_key *exited_key,
+					    struct lock_class_key *parked_key,
+					    const char namefmt[],
+					    ...)
 {
 	struct task_struct *task;
 	va_list args;
 
 	va_start(args, namefmt);
-	task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
+	task = __kthread_create_on_node(threadfn, data, node,
+					exited_key, parked_key, namefmt, args);
 	va_end(args);
 
 	return task;
 }
-EXPORT_SYMBOL(kthread_create_on_node);
+EXPORT_SYMBOL(_kthread_create_on_node);
 
 static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
@@ -421,14 +436,16 @@ EXPORT_SYMBOL(kthread_bind);
  * Description: This helper function creates and names a kernel thread
  * The thread will be woken and put into park mode.
  */
-struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+struct task_struct *_kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data, unsigned int cpu,
+					  struct lock_class_key *exited_key,
+					  struct lock_class_key *parked_key,
 					  const char *namefmt)
 {
 	struct task_struct *p;
 
-	p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
-				   cpu);
+	p = _kthread_create_on_node(threadfn, data, cpu_to_node(cpu),
+				    exited_key, parked_key, namefmt, cpu);
 	if (IS_ERR(p))
 		return p;
 	kthread_bind(p, cpu);
@@ -649,8 +666,10 @@ int kthread_worker_fn(void *worker_ptr)
 }
 EXPORT_SYMBOL_GPL(kthread_worker_fn);
 
-static __printf(3, 0) struct kthread_worker *
+static __printf(5, 0) struct kthread_worker *
 __kthread_create_worker(int cpu, unsigned int flags,
+			struct lock_class_key *exited_key,
+			struct lock_class_key *parked_key,
 			const char namefmt[], va_list args)
 {
 	struct kthread_worker *worker;
@@ -666,8 +685,8 @@ __kthread_create_worker(int cpu, unsigned int flags,
 	if (cpu >= 0)
 		node = cpu_to_node(cpu);
 
-	task = __kthread_create_on_node(kthread_worker_fn, worker,
-						node, namefmt, args);
+	task = __kthread_create_on_node(kthread_worker_fn, worker, node,
+					exited_key, parked_key, namefmt, args);
 	if (IS_ERR(task))
 		goto fail_task;
 
@@ -694,18 +713,22 @@ __kthread_create_worker(int cpu, unsigned int flags,
  * when the worker was SIGKILLed.
  */
 struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...)
+_kthread_create_worker(unsigned int flags,
+		       struct lock_class_key *exited_key,
+		       struct lock_class_key *parked_key,
+		       const char namefmt[], ...)
 {
 	struct kthread_worker *worker;
 	va_list args;
 
 	va_start(args, namefmt);
-	worker = __kthread_create_worker(-1, flags, namefmt, args);
+	worker = __kthread_create_worker(-1, flags, exited_key, parked_key,
+					 namefmt, args);
 	va_end(args);
 
 	return worker;
 }
-EXPORT_SYMBOL(kthread_create_worker);
+EXPORT_SYMBOL(_kthread_create_worker);
 
 /**
  * kthread_create_worker_on_cpu - create a kthread worker and bind it
@@ -725,19 +748,22 @@ EXPORT_SYMBOL(kthread_create_worker);
  * when the worker was SIGKILLed.
  */
 struct kthread_worker *
-kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+_kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+			      struct lock_class_key *exited_key,
+			      struct lock_class_key *parked_key,
 			     const char namefmt[], ...)
 {
 	struct kthread_worker *worker;
 	va_list args;
 
 	va_start(args, namefmt);
-	worker = __kthread_create_worker(cpu, flags, namefmt, args);
+	worker = __kthread_create_worker(cpu, flags, exited_key, parked_key,
+					 namefmt, args);
 	va_end(args);
 
 	return worker;
 }
-EXPORT_SYMBOL(kthread_create_worker_on_cpu);
+EXPORT_SYMBOL(_kthread_create_worker_on_cpu);
 
 /*
  * Returns true when the work could not be queued at the moment.
-- 
2.15.0

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-07 10:08 ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-07 10:08 UTC (permalink / raw)
  To: LKML
  Cc: Jens Axboe, Kees Cook, Tvrtko Ursulin, Peter Zijlstra,
	Daniel Vetter, Intel Graphics Development, Jonathan Corbet,
	Andrew Morton, DRI Development, Oleg Nesterov, Byungchul Park,
	Shaohua Li, Greg Kroah-Hartman, Tejun Heo, Daniel Vetter,
	Thomas Gleixner, Ingo Molnar

Since -rc1 we're hitting a bunch of lockdep splats using the new
cross-release stuff around the 2 kthread completions. In all cases
they are because totally independent uses of kthread are mixed up by
lockdep into the same locking class, creating artificial deadlocks.

Fix this by converting kthread code in the same way as e.g.
alloc_workqueue already works: Use macros for the public api so we can
have a callsite specific lockdep key, then pass that through the
entire callchain. Due to the many entry points this is slightly
tedious.

Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Marta Lofstedt <marta.lofstedt@intel.com>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Shaohua Li <shli@fb.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Oleg Nesterov <oleg@redhat.com>
References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 include/linux/kthread.h | 48 +++++++++++++++++++++++++++++-----
 kernel/kthread.c        | 68 ++++++++++++++++++++++++++++++++++---------------
 2 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c1961761311d..7a9463f0be5c 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -6,10 +6,12 @@
 #include <linux/sched.h>
 #include <linux/cgroup.h>
 
-__printf(4, 5)
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+__printf(6, 7)
+struct task_struct *_kthread_create_on_node(int (*threadfn)(void *data),
 					   void *data,
 					   int node,
+					   struct lock_class_key *exited_key,
+					   struct lock_class_key *parked_key,
 					   const char namefmt[], ...);
 
 /**
@@ -25,12 +27,27 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
  */
 #define kthread_create(threadfn, data, namefmt, arg...) \
 	kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
+#define kthread_create_on_node(threadfn, data, node, namefmt, arg...)	\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_on_node(threadfn, data, node, &__exited_key,	\
+			       &__parked_key, namefmt, ##arg);		\
+})
 
 
-struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+struct task_struct *_kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data,
 					  unsigned int cpu,
+					  struct lock_class_key *exited_key,
+					  struct lock_class_key *parked_key,
 					  const char *namefmt);
+#define kthread_create_on_cpu(threadfn, data, cpu, namefmt)		\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_on_cpu(threadfn, data, cpu, &__exited_key,\
+			       &__parked_key, namefmt);			\
+})
+
 
 /**
  * kthread_run - create and wake a thread.
@@ -171,13 +188,30 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
 
 int kthread_worker_fn(void *worker_ptr);
 
-__printf(2, 3)
+__printf(4, 5)
 struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...);
+_kthread_create_worker(unsigned int flags,
+		       struct lock_class_key *exited_key,
+		       struct lock_class_key *parked_key,
+		       const char namefmt[], ...);
+#define kthread_create_worker(flags, namefmt...)				\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_worker(flags, &__exited_key, &__parked_key,	\
+			       ##namefmt);				\
+})
 
-__printf(3, 4) struct kthread_worker *
-kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+__printf(5, 6) struct kthread_worker *
+_kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+		       struct lock_class_key *exited_key,
+		       struct lock_class_key *parked_key,
 			     const char namefmt[], ...);
+#define kthread_create_worker_on_cpu(cpu, flags, namefmt...)	\
+({									\
+	static struct lock_class_key __exited_key, __parked_key;	\
+	_kthread_create_worker_on_cpu(cpu, flags, &__exited_key, &__parked_key,\
+			       ##namefmt);				\
+})
 
 bool kthread_queue_work(struct kthread_worker *worker,
 			struct kthread_work *work);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index cd50e99202b0..5b73e3da0d2e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -32,6 +32,7 @@ struct kthread_create_info
 	int (*threadfn)(void *data);
 	void *data;
 	int node;
+	struct lock_class_key *exited_key, *parked_key;
 
 	/* Result passed back to kthread_create() from kthreadd. */
 	struct task_struct *result;
@@ -221,8 +222,15 @@ static int kthread(void *_create)
 	}
 
 	self->data = data;
-	init_completion(&self->exited);
-	init_completion(&self->parked);
+	lockdep_init_map_crosslock(&self->exited.map.map,
+			"(kthread completion)->exited",
+			create->exited_key, 0);
+	init_completion_map(&self->exited, &self->exited.map.map);
+	lockdep_init_map_crosslock(&self->parked.map.map,
+			"(kthread completion)->parked",
+			create->parked_key, 0);
 +	init_completion_map(&self->parked, &self->parked.map.map);
+
 	current->vfork_done = &self->exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -272,9 +280,11 @@ static void create_kthread(struct kthread_create_info *create)
 	}
 }
 
-static __printf(4, 0)
+static __printf(6, 0)
 struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 						    void *data, int node,
+						    struct lock_class_key *exited_key,
+						    struct lock_class_key *parked_key,
 						    const char namefmt[],
 						    va_list args)
 {
@@ -289,6 +299,8 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
 	create->data = data;
 	create->node = node;
 	create->done = &done;
+	create->exited_key = exited_key;
+	create->parked_key = parked_key;
 
 	spin_lock(&kthread_create_lock);
 	list_add_tail(&create->list, &kthread_create_list);
@@ -353,21 +365,24 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
  */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data, int node,
-					   const char namefmt[],
-					   ...)
+struct task_struct *_kthread_create_on_node(int (*threadfn)(void *data),
+					    void *data, int node,
+					    struct lock_class_key *exited_key,
+					    struct lock_class_key *parked_key,
+					    const char namefmt[],
+					    ...)
 {
 	struct task_struct *task;
 	va_list args;
 
 	va_start(args, namefmt);
-	task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
+	task = __kthread_create_on_node(threadfn, data, node,
+					exited_key, parked_key, namefmt, args);
 	va_end(args);
 
 	return task;
 }
-EXPORT_SYMBOL(kthread_create_on_node);
+EXPORT_SYMBOL(_kthread_create_on_node);
 
 static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
@@ -421,14 +436,16 @@ EXPORT_SYMBOL(kthread_bind);
  * Description: This helper function creates and names a kernel thread
  * The thread will be woken and put into park mode.
  */
-struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+struct task_struct *_kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data, unsigned int cpu,
+					  struct lock_class_key *exited_key,
+					  struct lock_class_key *parked_key,
 					  const char *namefmt)
 {
 	struct task_struct *p;
 
-	p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
-				   cpu);
+	p = _kthread_create_on_node(threadfn, data, cpu_to_node(cpu),
+				    exited_key, parked_key, namefmt, cpu);
 	if (IS_ERR(p))
 		return p;
 	kthread_bind(p, cpu);
@@ -649,8 +666,10 @@ int kthread_worker_fn(void *worker_ptr)
 }
 EXPORT_SYMBOL_GPL(kthread_worker_fn);
 
-static __printf(3, 0) struct kthread_worker *
+static __printf(5, 0) struct kthread_worker *
 __kthread_create_worker(int cpu, unsigned int flags,
+			struct lock_class_key *exited_key,
+			struct lock_class_key *parked_key,
 			const char namefmt[], va_list args)
 {
 	struct kthread_worker *worker;
@@ -666,8 +685,8 @@ __kthread_create_worker(int cpu, unsigned int flags,
 	if (cpu >= 0)
 		node = cpu_to_node(cpu);
 
-	task = __kthread_create_on_node(kthread_worker_fn, worker,
-						node, namefmt, args);
+	task = __kthread_create_on_node(kthread_worker_fn, worker, node,
+					exited_key, parked_key, namefmt, args);
 	if (IS_ERR(task))
 		goto fail_task;
 
@@ -694,18 +713,22 @@ __kthread_create_worker(int cpu, unsigned int flags,
  * when the worker was SIGKILLed.
  */
 struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...)
+_kthread_create_worker(unsigned int flags,
+		       struct lock_class_key *exited_key,
+		       struct lock_class_key *parked_key,
+		       const char namefmt[], ...)
 {
 	struct kthread_worker *worker;
 	va_list args;
 
 	va_start(args, namefmt);
-	worker = __kthread_create_worker(-1, flags, namefmt, args);
+	worker = __kthread_create_worker(-1, flags, exited_key, parked_key,
+					 namefmt, args);
 	va_end(args);
 
 	return worker;
 }
-EXPORT_SYMBOL(kthread_create_worker);
+EXPORT_SYMBOL(_kthread_create_worker);
 
 /**
  * kthread_create_worker_on_cpu - create a kthread worker and bind it
@@ -725,19 +748,22 @@ EXPORT_SYMBOL(kthread_create_worker);
  * when the worker was SIGKILLed.
  */
 struct kthread_worker *
-kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+_kthread_create_worker_on_cpu(int cpu, unsigned int flags,
+			      struct lock_class_key *exited_key,
+			      struct lock_class_key *parked_key,
 			     const char namefmt[], ...)
 {
 	struct kthread_worker *worker;
 	va_list args;
 
 	va_start(args, namefmt);
-	worker = __kthread_create_worker(cpu, flags, namefmt, args);
+	worker = __kthread_create_worker(cpu, flags, exited_key, parked_key,
+					 namefmt, args);
 	va_end(args);
 
 	return worker;
 }
-EXPORT_SYMBOL(kthread_create_worker_on_cpu);
+EXPORT_SYMBOL(_kthread_create_worker_on_cpu);
 
 /*
  * Returns true when the work could not be queued at the moment.
-- 
2.15.0

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-07 10:08 ` Daniel Vetter
@ 2017-12-07 12:22   ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-07 12:22 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> Since -rc1 we're hitting a bunch of lockdep splats using the new
> cross-release stuff around the 2 kthread completions. In all cases
> they are because totally independent uses of kthread are mixed up by
> lockdep into the same locking class, creating artificial deadlocks.
> 
> Fix this by converting kthread code in the same way as e.g.
> alloc_workqueue already works: Use macros for the public api so we can
> have a callsite specific lockdep key, then pass that through the
> entire callchain. Due to the many entry points this is slightly
> tedious.

Do you have a splat somewhere? I'm having trouble seeing how all this
comes together. That is, I've no real idea what your actual problem is
and if this is the right solution.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-07 12:22   ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-07 12:22 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Jens Axboe, Kees Cook, Tvrtko Ursulin, Greg Kroah-Hartman,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Tejun Heo, Daniel Vetter, Thomas Gleixner,
	Ingo Molnar

On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> Since -rc1 we're hitting a bunch of lockdep splats using the new
> cross-release stuff around the 2 kthread completions. In all cases
> they are because totally independent uses of kthread are mixed up by
> lockdep into the same locking class, creating artificial deadlocks.
> 
> Fix this by converting kthread code in the same way as e.g.
> alloc_workqueue already works: Use macros for the public api so we can
> have a callsite specific lockdep key, then pass that through the
> entire callchain. Due to the many entry points this is slightly
> tedious.

Do you have a splat somewhere? I'm having trouble seeing how all this
comes together. That is, I've no real idea what your actual problem is
and if this is the right solution.

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* ✗ Fi.CI.BAT: failure for kthread: finer-grained lockdep/cross-release completion
  2017-12-07 10:08 ` Daniel Vetter
  (?)
  (?)
@ 2017-12-07 12:56 ` Patchwork
  -1 siblings, 0 replies; 36+ messages in thread
From: Patchwork @ 2017-12-07 12:56 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

== Series Details ==

Series: kthread: finer-grained lockdep/cross-release completion
URL   : https://patchwork.freedesktop.org/series/35022/
State : failure

== Summary ==

Applying: kthread: finer-grained lockdep/cross-release completion
error: Failed to merge in the changes.
Using index info to reconstruct a base tree...
M	include/linux/kthread.h
M	kernel/kthread.c
Falling back to patching base and 3-way merge...
Auto-merging kernel/kthread.c
CONFLICT (content): Merge conflict in kernel/kthread.c
Patch failed at 0001 kthread: finer-grained lockdep/cross-release completion
The copy of the patch that failed is found in: .git/rebase-apply/patch
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-07 12:22   ` Peter Zijlstra
@ 2017-12-07 14:58     ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-07 14:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Daniel Vetter, LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 01:22:56PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> > Since -rc1 we're hitting a bunch of lockdep splats using the new
> > cross-release stuff around the 2 kthread completions. In all cases
> > they are because totally independent uses of kthread are mixed up by
> > lockdep into the same locking class, creating artificial deadlocks.
> > 
> > Fix this by converting kthread code in the same way as e.g.
> > alloc_workqueue already works: Use macros for the public api so we can
> > have a callsite specific lockdep key, then pass that through the
> > entire callchain. Due to the many entry points this is slightly
> > tedious.
> 
> Do you have a splat somewhere? I'm having trouble seeing how all this
> comes together. That is, I've no real idea what your actual problem is
> and if this is the right solution.

The bugzilla link is full of them. One below as example - it ties entirely
unrelated locking chains from i915 memory management together with other
stuff, with the only common bit that the kthread completions are somewhere
in there. But neither can i915 code ever get at the cpu kthread nor the
other way round, so it's flat out impossible to ever hit a deadlock this
way. Same reasons why not all workqueues share their lockdep map either.
-Daniel

[   85.062523] Setting dangerous option reset - tainting kernel
[   85.068934] i915 0000:00:02.0: Resetting chip after gpu hang

[   85.069408] ======================================================
[   85.069410] WARNING: possible circular locking dependency detected
[   85.069413] 4.15.0-rc1-CI-CI_DRM_3397+ #1 Tainted: G     U          
[   85.069415] ------------------------------------------------------
[   85.069417] gem_exec_captur/2810 is trying to acquire lock:
[   85.069419]  ((completion)&self->parked){+.+.}, at: [<ffffffff8109d69d>] kthread_park+0x3d/0x50
[   85.069426] 
               but task is already holding lock:
[   85.069428]  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
[   85.069448] 
               which lock already depends on the new lock.

[   85.069451] 
               the existing dependency chain (in reverse order) is:
[   85.069454] 
               -> #3 (&dev->struct_mutex){+.+.}:
[   85.069460]        __mutex_lock+0x81/0x9b0
[   85.069481]        i915_mutex_lock_interruptible+0x47/0x130 [i915]
[   85.069502]        i915_gem_fault+0x201/0x760 [i915]
[   85.069507]        __do_fault+0x15/0x70
[   85.069509]        __handle_mm_fault+0x7bf/0xda0
[   85.069512]        handle_mm_fault+0x14f/0x2f0
[   85.069515]        __do_page_fault+0x2d1/0x560
[   85.069518]        page_fault+0x22/0x30
[   85.069520] 
               -> #2 (&mm->mmap_sem){++++}:
[   85.069525]        __might_fault+0x63/0x90
[   85.069529]        _copy_to_user+0x1e/0x70
[   85.069532]        perf_read+0x21d/0x290
[   85.069534]        __vfs_read+0x1e/0x120
[   85.069536]        vfs_read+0xa1/0x150
[   85.069539]        SyS_read+0x40/0xa0
[   85.069541]        entry_SYSCALL_64_fastpath+0x1c/0x89
[   85.069543] 
               -> #1 (&cpuctx_mutex){+.+.}:
[   85.069547]        perf_event_ctx_lock_nested+0xbc/0x1d0
[   85.069549] 
               -> #0 ((completion)&self->parked){+.+.}:
[   85.069555]        lock_acquire+0xaf/0x200
[   85.069558]        wait_for_common+0x54/0x210
[   85.069560]        kthread_park+0x3d/0x50
[   85.069579]        i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
[   85.069600]        i915_gem_reset_prepare+0x2c/0x60 [i915]
[   85.069617]        i915_reset+0x66/0x230 [i915]
[   85.069635]        i915_reset_device+0x1cb/0x230 [i915]
[   85.069651]        i915_handle_error+0x2d3/0x430 [i915]
[   85.069670]        i915_wedged_set+0x79/0xc0 [i915]
[   85.069673]        simple_attr_write+0xab/0xc0
[   85.069677]        full_proxy_write+0x4b/0x70
[   85.069679]        __vfs_write+0x1e/0x130
[   85.069682]        vfs_write+0xc0/0x1b0
[   85.069684]        SyS_write+0x40/0xa0
[   85.069686]        entry_SYSCALL_64_fastpath+0x1c/0x89
[   85.069688] 
               other info that might help us debug this:

[   85.069692] Chain exists of:
                 (completion)&self->parked --> &mm->mmap_sem --> &dev->struct_mutex

[   85.069698]  Possible unsafe locking scenario:

[   85.069701]        CPU0                    CPU1
[   85.069703]        ----                    ----
[   85.069705]   lock(&dev->struct_mutex);
[   85.069707]                                lock(&mm->mmap_sem);
[   85.069710]                                lock(&dev->struct_mutex);
[   85.069713]   lock((completion)&self->parked);
[   85.069715] 
                *** DEADLOCK ***

[   85.069718] 3 locks held by gem_exec_captur/2810:
[   85.069722]  #0:  (sb_writers#10){.+.+}, at: [<ffffffff811ff05e>] vfs_write+0x15e/0x1b0
[   85.069727]  #1:  (&attr->mutex){+.+.}, at: [<ffffffff8122a866>] simple_attr_write+0x36/0xc0
[   85.069732]  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
[   85.069751] 
               stack backtrace:
[   85.069754] CPU: 2 PID: 2810 Comm: gem_exec_captur Tainted: G     U           4.15.0-rc1-CI-CI_DRM_3397+ #1
[   85.069758] Hardware name: MSI MS-7924/Z97M-G43(MS-7924), BIOS V1.12 02/15/2016
[   85.069760] Call Trace:
[   85.069765]  dump_stack+0x5f/0x86
[   85.069769]  print_circular_bug+0x230/0x3b0
[   85.069772]  check_prev_add+0x439/0x7b0
[   85.069775]  ? lockdep_init_map_crosslock+0x20/0x20
[   85.069778]  ? _raw_spin_unlock_irq+0x24/0x50
[   85.069782]  ? __lock_acquire+0x1385/0x15a0
[   85.069784]  __lock_acquire+0x1385/0x15a0
[   85.069788]  lock_acquire+0xaf/0x200
[   85.069790]  ? kthread_park+0x3d/0x50
[   85.069793]  wait_for_common+0x54/0x210
[   85.069797]  ? kthread_park+0x3d/0x50
[   85.069799]  ? _raw_spin_unlock_irqrestore+0x55/0x60
[   85.069802]  ? try_to_wake_up+0x2a2/0x600
[   85.069804]  ? _raw_spin_unlock_irqrestore+0x4c/0x60
[   85.069807]  kthread_park+0x3d/0x50
[   85.069827]  i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
[   85.069849]  i915_gem_reset_prepare+0x2c/0x60 [i915]
[   85.069865]  i915_reset+0x66/0x230 [i915]
[   85.069881]  i915_reset_device+0x1cb/0x230 [i915]
[   85.069898]  ? gen8_gt_irq_ack+0x160/0x160 [i915]
[   85.069902]  ? work_on_cpu_safe+0x50/0x50
[   85.069919]  i915_handle_error+0x2d3/0x430 [i915]
[   85.069923]  ? __mutex_lock+0x81/0x9b0
[   85.069926]  ? __mutex_lock+0x432/0x9b0
[   85.069929]  ? __might_fault+0x39/0x90
[   85.069933]  ? __might_fault+0x39/0x90
[   85.069951]  i915_wedged_set+0x79/0xc0 [i915]
[   85.069955]  simple_attr_write+0xab/0xc0
[   85.069959]  full_proxy_write+0x4b/0x70
[   85.069961]  __vfs_write+0x1e/0x130
[   85.069965]  ? rcu_read_lock_sched_held+0x6f/0x80
[   85.069968]  ? rcu_sync_lockdep_assert+0x25/0x50
[   85.069971]  ? __sb_start_write+0xd5/0x1f0
[   85.069973]  ? __sb_start_write+0xef/0x1f0
[   85.069976]  vfs_write+0xc0/0x1b0
[   85.069979]  SyS_write+0x40/0xa0
[   85.069982]  entry_SYSCALL_64_fastpath+0x1c/0x89
[   85.069984] RIP: 0033:0x7f22dd739670
[   85.069986] RSP: 002b:00007ffe3f6bc7d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   85.069990] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f22dd739670
[   85.069994] RDX: 0000000000000002 RSI: 000055f260b5c376 RDI: 0000000000000007
[   85.069996] RBP: 0000000000000001 R08: 000055f2614ebed0 R09: 0000000000000000
[   85.069999] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f22df019000
[   85.070002] R13: 0000000000000007 R14: 00007f22df018000 R15: 0000000000000003
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-07 14:58     ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-07 14:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jens Axboe, Kees Cook, Daniel Vetter, Intel Graphics Development,
	Jonathan Corbet, LKML, DRI Development, Oleg Nesterov,
	Byungchul Park, Andrew Morton, Shaohua Li, Greg Kroah-Hartman,
	Tejun Heo, Daniel Vetter, Thomas Gleixner, Ingo Molnar

On Thu, Dec 07, 2017 at 01:22:56PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> > Since -rc1 we're hitting a bunch of lockdep splats using the new
> > cross-release stuff around the 2 kthread completions. In all cases
> > they are because totally independent uses of kthread are mixed up by
> > lockdep into the same locking class, creating artificial deadlocks.
> > 
> > Fix this by converting kthread code in the same way as e.g.
> > alloc_workqueue already works: Use macros for the public api so we can
> > have a callsite specific lockdep key, then pass that through the
> > entire callchain. Due to the many entry points this is slightly
> > tedious.
> 
> Do you have a splat somewhere? I'm having trouble seeing how all this
> comes together. That is, I've no real idea what you're actual problem is
> and if this is the right solution.

The bugzilla link is full of them. One below as example - it ties entirely
unrelated locking chains from i915 memory management together with other
stuff, with the only common bit that the kthread completions are somewhere
in there. But neither can i915 code ever get at the cpu kthread nor the
other way round, so it's flat out impossible to ever hit a deadlock this
way. Same reasons why not all workqueues share their lockdep map either.
-Daniel

[   85.062523] Setting dangerous option reset - tainting kernel
[   85.068934] i915 0000:00:02.0: Resetting chip after gpu hang

[   85.069408] ======================================================
[   85.069410] WARNING: possible circular locking dependency detected
[   85.069413] 4.15.0-rc1-CI-CI_DRM_3397+ #1 Tainted: G     U          
[   85.069415] ------------------------------------------------------
[   85.069417] gem_exec_captur/2810 is trying to acquire lock:
[   85.069419]  ((completion)&self->parked){+.+.}, at: [<ffffffff8109d69d>] kthread_park+0x3d/0x50
[   85.069426] 
               but task is already holding lock:
[   85.069428]  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
[   85.069448] 
               which lock already depends on the new lock.

[   85.069451] 
               the existing dependency chain (in reverse order) is:
[   85.069454] 
               -> #3 (&dev->struct_mutex){+.+.}:
[   85.069460]        __mutex_lock+0x81/0x9b0
[   85.069481]        i915_mutex_lock_interruptible+0x47/0x130 [i915]
[   85.069502]        i915_gem_fault+0x201/0x760 [i915]
[   85.069507]        __do_fault+0x15/0x70
[   85.069509]        __handle_mm_fault+0x7bf/0xda0
[   85.069512]        handle_mm_fault+0x14f/0x2f0
[   85.069515]        __do_page_fault+0x2d1/0x560
[   85.069518]        page_fault+0x22/0x30
[   85.069520] 
               -> #2 (&mm->mmap_sem){++++}:
[   85.069525]        __might_fault+0x63/0x90
[   85.069529]        _copy_to_user+0x1e/0x70
[   85.069532]        perf_read+0x21d/0x290
[   85.069534]        __vfs_read+0x1e/0x120
[   85.069536]        vfs_read+0xa1/0x150
[   85.069539]        SyS_read+0x40/0xa0
[   85.069541]        entry_SYSCALL_64_fastpath+0x1c/0x89
[   85.069543] 
               -> #1 (&cpuctx_mutex){+.+.}:
[   85.069547]        perf_event_ctx_lock_nested+0xbc/0x1d0
[   85.069549] 
               -> #0 ((completion)&self->parked){+.+.}:
[   85.069555]        lock_acquire+0xaf/0x200
[   85.069558]        wait_for_common+0x54/0x210
[   85.069560]        kthread_park+0x3d/0x50
[   85.069579]        i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
[   85.069600]        i915_gem_reset_prepare+0x2c/0x60 [i915]
[   85.069617]        i915_reset+0x66/0x230 [i915]
[   85.069635]        i915_reset_device+0x1cb/0x230 [i915]
[   85.069651]        i915_handle_error+0x2d3/0x430 [i915]
[   85.069670]        i915_wedged_set+0x79/0xc0 [i915]
[   85.069673]        simple_attr_write+0xab/0xc0
[   85.069677]        full_proxy_write+0x4b/0x70
[   85.069679]        __vfs_write+0x1e/0x130
[   85.069682]        vfs_write+0xc0/0x1b0
[   85.069684]        SyS_write+0x40/0xa0
[   85.069686]        entry_SYSCALL_64_fastpath+0x1c/0x89
[   85.069688] 
               other info that might help us debug this:

[   85.069692] Chain exists of:
                 (completion)&self->parked --> &mm->mmap_sem --> &dev->struct_mutex

[   85.069698]  Possible unsafe locking scenario:

[   85.069701]        CPU0                    CPU1
[   85.069703]        ----                    ----
[   85.069705]   lock(&dev->struct_mutex);
[   85.069707]                                lock(&mm->mmap_sem);
[   85.069710]                                lock(&dev->struct_mutex);
[   85.069713]   lock((completion)&self->parked);
[   85.069715] 
                *** DEADLOCK ***

[   85.069718] 3 locks held by gem_exec_captur/2810:
[   85.069722]  #0:  (sb_writers#10){.+.+}, at: [<ffffffff811ff05e>] vfs_write+0x15e/0x1b0
[   85.069727]  #1:  (&attr->mutex){+.+.}, at: [<ffffffff8122a866>] simple_attr_write+0x36/0xc0
[   85.069732]  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
[   85.069751] 
               stack backtrace:
[   85.069754] CPU: 2 PID: 2810 Comm: gem_exec_captur Tainted: G     U           4.15.0-rc1-CI-CI_DRM_3397+ #1
[   85.069758] Hardware name: MSI MS-7924/Z97M-G43(MS-7924), BIOS V1.12 02/15/2016
[   85.069760] Call Trace:
[   85.069765]  dump_stack+0x5f/0x86
[   85.069769]  print_circular_bug+0x230/0x3b0
[   85.069772]  check_prev_add+0x439/0x7b0
[   85.069775]  ? lockdep_init_map_crosslock+0x20/0x20
[   85.069778]  ? _raw_spin_unlock_irq+0x24/0x50
[   85.069782]  ? __lock_acquire+0x1385/0x15a0
[   85.069784]  __lock_acquire+0x1385/0x15a0
[   85.069788]  lock_acquire+0xaf/0x200
[   85.069790]  ? kthread_park+0x3d/0x50
[   85.069793]  wait_for_common+0x54/0x210
[   85.069797]  ? kthread_park+0x3d/0x50
[   85.069799]  ? _raw_spin_unlock_irqrestore+0x55/0x60
[   85.069802]  ? try_to_wake_up+0x2a2/0x600
[   85.069804]  ? _raw_spin_unlock_irqrestore+0x4c/0x60
[   85.069807]  kthread_park+0x3d/0x50
[   85.069827]  i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
[   85.069849]  i915_gem_reset_prepare+0x2c/0x60 [i915]
[   85.069865]  i915_reset+0x66/0x230 [i915]
[   85.069881]  i915_reset_device+0x1cb/0x230 [i915]
[   85.069898]  ? gen8_gt_irq_ack+0x160/0x160 [i915]
[   85.069902]  ? work_on_cpu_safe+0x50/0x50
[   85.069919]  i915_handle_error+0x2d3/0x430 [i915]
[   85.069923]  ? __mutex_lock+0x81/0x9b0
[   85.069926]  ? __mutex_lock+0x432/0x9b0
[   85.069929]  ? __might_fault+0x39/0x90
[   85.069933]  ? __might_fault+0x39/0x90
[   85.069951]  i915_wedged_set+0x79/0xc0 [i915]
[   85.069955]  simple_attr_write+0xab/0xc0
[   85.069959]  full_proxy_write+0x4b/0x70
[   85.069961]  __vfs_write+0x1e/0x130
[   85.069965]  ? rcu_read_lock_sched_held+0x6f/0x80
[   85.069968]  ? rcu_sync_lockdep_assert+0x25/0x50
[   85.069971]  ? __sb_start_write+0xd5/0x1f0
[   85.069973]  ? __sb_start_write+0xef/0x1f0
[   85.069976]  vfs_write+0xc0/0x1b0
[   85.069979]  SyS_write+0x40/0xa0
[   85.069982]  entry_SYSCALL_64_fastpath+0x1c/0x89
[   85.069984] RIP: 0033:0x7f22dd739670
[   85.069986] RSP: 002b:00007ffe3f6bc7d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[   85.069990] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f22dd739670
[   85.069994] RDX: 0000000000000002 RSI: 000055f260b5c376 RDI: 0000000000000007
[   85.069996] RBP: 0000000000000001 R08: 000055f2614ebed0 R09: 0000000000000000
[   85.069999] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f22df019000
[   85.070002] R13: 0000000000000007 R14: 00007f22df018000 R15: 0000000000000003
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-07 14:58     ` Daniel Vetter
@ 2017-12-07 19:57       ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-07 19:57 UTC (permalink / raw)
  To: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 03:58:28PM +0100, Daniel Vetter wrote:

> [   85.069417] gem_exec_captur/2810 is trying to acquire lock:
> [   85.069419]  ((completion)&self->parked){+.+.}, at: [<ffffffff8109d69d>] kthread_park+0x3d/0x50
> [   85.069426] 
>                but task is already holding lock:
> [   85.069428]  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
> [   85.069448] 
>                which lock already depends on the new lock.
> 
> [   85.069451] 
>                the existing dependency chain (in reverse order) is:
> [   85.069454] 
>                -> #3 (&dev->struct_mutex){+.+.}:
> [   85.069460]        __mutex_lock+0x81/0x9b0
> [   85.069481]        i915_mutex_lock_interruptible+0x47/0x130 [i915]
> [   85.069502]        i915_gem_fault+0x201/0x760 [i915]
> [   85.069507]        __do_fault+0x15/0x70
> [   85.069509]        __handle_mm_fault+0x7bf/0xda0
> [   85.069512]        handle_mm_fault+0x14f/0x2f0
> [   85.069515]        __do_page_fault+0x2d1/0x560
> [   85.069518]        page_fault+0x22/0x30
> [   85.069520] 
>                -> #2 (&mm->mmap_sem){++++}:
> [   85.069525]        __might_fault+0x63/0x90
> [   85.069529]        _copy_to_user+0x1e/0x70
> [   85.069532]        perf_read+0x21d/0x290
> [   85.069534]        __vfs_read+0x1e/0x120
> [   85.069536]        vfs_read+0xa1/0x150
> [   85.069539]        SyS_read+0x40/0xa0
> [   85.069541]        entry_SYSCALL_64_fastpath+0x1c/0x89

>                -> #0 ((completion)&self->parked){+.+.}:

> [   85.069692] Chain exists of:
>                  (completion)&self->parked --> &mm->mmap_sem --> &dev->struct_mutex

> [   85.069718] 3 locks held by gem_exec_captur/2810:

> [   85.069732]  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]

>                stack backtrace:

> [   85.069788]  lock_acquire+0xaf/0x200
> [   85.069793]  wait_for_common+0x54/0x210
> [   85.069807]  kthread_park+0x3d/0x50
> [   85.069827]  i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
> [   85.069849]  i915_gem_reset_prepare+0x2c/0x60 [i915]
> [   85.069865]  i915_reset+0x66/0x230 [i915]
> [   85.069881]  i915_reset_device+0x1cb/0x230 [i915]
> [   85.069919]  i915_handle_error+0x2d3/0x430 [i915]
> [   85.069951]  i915_wedged_set+0x79/0xc0 [i915]
> [   85.069955]  simple_attr_write+0xab/0xc0
> [   85.069959]  full_proxy_write+0x4b/0x70
> [   85.069961]  __vfs_write+0x1e/0x130
> [   85.069976]  vfs_write+0xc0/0x1b0
> [   85.069979]  SyS_write+0x40/0xa0
> [   85.069982]  entry_SYSCALL_64_fastpath+0x1c/0x89


	Thread-A				k-Thread

	i915_reset_device
#3	  mutex_lock(&dev->struct_mutex)
	  i915_reset()
	    i915_gem_reset_prepare()
	      i915_gem_reset_prepare_engine()
	        kthread_park()

						__do_page_fault()
#2						  down_read(&mm->mmap_sem);
						  handle_mm_fault()
						    __handle_mm_fault()
						      __do_fault()
						        i915_gem_fault()
							  i915_mutex_lock_interruptible()
#3							    mutex_lock(&dev->struct_mutex)

							/* twiddles thumbs forever more */

#0		  wait_for_common()

#0						complete()


Is what it says I suppose. Now I don't know enough about that i915 code
to say if that breadcrumbs_signal thread can ever trigger a fault or
not. I got properly lost in that dma_fence callback maze.

You're saying not?


(also, that comment near need_resched() doesn't make sense to me)

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-07 19:57       ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-07 19:57 UTC (permalink / raw)
  To: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 03:58:28PM +0100, Daniel Vetter wrote:

> [   85.069417] gem_exec_captur/2810 is trying to acquire lock:
> [   85.069419]  ((completion)&self->parked){+.+.}, at: [<ffffffff8109d69d>] kthread_park+0x3d/0x50
> [   85.069426] 
>                but task is already holding lock:
> [   85.069428]  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
> [   85.069448] 
>                which lock already depends on the new lock.
> 
> [   85.069451] 
>                the existing dependency chain (in reverse order) is:
> [   85.069454] 
>                -> #3 (&dev->struct_mutex){+.+.}:
> [   85.069460]        __mutex_lock+0x81/0x9b0
> [   85.069481]        i915_mutex_lock_interruptible+0x47/0x130 [i915]
> [   85.069502]        i915_gem_fault+0x201/0x760 [i915]
> [   85.069507]        __do_fault+0x15/0x70
> [   85.069509]        __handle_mm_fault+0x7bf/0xda0
> [   85.069512]        handle_mm_fault+0x14f/0x2f0
> [   85.069515]        __do_page_fault+0x2d1/0x560
> [   85.069518]        page_fault+0x22/0x30
> [   85.069520] 
>                -> #2 (&mm->mmap_sem){++++}:
> [   85.069525]        __might_fault+0x63/0x90
> [   85.069529]        _copy_to_user+0x1e/0x70
> [   85.069532]        perf_read+0x21d/0x290
> [   85.069534]        __vfs_read+0x1e/0x120
> [   85.069536]        vfs_read+0xa1/0x150
> [   85.069539]        SyS_read+0x40/0xa0
> [   85.069541]        entry_SYSCALL_64_fastpath+0x1c/0x89

>                -> #0 ((completion)&self->parked){+.+.}:

> [   85.069692] Chain exists of:
>                  (completion)&self->parked --> &mm->mmap_sem --> &dev->struct_mutex

> [   85.069718] 3 locks held by gem_exec_captur/2810:

> [   85.069732]  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]

>                stack backtrace:

> [   85.069788]  lock_acquire+0xaf/0x200
> [   85.069793]  wait_for_common+0x54/0x210
> [   85.069807]  kthread_park+0x3d/0x50
> [   85.069827]  i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
> [   85.069849]  i915_gem_reset_prepare+0x2c/0x60 [i915]
> [   85.069865]  i915_reset+0x66/0x230 [i915]
> [   85.069881]  i915_reset_device+0x1cb/0x230 [i915]
> [   85.069919]  i915_handle_error+0x2d3/0x430 [i915]
> [   85.069951]  i915_wedged_set+0x79/0xc0 [i915]
> [   85.069955]  simple_attr_write+0xab/0xc0
> [   85.069959]  full_proxy_write+0x4b/0x70
> [   85.069961]  __vfs_write+0x1e/0x130
> [   85.069976]  vfs_write+0xc0/0x1b0
> [   85.069979]  SyS_write+0x40/0xa0
> [   85.069982]  entry_SYSCALL_64_fastpath+0x1c/0x89


	Thread-A				k-Thread

	i915_reset_device
#3	  mutex_lock(&dev->struct_mutex)
	  i915_reset()
	    i915_gem_reset_prepare()
	      i915_gem_reset_prepare_engine()
	        kthread_park()

						__do_page_fault()
#2						  down_read(&mm->mmap_sem);
						  handle_mm_fault()
						    __handle_mm_fault()
						      __do_fault()
						        i915_gem_fault()
							  i915_mutex_lock_interruptible()
#3							    mutex_lock(&dev->struct_mutex)

							/* twiddles thumbs forever more */

#0		  wait_for_common()

#0						complete()


Is what it says I suppose. Now I don't know enough about that i915 code
to say if that breadcrumbs_signal thread can ever trigger a fault or
not. I got properly lost in that dma_fence callback maze.

You're saying not?


(also, that comment near need_resched() doesn't make sense to me)
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Intel-gfx] [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-07 19:57       ` Peter Zijlstra
@ 2017-12-07 20:56         ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-07 20:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 08:57:09PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 03:58:28PM +0100, Daniel Vetter wrote:
> 
> > [   85.069417] gem_exec_captur/2810 is trying to acquire lock:
> > [   85.069419]  ((completion)&self->parked){+.+.}, at: [<ffffffff8109d69d>] kthread_park+0x3d/0x50
> > [   85.069426] 
> >                but task is already holding lock:
> > [   85.069428]  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
> > [   85.069448] 
> >                which lock already depends on the new lock.
> > 
> > [   85.069451] 
> >                the existing dependency chain (in reverse order) is:
> > [   85.069454] 
> >                -> #3 (&dev->struct_mutex){+.+.}:
> > [   85.069460]        __mutex_lock+0x81/0x9b0
> > [   85.069481]        i915_mutex_lock_interruptible+0x47/0x130 [i915]
> > [   85.069502]        i915_gem_fault+0x201/0x760 [i915]
> > [   85.069507]        __do_fault+0x15/0x70
> > [   85.069509]        __handle_mm_fault+0x7bf/0xda0
> > [   85.069512]        handle_mm_fault+0x14f/0x2f0
> > [   85.069515]        __do_page_fault+0x2d1/0x560
> > [   85.069518]        page_fault+0x22/0x30
> > [   85.069520] 
> >                -> #2 (&mm->mmap_sem){++++}:
> > [   85.069525]        __might_fault+0x63/0x90
> > [   85.069529]        _copy_to_user+0x1e/0x70
> > [   85.069532]        perf_read+0x21d/0x290
> > [   85.069534]        __vfs_read+0x1e/0x120
> > [   85.069536]        vfs_read+0xa1/0x150
> > [   85.069539]        SyS_read+0x40/0xa0
> > [   85.069541]        entry_SYSCALL_64_fastpath+0x1c/0x89
> 
> >                -> #0 ((completion)&self->parked){+.+.}:
> 
> > [   85.069692] Chain exists of:
> >                  (completion)&self->parked --> &mm->mmap_sem --> &dev->struct_mutex
> 
> > [   85.069718] 3 locks held by gem_exec_captur/2810:
> 
> > [   85.069732]  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
> 
> >                stack backtrace:
> 
> > [   85.069788]  lock_acquire+0xaf/0x200
> > [   85.069793]  wait_for_common+0x54/0x210
> > [   85.069807]  kthread_park+0x3d/0x50
> > [   85.069827]  i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
> > [   85.069849]  i915_gem_reset_prepare+0x2c/0x60 [i915]
> > [   85.069865]  i915_reset+0x66/0x230 [i915]
> > [   85.069881]  i915_reset_device+0x1cb/0x230 [i915]
> > [   85.069919]  i915_handle_error+0x2d3/0x430 [i915]
> > [   85.069951]  i915_wedged_set+0x79/0xc0 [i915]
> > [   85.069955]  simple_attr_write+0xab/0xc0
> > [   85.069959]  full_proxy_write+0x4b/0x70
> > [   85.069961]  __vfs_write+0x1e/0x130
> > [   85.069976]  vfs_write+0xc0/0x1b0
> > [   85.069979]  SyS_write+0x40/0xa0
> > [   85.069982]  entry_SYSCALL_64_fastpath+0x1c/0x89
> 
> 
> 	Thread-A				k-Thread
> 
> 	i915_reset_device
> #3	  mutex_lock(&dev->struct_mutex)
> 	  i915_reset()
> 	    i915_gem_reset_prepare()
> 	      i915_gem_reset_prepare_engine()
> 	        kthread_park()
> 
> 						__do_page_fault()
> #2						  down_read(&mm->mmap_sem);
> 						  handle_mm_fault()
> 						    __handle_mm_fault()
> 						      __do_fault()
> 						        i915_gem_fault()
> 							  i915_mutex_lock_interruptible()
> #3							    mutex_lock(&dev->struct_mutex)
> 
> 							/* twiddles thumbs forever more */
> 
> #0		  wait_for_common()
> 
> #0						complete()
> 
> 
> Is what it says I suppose. Now I don't know enough about that i915 code
> to say if that breadcrumbs_signal thread can ever trigger a fault or
> not. I got properly lost in that dma_fence callback maze.
> 
> You're saying not?

Our own kthread, no. At least a ton of runs on our CI with the kthread
patch applied shut up lockdep splats for good. And since we have all the
i915 kthreads still with the same lockdep_map even with the patch applied,
since they are all created in the same function, I think that's pretty
solid evidence.

[There's also really no reasonable reason for it to fault, but I trust
automated tools more to check this stuff than my own brain. The test suite
we're running is fairly nasty and does all kinds of corner case
thrashing. Note that the dma_fence callbacks can be provided by any
other driver (think multi-gpu desktops and stuff), but the contract is
that they must be able to handle hardirq context. Faulting's definitely
not on the table.]

The problem lockdep seems to complain about is that some random other
kthread could fault, end up in the i915 fault handler, and get stuck until
i915_reset_device is done doing what it needs to do. But as long as that
kthread is in turn not providing a service that i915_reset_device needs, I
don't see how that can deadlock. And if we have that case (there was
definitely plenty of that stuff that cross-release uncovered in our code,
we had to shuffle a bunch of allocations and things out from under
dev->struct_mutex), then there should be another lock or completion that
closes the loop again.

Or I'm not understanding what you're asking, this stuff is a bit complicated
:-)

> (also, that comment near need_resched() doesn't make sense to me)

I assume you mean the one in intel_breadcrumbs_signaler(). The hw design
is somewhat screwed up and depends upon ridiculously low interrupt
servicing time. We get there by essentially implementing something like
netdev polled mode, from irq context. Like net polling if the scheduler
gets pissed at us we stop and dump it all into a kthread. From a latency
and ops/sec pov a gpu is pretty close to networking sometimes.

[Note: I just have a rough idea what the code is supposed to do, I didn't
write/review/debug that one.]

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-07 20:56         ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-07 20:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jens Axboe, Kees Cook, Greg Kroah-Hartman,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Tejun Heo, Daniel Vetter, Thomas Gleixner,
	Ingo Molnar

On Thu, Dec 07, 2017 at 08:57:09PM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 03:58:28PM +0100, Daniel Vetter wrote:
> 
> > [   85.069417] gem_exec_captur/2810 is trying to acquire lock:
> > [   85.069419]  ((completion)&self->parked){+.+.}, at: [<ffffffff8109d69d>] kthread_park+0x3d/0x50
> > [   85.069426] 
> >                but task is already holding lock:
> > [   85.069428]  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
> > [   85.069448] 
> >                which lock already depends on the new lock.
> > 
> > [   85.069451] 
> >                the existing dependency chain (in reverse order) is:
> > [   85.069454] 
> >                -> #3 (&dev->struct_mutex){+.+.}:
> > [   85.069460]        __mutex_lock+0x81/0x9b0
> > [   85.069481]        i915_mutex_lock_interruptible+0x47/0x130 [i915]
> > [   85.069502]        i915_gem_fault+0x201/0x760 [i915]
> > [   85.069507]        __do_fault+0x15/0x70
> > [   85.069509]        __handle_mm_fault+0x7bf/0xda0
> > [   85.069512]        handle_mm_fault+0x14f/0x2f0
> > [   85.069515]        __do_page_fault+0x2d1/0x560
> > [   85.069518]        page_fault+0x22/0x30
> > [   85.069520] 
> >                -> #2 (&mm->mmap_sem){++++}:
> > [   85.069525]        __might_fault+0x63/0x90
> > [   85.069529]        _copy_to_user+0x1e/0x70
> > [   85.069532]        perf_read+0x21d/0x290
> > [   85.069534]        __vfs_read+0x1e/0x120
> > [   85.069536]        vfs_read+0xa1/0x150
> > [   85.069539]        SyS_read+0x40/0xa0
> > [   85.069541]        entry_SYSCALL_64_fastpath+0x1c/0x89
> 
> >                -> #0 ((completion)&self->parked){+.+.}:
> 
> > [   85.069692] Chain exists of:
> >                  (completion)&self->parked --> &mm->mmap_sem --> &dev->struct_mutex
> 
> > [   85.069718] 3 locks held by gem_exec_captur/2810:
> 
> > [   85.069732]  #2:  (&dev->struct_mutex){+.+.}, at: [<ffffffffa039b13d>] i915_reset_device+0x1bd/0x230 [i915]
> 
> >                stack backtrace:
> 
> > [   85.069788]  lock_acquire+0xaf/0x200
> > [   85.069793]  wait_for_common+0x54/0x210
> > [   85.069807]  kthread_park+0x3d/0x50
> > [   85.069827]  i915_gem_reset_prepare_engine+0x1d/0x90 [i915]
> > [   85.069849]  i915_gem_reset_prepare+0x2c/0x60 [i915]
> > [   85.069865]  i915_reset+0x66/0x230 [i915]
> > [   85.069881]  i915_reset_device+0x1cb/0x230 [i915]
> > [   85.069919]  i915_handle_error+0x2d3/0x430 [i915]
> > [   85.069951]  i915_wedged_set+0x79/0xc0 [i915]
> > [   85.069955]  simple_attr_write+0xab/0xc0
> > [   85.069959]  full_proxy_write+0x4b/0x70
> > [   85.069961]  __vfs_write+0x1e/0x130
> > [   85.069976]  vfs_write+0xc0/0x1b0
> > [   85.069979]  SyS_write+0x40/0xa0
> > [   85.069982]  entry_SYSCALL_64_fastpath+0x1c/0x89
> 
> 
> 	Thread-A				k-Thread
> 
> 	i915_reset_device
> #3	  mutex_lock(&dev->struct_mutex)
> 	  i915_reset()
> 	    i915_gem_reset_prepare()
> 	      i915_gem_reset_prepare_engine()
> 	        kthread_park()
> 
> 						__do_page_fault()
> #2						  down_read(&mm->mmap_sem);
> 						  handle_mm_fault()
> 						    __handle_mm_fault()
> 						      __do_fault()
> 						        i915_gem_fault()
> 							  i915_mutex_lock_interruptible()
> #3							    mutex_lock(&dev->struct_mutex)
> 
> 							/* twiddles thumbs forever more */
> 
> #0		  wait_for_common()
> 
> #0						complete()
> 
> 
> Is what it says I suppose. Now I don't know enough about that i915 code
> to say if that breadcrumbs_signal thread can ever trigger a fault or
> not. I got properly lost in that dma_fence callback maze.
> 
> You're saying not?

Our own kthread, no. At least a ton of runs on our CI with the kthread
patch applied shut up lockdep splats for good. And since we have all the
i915 kthreads still with the same lockdep_map even with the patch applied,
since they are all created in the same function, I think that's pretty
solid evidence.

[There's also really no reasonable reason for it to fault, but I trust
automated tools more to check this stuff than my own brain. The test suite
we're running is fairly nasty and does all kinds of corner case
thrashing. Note that the dma_fence callbacks can be provided by any
other driver (think multi-gpu desktops and stuff), but the contract is
that they must be able to handle hardirq context. Faulting's definitely
not on the table.]

The problem lockdep seems to complain about is that some random other
kthread could fault, end up in the i915 fault handler, and get stuck until
i915_reset_device is done doing what it needs to do. But as long as that
kthread is in turn not providing a service that i915_reset_device needs, I
don't see how that can deadlock. And if we have that case (there was
definitely plenty of that stuff that cross-release uncovered in our code,
we had to shuffle a bunch of allocations and things out from under
dev->struct_mutex), then there should be another lock or completion that
closes the loop again.

Or I'm not understanding what you're asking, this stuff is a bit complicated
:-)

> (also, that comment near need_resched() doesn't make sense to me)

I assume you mean the one in intel_breadcrumbs_signaler(). The hw design
is somewhat screwed up and depends upon ridiculously low interrupt
servicing time. We get there by essentially implementing something like
netdev polled mode, from irq context. Like net polling if the scheduler
gets pissed at us we stop and dump it all into a kthread. From a latency
and ops/sec pov a gpu is pretty close to networking sometimes.

[Note: I just have a rough idea what the code is supposed to do, I didn't
write/review/debug that one.]

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Intel-gfx] [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-07 20:56         ` Daniel Vetter
@ 2017-12-08 10:14           ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-08 10:14 UTC (permalink / raw)
  To: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 09:56:57PM +0100, Daniel Vetter wrote:
> On Thu, Dec 07, 2017 at 08:57:09PM +0100, Peter Zijlstra wrote:

> > Is what it says I suppose. Now I don't know enough about that i915 code
> > to say if that breadcrumbs_signal thread can ever trigger a fault or
> > not. I got properly lost in that dma_fence callback maze.
> > 
> > You're saying not?
> 
> Our own kthread, no. At least a ton of runs on our CI with the kthread
> patch applied shut up lockdep splats for good. And since we have all the
> i915 kthreads still with the same lockdep_map even with the patch applied,
> since they are all created in the same function, I think that's pretty
> solid evidence.
> 
> [There's also really no reasonable reason for it to fault, but I trust
> automated tools more to check this stuff than my own brain. The test suite
> we're running is fairly nasty and does all kinds of corner case
> thrashing. Note that the dma_fence callbacks can be provided by any
> other driver (think multi-gpu desktops and stuff), but the contract is
> that they must be able to handle hardirq context. Faulting's definitely
> not on the table.]

OK, good.

> The problem lockdep seems to complain about is that some random other
> kthread could fault, end up in the i915 fault handler, and get stuck until
> i915_reset_device is done doing what it needs to do. But as long as that
> kthread is in turn not providing a service that i915_reset_device needs, I
> don't see how that can deadlock. And if we have that case (there was
> definitely plenty of that stuff that cross-release uncovered in our code,
> we had to shuffle a bunch of allocations and things out from under
> dev->struct_mutex), then there should be another lock or completion that
> closes the loop again.

Indeed so.

> > (also, that comment near need_resched() doesn't make sense to me)
> 
> I assume you mean the one in intel_breadcrumbs_signaler(). The hw design
> is somewhat screwed up and depends upon ridiculously low interrupt
> servicing time. We get there by essentially implementing something like
> netdev polled mode, from irq context. Like net polling if the scheduler
> gets pissed at us we stop and dump it all into a kthread. From a latency
> and ops/sec pov a gpu is pretty close to networking sometimes.
> 
> [Note: I just have a rough idea what the code is supposed to do, I didn't
> write/review/debug that one.]

The thing is though; that calling schedule() from an RT thread doesn't
help anything if it goes running instantly again.

And looking more; that uses the waitqueue code 'creatively' it doesn't
actually have a condition to wait on, so wtf is it doing with a
waitqueue?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-08 10:14           ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-08 10:14 UTC (permalink / raw)
  To: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 09:56:57PM +0100, Daniel Vetter wrote:
> On Thu, Dec 07, 2017 at 08:57:09PM +0100, Peter Zijlstra wrote:

> > Is what it says I suppose. Now I don't know enough about that i915 code
> > to say if that breadcrumbs_signal thread can ever trigger a fault or
> > not. I got properly lost in that dma_fence callback maze.
> > 
> > You're saying not?
> 
> Our own kthread, no. At least a ton of runs on our CI with the kthread
> patch applied shut up lockdep splats for good. And since we have all the
> i915 kthreads still with the same lockdep_map even with the patch applied,
> since they are all created in the same function, I think that's pretty
> solid evidence.
> 
> [There's also really no reasonable reason for it to fault, but I trust
> automated tools more to check this stuff than my own brain. The test suite
> we're running is fairly nasty and does all kinds of corner case
> thrashing. Note that the dma_fence callbacks can be provided by any
> other driver (think multi-gpu desktops and stuff), but the contract is
> that they must be able to handle hardirq context. Faulting's definitely
> not on the table.]

OK, good.

> The problem lockdep seems to complain about is that some random other
> kthread could fault, end up in the i915 fault handler, and get stuck until
> i915_reset_device is done doing what it needs to do. But as long as that
> kthread is in turn not providing a service that i915_reset_device needs, I
> don't see how that can deadlock. And if we have that case (there was
> definitely plenty of that stuff that cross-release uncovered in our code,
> we had to shuffle a bunch of allocations and things out from under
> dev->struct_mutex), then there should be another lock or completion that
> closes the loop again.

Indeed so.

> > (also, that comment near need_resched() doesn't make sense to me)
> 
> I assume you mean the one in intel_breadcrumbs_signaler(). The hw design
> is somewhat screwed up and depends upon ridiculously low interrupt
> servicing time. We get there by essentially implementing something like
> netdev polled mode, from irq context. Like net polling if the scheduler
> gets pissed at us we stop and dump it all into a kthread. From a latency
> and ops/sec pov a gpu is pretty close to networking sometimes.
> 
> [Note: I just have a rough idea what the code is supposed to do, I didn't
> write/review/debug that one.]

The thing is though; that calling schedule() from an RT thread doesn't
help anything if it goes running instantly again.

And looking more; that uses the waitqueue code 'creatively' it doesn't
actually have a condition to wait on, so wtf is it doing with a
waitqueue?

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-07 10:08 ` Daniel Vetter
@ 2017-12-08 10:54   ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-08 10:54 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> Since -rc1 we're hitting a bunch of lockdep splats using the new
> cross-release stuff around the 2 kthread completions. In all cases
> they are because totally independent uses of kthread are mixed up by
> lockdep into the same locking class, creating artificial deadlocks.
> 
> Fix this by converting kthread code in the same way as e.g.
> alloc_workqueue already works: Use macros for the public api so we can
> have a callsite specific lockdep key, then pass that through the
> entire callchain. Due to the many entry points this is slightly
> tedious.
> 
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Marta Lofstedt <marta.lofstedt@intel.com>
> Cc: Byungchul Park <byungchul.park@lge.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Shaohua Li <shli@fb.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: Oleg Nesterov <oleg@redhat.com>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-08 10:54   ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-08 10:54 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Jens Axboe, Kees Cook, Greg Kroah-Hartman,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Tejun Heo, Daniel Vetter, Thomas Gleixner,
	Ingo Molnar

On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> Since -rc1 we're hitting a bunch of lockdep splats using the new
> cross-release stuff around the 2 kthread completions. In all cases
> they are because totally independent uses of kthread are mixed up by
> lockdep into the same locking class, creating artificial deadlocks.
> 
> Fix this by converting kthread code in the same way as e.g.
> alloc_workqueue already works: Use macros for the public api so we can
> have a callsite specific lockdep key, then pass that through the
> entire callchain. Due to the many entry points this is slightly
> tedious.
> 
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Marta Lofstedt <marta.lofstedt@intel.com>
> Cc: Byungchul Park <byungchul.park@lge.com>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Shaohua Li <shli@fb.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Jens Axboe <axboe@kernel.dk>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: Jonathan Corbet <corbet@lwn.net>
> Cc: Oleg Nesterov <oleg@redhat.com>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Intel-gfx] [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-08 10:14           ` Peter Zijlstra
@ 2017-12-08 16:36             ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-08 16:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Fri, Dec 08, 2017 at 11:14:16AM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 09:56:57PM +0100, Daniel Vetter wrote:
> > On Thu, Dec 07, 2017 at 08:57:09PM +0100, Peter Zijlstra wrote:
> 
> > > Is what it says I suppose. Now I don't know enough about that i915 code
> > > to say if that breadcrumbs_signal thread can ever trigger a fault or
> > > not. I got properly lost in that dma_fence callback maze.
> > > 
> > > You're saying not?
> > 
> > Our own kthread, no. At least a ton of runs on our CI with the kthread
> > patch applied shut up lockdep splats for good. And since we have all the
> > i915 kthreads still with the same lockdep_map even with the patch applied,
> > since they are all created in the same function, I think that's pretty
> > solid evidence.
> > 
> > [There's also really no reasonable reason for it to fault, but I trust
> > automated tools more to check this stuff than my own brain. The test suite
> > we're running is fairly nasty and does all kinds of corner case
> > thrashing. Note that the dma_fence callbacks can be provided by any
> > other driver (think multi-gpu desktops and stuff), but the contract is
> > that they must be able to handle hardirq context. Faulting's definitely
> > not on the table.]
> 
> OK, good.

Aside: Could/should we take some fake lockdep locks around these
callbacks, since not all drivers call them from a hardirq context? Just to
validate that everyone follows the contract.

I need to ponder proper lockdep annotations for dma_fence anyway, since
they're just completions which also have some support for direct hw->hw
signalling.

> > The problem lockdep seems to complain about is that some random other
> > kthread could fault, end up in the i915 fault handler, and get stuck until
> > i915_reset_device is done doing what it needs to do. But as long as that
> > kthread is in turn not providing a service that i915_reset_device needs, I
> > don't see how that can deadlock. And if we have that case (there was
> > definitely plenty of that stuff that cross-release uncovered in our code,
> > we had to shuffle a bunch of allocations and things out from under
> > dev->struct_mutex), then there should be another lock or completion that
> > closes the loop again.
> 
> Indeed so.
> 
> > > (also, that comment near need_resched() doesn't make sense to me)
> > 
> > I assume you mean the one in intel_breadcrumbs_signaler(). The hw design
> > is somewhat screwed up and depends upon ridiculously low interrupt
> > servicing time. We get there by essentially implementing something like
> > netdev polled mode, from irq context. Like net polling if the scheduler
> > gets pissed at us we stop and dump it all into a kthread. From a latency
> > and ops/sec pov a gpu is pretty close to networking sometimes.
> > 
> > [Note: I just have a rough idea what the code is supposed to do, I didn't
> > write/review/debug that one.]
> 
> The thing is though; that calling schedule() from an RT thread doesn't
> help anything if it goes running instantly again.
> 
> And looking more; that uses the waitqueue code 'creatively' it doesn't
> actually have a condition to wait on, so wtf is it doing with a
> waitqueue?

Yeah that looks fishy. I discussed it with Chris, and the waitqueue stuff
is indeed broken. Chris has a patch to address that.

The main wakeup logic of the thread is done through
wake_up_process(breadcrumb->signaller) directly, so no ordering issue wrt
adding to the waitqueue. And then the proper waker/wakee pattern holds:

- set task state
- check condition
- schedule()

Link to Chris' patch:

https://www.spinics.net/lists/intel-gfx/msg149891.html

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-08 16:36             ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-08 16:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jens Axboe, Kees Cook, Greg Kroah-Hartman,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Tejun Heo, Daniel Vetter, Thomas Gleixner,
	Ingo Molnar

On Fri, Dec 08, 2017 at 11:14:16AM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 09:56:57PM +0100, Daniel Vetter wrote:
> > On Thu, Dec 07, 2017 at 08:57:09PM +0100, Peter Zijlstra wrote:
> 
> > > Is what it says I suppose. Now I don't know enough about that i915 code
> > > to say if that breadcrumbs_signal thread can ever trigger a fault or
> > > not. I got properly lost in that dma_fence callback maze.
> > > 
> > > You're saying not?
> > 
> > Our own kthread, no. At least a ton of runs on our CI with the kthread
> > patch applied shut up lockdep splats for good. And since we have all the
> > i915 kthreads still with the same lockdep_map even with the patch applied,
> > since they are all created in the same function, I think that's pretty
> > solid evidence.
> > 
> > [There's also really no reasonable reason for it to fault, but I trust
> > automated tools more to check this stuff than my own brain. The test suite
> > we're running is fairly nasty and does all kinds of corner case
> > thrashing. Note that the dma_fence callbacks can be provided by any
> > other driver (think multi-gpu desktops and stuff), but the contract is
> > that they must be able to handle hardirq context. Faulting's definitely
> > not on the table.]
> 
> OK, good.

Aside: Could/should we take some fake lockdep locks around these
callbacks, since not all drivers call them from a hardirq context? Just to
validate that everyone follows the contract.

I need to ponder proper lockdep annotations for dma_fence anyway, since
they're just completions which also have some support for direct hw->hw
signalling.

> > The problem lockdep seems to complain about is that some random other
> > kthread could fault, end up in the i915 fault handler, and get stuck until
> > i915_reset_device is done doing what it needs to do. But as long as that
> > kthread is in turn not providing a service that i915_reset_device needs, I
> > don't see how that can deadlock. And if we have that case (there was
> > definitely plenty of that stuff that cross-release uncovered in our code,
> > we had to shuffle a bunch of allocations and things out from under
> > dev->struct_mutex), then there should be another lock or completion that
> > closes the loop again.
> 
> Indeed so.
> 
> > > (also, that comment near need_resched() doesn't make sense to me)
> > 
> > I assume you mean the one in intel_breadcrumbs_signaler(). The hw design
> > is somewhat screwed up and depends upon ridiculously low interrupt
> > servicing time. We get there by essentially implementing something like
> > netdev polled mode, from irq context. Like net polling if the scheduler
> > gets pissed at us we stop and dump it all into a kthread. From a latency
> > and ops/sec pov a gpu is pretty close to networking sometimes.
> > 
> > [Note: I just have a rough idea what the code is supposed to do, I didn't
> > write/review/debug that one.]
> 
> The thing is though; that calling schedule() from an RT thread doesn't
> help anything if it goes running instantly again.
> 
> And looking more; that uses the waitqueue code 'creatively' it doesn't
> actually have a condition to wait on, so wtf is it doing with a
> waitqueue?

Yeah that looks fishy. I discussed it with Chris, and the waitqueue stuff
is indeed broken. Chris has a patch to address that.

The main wakeup logic of the thread is done through
wake_up_process(breadcrumb->signaller) directly, so no ordering issue wrt
adding to the waitqueue. And then the proper waker/wakee pattern holds:

- set task state
- check condition
- schedule()

Link to Chris' patch:

https://www.spinics.net/lists/intel-gfx/msg149891.html

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Intel-gfx] [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-08 16:36             ` Daniel Vetter
@ 2017-12-08 18:40               ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-08 18:40 UTC (permalink / raw)
  To: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Fri, Dec 08, 2017 at 05:36:28PM +0100, Daniel Vetter wrote:

> Aside: Could/should we take some fake lockdep locks around these
> callbacks, since not all drivers call them from a hardirq context? Just to
> validate that everyone follows the contract.

What I typically do is use local_irq_save/restore over the actual
callback. Then if someone doesn't honour the contract, it shows real
quick :-)

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [Intel-gfx] [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-08 18:40               ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2017-12-08 18:40 UTC (permalink / raw)
  To: LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Fri, Dec 08, 2017 at 05:36:28PM +0100, Daniel Vetter wrote:

> Aside: Could/should we take some fake lockdep locks around these
> callbacks, since not all drivers call them from a hardirq context? Just to
> validate that everyone follows the contract.

What I typically do is use local_irq_save/restore over the actual
callback. Then if someone doesn't honour the contract, it shows real
quick :-)
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-08 10:54   ` Peter Zijlstra
@ 2017-12-11  9:19     ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-11  9:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Daniel Vetter, LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter

On Fri, Dec 08, 2017 at 11:54:19AM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> > Since -rc1 we're hitting a bunch of lockdep splats using the new
> > cross-release stuff around the 2 kthread completions. In all cases
> > they are because totally independent uses of kthread are mixed up by
> > lockdep into the same locking class, creating artificial deadlocks.
> > 
> > Fix this by converting kthread code in the same way as e.g.
> > alloc_workqueue already works: Use macros for the public api so we can
> > have a callsite specific lockdep key, then pass that through the
> > entire callchain. Due to the many entry points this is slightly
> > tedious.
> > 
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Cc: Marta Lofstedt <marta.lofstedt@intel.com>
> > Cc: Byungchul Park <byungchul.park@lge.com>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Shaohua Li <shli@fb.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Jens Axboe <axboe@kernel.dk>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > Cc: Jonathan Corbet <corbet@lwn.net>
> > Cc: Oleg Nesterov <oleg@redhat.com>
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
> > Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Who's going to pick this up? Ingo, Andrew?

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-11  9:19     ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-11  9:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jens Axboe, Kees Cook, Tvrtko Ursulin, Daniel Vetter,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Greg Kroah-Hartman, Tejun Heo, Daniel Vetter,
	Thomas Gleixner, Ingo Molnar

On Fri, Dec 08, 2017 at 11:54:19AM +0100, Peter Zijlstra wrote:
> On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> > Since -rc1 we're hitting a bunch of lockdep splats using the new
> > cross-release stuff around the 2 kthread completions. In all cases
> > they are because totally independent uses of kthread are mixed up by
> > lockdep into the same locking class, creating artificial deadlocks.
> > 
> > Fix this by converting kthread code in the same way as e.g.
> > alloc_workqueue already works: Use macros for the public api so we can
> > have a callsite specific lockdep key, then pass that through the
> > entire callchain. Due to the many entry points this is slightly
> > tedious.
> > 
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Cc: Marta Lofstedt <marta.lofstedt@intel.com>
> > Cc: Byungchul Park <byungchul.park@lge.com>
> > Cc: Ingo Molnar <mingo@kernel.org>
> > Cc: Peter Zijlstra <peterz@infradead.org>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: Kees Cook <keescook@chromium.org>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Shaohua Li <shli@fb.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Jens Axboe <axboe@kernel.dk>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > Cc: Jonathan Corbet <corbet@lwn.net>
> > Cc: Oleg Nesterov <oleg@redhat.com>
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
> > Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> 
> Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Who's going to pick this up? Ingo, Andrew?

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-11  9:19     ` Daniel Vetter
@ 2017-12-18  7:11       ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-18  7:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Daniel Vetter, LKML, DRI Development, Intel Graphics Development,
	Tvrtko Ursulin, Marta Lofstedt, Byungchul Park, Ingo Molnar,
	Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li, Andrew Morton,
	Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter, Linus Torvalds

On Mon, Dec 11, 2017 at 10:19:28AM +0100, Daniel Vetter wrote:
> On Fri, Dec 08, 2017 at 11:54:19AM +0100, Peter Zijlstra wrote:
> > On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> > > Since -rc1 we're hitting a bunch of lockdep splats using the new
> > > cross-release stuff around the 2 kthread completions. In all cases
> > > they are because totally independent uses of kthread are mixed up by
> > > lockdep into the same locking class, creating artificial deadlocks.
> > > 
> > > Fix this by converting kthread code in the same way as e.g.
> > > alloc_workqueue already works: Use macros for the public api so we can
> > > have a callsite specific lockdep key, then pass that through the
> > > entire callchain. Due to the many entry points this is slightly
> > > tedious.
> > > 
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > Cc: Marta Lofstedt <marta.lofstedt@intel.com>
> > > Cc: Byungchul Park <byungchul.park@lge.com>
> > > Cc: Ingo Molnar <mingo@kernel.org>
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Tejun Heo <tj@kernel.org>
> > > Cc: Kees Cook <keescook@chromium.org>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Shaohua Li <shli@fb.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: Jens Axboe <axboe@kernel.dk>
> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > > Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > > Cc: Jonathan Corbet <corbet@lwn.net>
> > > Cc: Oleg Nesterov <oleg@redhat.com>
> > > References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
> > > Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> > 
> > Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> Who's going to pick this up? Ingo, Andrew?

This didn't seem to have made it into -rc4. Anything needed to get it
going?

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-18  7:11       ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-18  7:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Jens Axboe, Kees Cook, Tvrtko Ursulin, Daniel Vetter,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Greg Kroah-Hartman, Tejun Heo, Daniel Vetter,
	Thomas Gleixner, Linus Torvalds, Ingo Molnar

On Mon, Dec 11, 2017 at 10:19:28AM +0100, Daniel Vetter wrote:
> On Fri, Dec 08, 2017 at 11:54:19AM +0100, Peter Zijlstra wrote:
> > On Thu, Dec 07, 2017 at 11:08:49AM +0100, Daniel Vetter wrote:
> > > Since -rc1 we're hitting a bunch of lockdep splats using the new
> > > cross-release stuff around the 2 kthread completions. In all cases
> > > they are because totally independent uses of kthread are mixed up by
> > > lockdep into the same locking class, creating artificial deadlocks.
> > > 
> > > Fix this by converting kthread code in the same way as e.g.
> > > alloc_workqueue already works: Use macros for the public api so we can
> > > have a callsite specific lockdep key, then pass that through the
> > > entire callchain. Due to the many entry points this is slightly
> > > tedious.
> > > 
> > > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > Cc: Marta Lofstedt <marta.lofstedt@intel.com>
> > > Cc: Byungchul Park <byungchul.park@lge.com>
> > > Cc: Ingo Molnar <mingo@kernel.org>
> > > Cc: Peter Zijlstra <peterz@infradead.org>
> > > Cc: Tejun Heo <tj@kernel.org>
> > > Cc: Kees Cook <keescook@chromium.org>
> > > Cc: Thomas Gleixner <tglx@linutronix.de>
> > > Cc: Shaohua Li <shli@fb.com>
> > > Cc: Andrew Morton <akpm@linux-foundation.org>
> > > Cc: Jens Axboe <axboe@kernel.dk>
> > > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> > > Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> > > Cc: Jonathan Corbet <corbet@lwn.net>
> > > Cc: Oleg Nesterov <oleg@redhat.com>
> > > References: https://bugs.freedesktop.org/show_bug.cgi?id=103950
> > > Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> > 
> > Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> Who's going to pick this up? Ingo, Andrew?

This didn't seem to have made it into -rc4. Anything needed to get it
going?

Thanks, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-18  7:11       ` Daniel Vetter
@ 2017-12-18 17:42         ` Linus Torvalds
  -1 siblings, 0 replies; 36+ messages in thread
From: Linus Torvalds @ 2017-12-18 17:42 UTC (permalink / raw)
  To: Peter Zijlstra, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Byungchul Park, Ingo Molnar, Tejun Heo, Kees Cook,
	Thomas Gleixner, Shaohua Li, Andrew Morton, Jens Axboe,
	Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter, Daniel Vetter

On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>
> This didn't seem to have made it into -rc4. Anything needed to get it
> going?

Do you actually see the problem in -rc4?

Because we ended up removing the cross-release checking due to other
developers complaining. It seemed to need a lot more work before it
was ready.

So I suspect the patch is simply not relevant any more (even if it's
not necessarily wrong either).

                   Linus

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-18 17:42         ` Linus Torvalds
  0 siblings, 0 replies; 36+ messages in thread
From: Linus Torvalds @ 2017-12-18 17:42 UTC (permalink / raw)
  To: Peter Zijlstra, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Byungchul Park, Ingo Molnar, Tejun Heo, Kees Cook,
	Thomas Gleixner, Shaohua Li, Andrew Morton, Jens Axboe,
	Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter, Daniel Vetter

On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>
> This didn't seem to have made it into -rc4. Anything needed to get it
> going?

Do you actually see the problem in -rc4?

Because we ended up removing the cross-release checking due to other
developers complaining. It seemed to need a lot more work before it
was ready.

So I suspect the patch is simply not relevant any more (even if it's
not necessarily wrong either).

                   Linus
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-18 17:42         ` Linus Torvalds
@ 2017-12-19  9:59           ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-19  9:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Byungchul Park, Ingo Molnar, Tejun Heo, Kees Cook,
	Thomas Gleixner, Shaohua Li, Andrew Morton, Jens Axboe,
	Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter, Daniel Vetter

On Mon, Dec 18, 2017 at 09:42:13AM -0800, Linus Torvalds wrote:
> On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > This didn't seem to have made it into -rc4. Anything needed to get it
> > going?
> 
> Do you actually see the problem in -rc4?
> 
> Because we ended up removing the cross-release checking due to other
> developers complaining. It seemed to need a lot more work before it
> was ready.
> 
> So I suspect the patch is simply not relevant any more (even if it's
> not necessarily wrong either).

Awww ... I like the cross release stuff, it's catching lots of good bugs
for us - writing a gpu memory manager that's supposed to interface with
the core one ain't the simplest thing to get right. That's also why we're
going around and fixing up fallout (we've worked on the worker annotations
for 4.14 too). I guess next release, hopefully.

I think between the 4.14 and 4.15 attempts, cross-release spotted around 5 or so
genuine deadlocks in i915 code. And at least right now, with the current
set of fixups our CI runs are splat-free. So at least a Kconfig option
would be nice, to be able to enable it easily when you want to.

Wrt us not noticing: Getting the patches in through normal means takes too
long, so we constantly carry around a bunch of fixup patches to address
serious issues that block CI (no lockdep is no good CI). That's why we
won't immediately notice when an issue is resolved some other way.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-19  9:59           ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2017-12-19  9:59 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jens Axboe, Kees Cook, Peter Zijlstra, Greg Kroah-Hartman,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Byungchul Park, Andrew Morton,
	Shaohua Li, Tejun Heo, Daniel Vetter, Thomas Gleixner,
	Ingo Molnar

On Mon, Dec 18, 2017 at 09:42:13AM -0800, Linus Torvalds wrote:
> On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > This didn't seem to have made it into -rc4. Anything needed to get it
> > going?
> 
> Do you actually see the problem in -rc4?
> 
> Because we ended up removing the cross-release checking due to other
> developers complaining. It seemed to need a lot more work before it
> was ready.
> 
> So I suspect the patch is simply not relevant any more (even if it's
> not necessarily wrong either).

Awww ... I like the cross release stuff, it's catching lots of good bugs
for us - writing a gpu memory manager that's supposed to interface with
the core one ain't the simplest thing to get right. That's also why we're
going around and fixing up fallout (we've worked on the worker annotations
for 4.14 too). I guess next release, hopefully.

I think between the 4.14 and 4.15 attempts cross-release spotted around 5 or so
genuine deadlocks in i915 code. And at least right now, with the current
set of fixups our CI runs are splat-free. So at least a Kconfig option
would be nice, to be able to enable it easily when you want to.

Wrt us not noticing: Getting the patches in through normal means takes too
long, so we constantly carry around a bunch of fixup patches to address
serious issues that block CI (no lockdep is no good CI). That's why we
won't immediately notice when an issue is resolved some other way.

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-19  9:59           ` Daniel Vetter
@ 2017-12-20  1:14             ` Byungchul Park
  -1 siblings, 0 replies; 36+ messages in thread
From: Byungchul Park @ 2017-12-20  1:14 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Ingo Molnar, Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li,
	Andrew Morton, Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet,
	Oleg Nesterov, Daniel Vetter
  Cc: kernel-team, tglx, mingo, tytso, Amir Goldstein

On 12/19/2017 6:59 PM, Daniel Vetter wrote:
> On Mon, Dec 18, 2017 at 09:42:13AM -0800, Linus Torvalds wrote:
>> On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>>>
>>> This didn't seem to have made it into -rc4. Anything needed to get it
>>> going?
>>
>> Do you actually see the problem in -rc4?
>>
>> Because we ended up removing the cross-release checking due to other
>> developers complaining. It seemed to need a lot more work before it
>> was ready.
>>
>> So I suspect the patch is simply not relevant any more (even if it's
>> not necessarily wrong either).
> 
> Awww ... I like the cross release stuff, it's catching lots of good bugs
> for us - writing a gpu memory manager that's supposed to interface with
> the core one ain't the simplest thing to get right. That's also why we're
> going around and fixing up fallout (we've worked on the worker annotations
> for 4.14 too). I guess next release, hopefully.
> 
> I think between the 4.14 and 4.15 attempts cross-release spotted around 5 or so
> genuine deadlocks in i915 code. And at least right now, with the current
> set of fixups our CI runs are splat-free. So at least a Kconfig option
> would be nice, to be able to enable it easily when you want to.
> 
> Wrt us not noticing: Getting the patches in through normal means takes too
> long, so we constantly carry around a bunch of fixup patches to address
> serious issues that block CI (no lockdep is no good CI). That's why we
> won't immediately notice when an issue is resolved some other way.

Hello Ingo and Linus,

IMO, taking it off by default is enough. What fs folk actually
wanted is not to remove whole stuff but make it off by default.

Cross-release logic itself makes sense. Please consider it back
and apply my patch making it off by default.

I will do what I can do for the classification in layered fs.

-- 
Thanks,
Byungchul

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2017-12-20  1:14             ` Byungchul Park
  0 siblings, 0 replies; 36+ messages in thread
From: Byungchul Park @ 2017-12-20  1:14 UTC (permalink / raw)
  To: Linus Torvalds, Peter Zijlstra, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Tejun Heo, Kees Cook, Shaohua Li, Andrew Morton, Jens Axboe,
	Greg Kroah-Hartman, Jonathan Corbet, Oleg Nesterov,
	Daniel Vetter
  Cc: kernel-team, tglx, mingo, tytso, Amir Goldstein

On 12/19/2017 6:59 PM, Daniel Vetter wrote:
> On Mon, Dec 18, 2017 at 09:42:13AM -0800, Linus Torvalds wrote:
>> On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>>>
>>> This didn't seem to have made it into -rc4. Anything needed to get it
>>> going?
>>
>> Do you actually see the problem in -rc4?
>>
>> Because we ended up removing the cross-release checking due to other
>> developers complaining. It seemed to need a lot more work before it
>> was ready.
>>
>> So I suspect the patch is simply not relevant any more (even if it's
>> not necessarily wrong either).
> 
> Awww ... I like the cross release stuff, it's catching lots of good bugs
> for us - writing a gpu memory manager that's supposed to interface with
> the core one ain't the simplest thing to get right. That's also why we're
> going around and fixing up fallout (we've worked on the worker annotations
> for 4.14 too). I guess next release, hopefully.
> 
> I think between the 4.14 and 4.15 attempts cross-release spotted around 5 or so
> genuine deadlocks in i915 code. And at least right now, with the current
> set of fixups our CI runs are splat-free. So at least a Kconfig option
> would be nice, to be able to enable it easily when you want to.
> 
> Wrt us not noticing: Getting the patches in through normal means takes too
> long, so we constantly carry around a bunch of fixup patches to address
> serious issues that block CI (no lockdep is no good CI). That's why we
> won't immediately notice when an issue is resolved some other way.

Hello Ingo and Linus,

IMO, taking it off by default is enough. What fs folk actually
wanted is not to remove whole stuff but make it off by default.

Cross-release logic itself makes sense. Please consider it back
and apply my patch making it off by default.

I will do what I can do for the classification in layered fs.

-- 
Thanks,
Byungchul

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2017-12-20  1:14             ` Byungchul Park
@ 2018-03-15 10:31               ` Daniel Vetter
  -1 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2018-03-15 10:31 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Linus Torvalds, Peter Zijlstra, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Ingo Molnar, Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li,
	Andrew Morton, Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet,
	Oleg Nesterov, Daniel Vetter, kernel-team, Amir Goldstein,
	Theodore Ts'o

On Wed, Dec 20, 2017 at 2:14 AM, Byungchul Park <byungchul.park@lge.com> wrote:
> On 12/19/2017 6:59 PM, Daniel Vetter wrote:
>>
>> On Mon, Dec 18, 2017 at 09:42:13AM -0800, Linus Torvalds wrote:
>>>
>>> On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>
>>>>
>>>> This didn't seem to have made it into -rc4. Anything needed to get it
>>>> going?
>>>
>>>
>>> Do you actually see the problem in -rc4?
>>>
>>> Because we ended up removing the cross-release checking due to other
>>> developers complaining. It seemed to need a lot more work before it
>>> was ready.
>>>
>>> So I suspect the patch is simply not relevant any more (even if it's
>>> not necessarily wrong either).
>>
>>
>> Awww ... I like the cross release stuff, it's catching lots of good bugs
>> for us - writing a gpu memory manager that's supposed to interface with
>> the core one ain't the simplest thing to get right. That's also why we're
>> going around and fixing up fallout (we've worked on the worker annotations
>> for 4.14 too). I guess next release, hopefully.
>>
>> I think between the 4.14 and 4.15 attempts cross-release spotted around 5 or so
>> genuine deadlocks in i915 code. And at least right now, with the current
>> set of fixups our CI runs are splat-free. So at least a Kconfig option
>> would be nice, to be able to enable it easily when you want to.
>>
>> Wrt us not noticing: Getting the patches in through normal means takes too
>> long, so we constantly carry around a bunch of fixup patches to address
>> serious issues that block CI (no lockdep is no good CI). That's why we
>> won't immediately notice when an issue is resolved some other way.
>
>
> Hello Ingo and Linus,
>
> IMO, taking it off by default is enough. What fs folk actually
> wanted is not to remove whole stuff but make it off by default.
>
> Cross-release logic itself makes sense. Please consider it back
> and apply my patch making it off by default.
>
> I will do what I can do for the classification in layered fs.

Is there any progress on getting cross-release enabled again?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2018-03-15 10:31               ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2018-03-15 10:31 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Jens Axboe, Amir Goldstein, Theodore Ts'o, Kees Cook,
	Tvrtko Ursulin, Peter Zijlstra, Greg Kroah-Hartman,
	Intel Graphics Development, Jonathan Corbet, LKML,
	DRI Development, Oleg Nesterov, Andrew Morton, kernel-team,
	Shaohua Li, Tejun Heo, Daniel Vetter, Thomas Gleixner,
	Linus Torvalds, Ingo Molnar

On Wed, Dec 20, 2017 at 2:14 AM, Byungchul Park <byungchul.park@lge.com> wrote:
> On 12/19/2017 6:59 PM, Daniel Vetter wrote:
>>
>> On Mon, Dec 18, 2017 at 09:42:13AM -0800, Linus Torvalds wrote:
>>>
>>> On Sun, Dec 17, 2017 at 11:11 PM, Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>
>>>>
>>>> This didn't seem to have made it into -rc4. Anything needed to get it
>>>> going?
>>>
>>>
>>> Do you actually see the problem in -rc4?
>>>
>>> Because we ended up removing the cross-release checking due to other
>>> developers complaining. It seemed to need a lot more work before it
>>> was ready.
>>>
>>> So I suspect the patch is simply not relevant any more (even if it's
>>> not necessarily wrong either).
>>
>>
>> Awww ... I like the cross release stuff, it's catching lots of good bugs
>> for us - writing a gpu memory manager that's supposed to interface with
>> the core one ain't the simplest thing to get right. That's also why we're
>> going around and fixing up fallout (we've worked on the worker annotations
>> for 4.14 too). I guess next release, hopefully.
>>
>> I think between the 4.14 and 4.15 attempts cross-release spotted around 5 or so
>> genuine deadlocks in i915 code. And at least right now, with the current
>> set of fixups our CI runs are splat-free. So at least a Kconfig option
>> would be nice, to be able to enable it easily when you want to.
>>
>> Wrt us not noticing: Getting the patches in through normal means takes too
>> long, so we constantly carry around a bunch of fixup patches to address
>> serious issues that block CI (no lockdep is no good CI). That's why we
>> won't immediately notice when an issue is resolved some other way.
>
>
> Hello Ingo and Linus,
>
> IMO, taking it off by default is enough. What fs folk actually
> wanted is not to remove whole stuff but make it off by default.
>
> Cross-release logic itself makes sense. Please consider it back
> and apply my patch making it off by default.
>
> I will do what I can do for the classification in layered fs.

Is there any progress on getting cross-release enabled again?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2018-03-15 10:31               ` Daniel Vetter
@ 2018-03-15 12:41                 ` Peter Zijlstra
  -1 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2018-03-15 12:41 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Byungchul Park, Linus Torvalds, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Ingo Molnar, Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li,
	Andrew Morton, Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet,
	Oleg Nesterov, Daniel Vetter, kernel-team, Amir Goldstein,
	Theodore Ts'o

On Thu, Mar 15, 2018 at 11:31:57AM +0100, Daniel Vetter wrote:
> Is there any progress on getting cross-release enabled again?

Not yet, I'm still fighting the meltdown/spectre induced backlog.

We'll get to it eventually.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2018-03-15 12:41                 ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2018-03-15 12:41 UTC (permalink / raw)
  To: Daniel Vetter
  Cc: Jens Axboe, Amir Goldstein, Theodore Ts'o, Kees Cook,
	Tvrtko Ursulin, Greg Kroah-Hartman, Intel Graphics Development,
	Jonathan Corbet, LKML, DRI Development, Oleg Nesterov,
	Byungchul Park, kernel-team, Andrew Morton, Shaohua Li,
	Tejun Heo, Daniel Vetter, Thomas Gleixner, Linus Torvalds,
	Ingo Molnar

On Thu, Mar 15, 2018 at 11:31:57AM +0100, Daniel Vetter wrote:
> Is there any progress on getting cross-release enabled again?

Not yet, I'm still fighting the meltdown/spectre induced backlog.

We'll get to it eventually.
_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2018-03-15 12:41                 ` Peter Zijlstra
  (?)
@ 2018-03-15 23:26                 ` Byungchul Park
  2018-10-18  8:56                     ` Daniel Vetter
  -1 siblings, 1 reply; 36+ messages in thread
From: Byungchul Park @ 2018-03-15 23:26 UTC (permalink / raw)
  To: Peter Zijlstra, Daniel Vetter
  Cc: Linus Torvalds, LKML, DRI Development,
	Intel Graphics Development, Tvrtko Ursulin, Marta Lofstedt,
	Ingo Molnar, Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li,
	Andrew Morton, Jens Axboe, Greg Kroah-Hartman, Jonathan Corbet,
	Oleg Nesterov, Daniel Vetter, kernel-team, Amir Goldstein,
	Theodore Ts'o

On 3/15/2018 9:41 PM, Peter Zijlstra wrote:
> On Thu, Mar 15, 2018 at 11:31:57AM +0100, Daniel Vetter wrote:
>> Is there any progress on getting cross-release enabled again?
> 
> Not yet, I'm still fighting the meltdown/spectre induced backlog.
> 
> We'll get to it eventually.

Please let me know when you become available so that I can start.

-- 
Thanks,
Byungchul

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
  2018-03-15 23:26                 ` Byungchul Park
@ 2018-10-18  8:56                     ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2018-10-18  8:56 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Peter Zijlstra, Linus Torvalds, Linux Kernel Mailing List,
	dri-devel, intel-gfx, Tvrtko Ursulin, Marta Lofstedt,
	Ingo Molnar, Tejun Heo, Kees Cook, Thomas Gleixner, Shaohua Li,
	Andrew Morton, Jens Axboe, Greg KH, Jonathan Corbet,
	Oleg Nesterov, Daniel Vetter, kernel-team, Amir Goldstein,
	Theodore Ts'o

On Fri, Mar 16, 2018 at 12:26 AM Byungchul Park <byungchul.park@lge.com> wrote:
>
> On 3/15/2018 9:41 PM, Peter Zijlstra wrote:
> > On Thu, Mar 15, 2018 at 11:31:57AM +0100, Daniel Vetter wrote:
> >> Is there any progress on getting cross-release enabled again?
> >
> > Not yet, I'm still fighting the meltdown/spectre induced backlog.
> >
> > We'll get to it eventually.
>
> Please let me know when you become available so that I can start.

Hi Byungchul,

Do you have a branch somewhere with latest/rebased cross-release
patches? I'd be interested in resurrecting them, at least for local
testing in drm. I'm still carrying around some patches to improve
annotations here.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [PATCH] kthread: finer-grained lockdep/cross-release completion
@ 2018-10-18  8:56                     ` Daniel Vetter
  0 siblings, 0 replies; 36+ messages in thread
From: Daniel Vetter @ 2018-10-18  8:56 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Jens Axboe, Amir Goldstein, Theodore Ts'o, Kees Cook,
	Peter Zijlstra, Greg KH, intel-gfx, Jonathan Corbet,
	Linux Kernel Mailing List, dri-devel, Oleg Nesterov,
	Andrew Morton, kernel-team, Marta Lofstedt, Shaohua Li,
	Tejun Heo, Daniel Vetter, Thomas Gleixner, Linus Torvalds,
	Ingo Molnar

On Fri, Mar 16, 2018 at 12:26 AM Byungchul Park <byungchul.park@lge.com> wrote:
>
> On 3/15/2018 9:41 PM, Peter Zijlstra wrote:
> > On Thu, Mar 15, 2018 at 11:31:57AM +0100, Daniel Vetter wrote:
> >> Is there any progress on getting cross-release enabled again?
> >
> > Not yet, I'm still fighting the meltdown/spectre induced backlog.
> >
> > We'll get to it eventually.
>
> Please let me know when you become available so that I can start.

Hi Byungchul,

Do you have a branch somewhere with latest/rebased cross-release
patches? I'd be interested in resurrecting them, at least for local
testing in drm. I'm still carrying around some patches to improve
annotations here.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2018-10-18  8:56 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-12-07 10:08 [PATCH] kthread: finer-grained lockdep/cross-release completion Daniel Vetter
2017-12-07 10:08 ` Daniel Vetter
2017-12-07 12:22 ` Peter Zijlstra
2017-12-07 12:22   ` Peter Zijlstra
2017-12-07 14:58   ` Daniel Vetter
2017-12-07 14:58     ` Daniel Vetter
2017-12-07 19:57     ` Peter Zijlstra
2017-12-07 19:57       ` Peter Zijlstra
2017-12-07 20:56       ` [Intel-gfx] " Daniel Vetter
2017-12-07 20:56         ` Daniel Vetter
2017-12-08 10:14         ` [Intel-gfx] " Peter Zijlstra
2017-12-08 10:14           ` Peter Zijlstra
2017-12-08 16:36           ` [Intel-gfx] " Daniel Vetter
2017-12-08 16:36             ` Daniel Vetter
2017-12-08 18:40             ` [Intel-gfx] " Peter Zijlstra
2017-12-08 18:40               ` Peter Zijlstra
2017-12-07 12:56 ` ✗ Fi.CI.BAT: failure for " Patchwork
2017-12-08 10:54 ` [PATCH] " Peter Zijlstra
2017-12-08 10:54   ` Peter Zijlstra
2017-12-11  9:19   ` Daniel Vetter
2017-12-11  9:19     ` Daniel Vetter
2017-12-18  7:11     ` Daniel Vetter
2017-12-18  7:11       ` Daniel Vetter
2017-12-18 17:42       ` Linus Torvalds
2017-12-18 17:42         ` Linus Torvalds
2017-12-19  9:59         ` Daniel Vetter
2017-12-19  9:59           ` Daniel Vetter
2017-12-20  1:14           ` Byungchul Park
2017-12-20  1:14             ` Byungchul Park
2018-03-15 10:31             ` Daniel Vetter
2018-03-15 10:31               ` Daniel Vetter
2018-03-15 12:41               ` Peter Zijlstra
2018-03-15 12:41                 ` Peter Zijlstra
2018-03-15 23:26                 ` Byungchul Park
2018-10-18  8:56                   ` Daniel Vetter
2018-10-18  8:56                     ` Daniel Vetter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.