* [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
@ 2017-08-17  8:57 Byungchul Park
  2017-08-17  8:57 ` [PATCH v3 2/3] lockdep: Reword title of LOCKDEP_CROSSRELEASE config Byungchul Park
                   ` (3 more replies)
  0 siblings, 4 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-17  8:57 UTC (permalink / raw)
  To: peterz, mingo; +Cc: linux-kernel, kernel-team

Crossrelease added the LOCKDEP_CROSSRELEASE and LOCKDEP_COMPLETE configs. It
makes little sense to enable them when PROVE_LOCKING is disabled.

Make them non-interactive options and all part of PROVE_LOCKING.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
---
 lib/Kconfig.debug | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ee9e534..84bb4c5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1084,6 +1084,8 @@ config PROVE_LOCKING
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
+	select LOCKDEP_CROSSRELEASE
+	select LOCKDEP_COMPLETE
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1155,8 +1157,6 @@ config LOCK_STAT
 
 config LOCKDEP_CROSSRELEASE
 	bool "Lock debugging: make lockdep work for crosslocks"
-	depends on PROVE_LOCKING
-	default n
 	help
 	 This makes lockdep work for crosslock which is a lock allowed to
 	 be released in a different context from the acquisition context.
@@ -1167,9 +1167,6 @@ config LOCKDEP_CROSSRELEASE
 
 config LOCKDEP_COMPLETE
 	bool "Lock debugging: allow completions to use deadlock detector"
-	depends on PROVE_LOCKING
-	select LOCKDEP_CROSSRELEASE
-	default n
 	help
 	 A deadlock caused by wait_for_completion() and complete() can be
 	 detected by lockdep using crossrelease feature.
-- 
1.9.1

* [PATCH v3 2/3] lockdep: Reword title of LOCKDEP_CROSSRELEASE config
  2017-08-17  8:57 [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Byungchul Park
@ 2017-08-17  8:57 ` Byungchul Park
  2017-08-17 10:21   ` [tip:locking/core] locking/lockdep: " tip-bot for Byungchul Park
  2017-08-17  8:57 ` [PATCH v3 3/3] lockdep: Rename LOCKDEP_COMPLETE config Byungchul Park
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-17  8:57 UTC (permalink / raw)
  To: peterz, mingo; +Cc: linux-kernel, kernel-team

Lockdep doesn't have to be made to work with crosslocks; it just works with
them. Reword the title so that what the option does is clear.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 84bb4c5..444bb7b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1156,7 +1156,7 @@ config LOCK_STAT
 	 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
 
 config LOCKDEP_CROSSRELEASE
-	bool "Lock debugging: make lockdep work for crosslocks"
+	bool "Lock debugging: enable cross-locking checks in lockdep"
 	help
 	 This makes lockdep work for crosslock which is a lock allowed to
 	 be released in a different context from the acquisition context.
-- 
1.9.1

* [PATCH v3 3/3] lockdep: Rename LOCKDEP_COMPLETE config
  2017-08-17  8:57 [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Byungchul Park
  2017-08-17  8:57 ` [PATCH v3 2/3] lockdep: Reword title of LOCKDEP_CROSSRELEASE config Byungchul Park
@ 2017-08-17  8:57 ` Byungchul Park
  2017-08-17 10:22   ` [tip:locking/core] locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS tip-bot for Byungchul Park
  2017-08-17 10:21 ` [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING tip-bot for Byungchul Park
  2017-08-21 15:46 ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
  3 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-17  8:57 UTC (permalink / raw)
  To: peterz, mingo; +Cc: linux-kernel, kernel-team

'complete' is an adjective, so the name sounds like 'lockdep is complete'.
Either way, it was not named properly. LOCKDEP_COMPLETIONS better
describes what it represents.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
---
 include/linux/completion.h | 8 ++++----
 lib/Kconfig.debug          | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 9bcebf5..791f053 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -9,7 +9,7 @@
  */
 
 #include <linux/wait.h>
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 #include <linux/lockdep.h>
 #endif
 
@@ -28,12 +28,12 @@
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 	struct lockdep_map_cross map;
 #endif
 };
 
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 static inline void complete_acquire(struct completion *x)
 {
 	lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
@@ -64,7 +64,7 @@ static inline void complete_release(struct completion *x) {}
 static inline void complete_release_commit(struct completion *x) {}
 #endif
 
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 #define COMPLETION_INITIALIZER(work) \
 	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
 	STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 444bb7b..8524eba 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1085,7 +1085,7 @@ config PROVE_LOCKING
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
 	select LOCKDEP_CROSSRELEASE
-	select LOCKDEP_COMPLETE
+	select LOCKDEP_COMPLETIONS
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1165,7 +1165,7 @@ config LOCKDEP_CROSSRELEASE
 	 such as page locks or completions can use the lock correctness
 	 detector, lockdep.
 
-config LOCKDEP_COMPLETE
+config LOCKDEP_COMPLETIONS
 	bool "Lock debugging: allow completions to use deadlock detector"
 	help
 	 A deadlock caused by wait_for_completion() and complete() can be
-- 
1.9.1

* [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING
  2017-08-17  8:57 [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Byungchul Park
  2017-08-17  8:57 ` [PATCH v3 2/3] lockdep: Reword title of LOCKDEP_CROSSRELEASE config Byungchul Park
  2017-08-17  8:57 ` [PATCH v3 3/3] lockdep: Rename LOCKDEP_COMPLETE config Byungchul Park
@ 2017-08-17 10:21 ` tip-bot for Byungchul Park
  2017-08-17 10:45   ` Ingo Molnar
  2017-08-21 15:46 ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
  3 siblings, 1 reply; 50+ messages in thread
From: tip-bot for Byungchul Park @ 2017-08-17 10:21 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: hpa, mingo, byungchul.park, linux-kernel, tglx, torvalds, peterz

Commit-ID:  d0541b0fa64b36665d6261079974a26943c75009
Gitweb:     http://git.kernel.org/tip/d0541b0fa64b36665d6261079974a26943c75009
Author:     Byungchul Park <byungchul.park@lge.com>
AuthorDate: Thu, 17 Aug 2017 17:57:39 +0900
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 17 Aug 2017 11:38:54 +0200

locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING

Crossrelease support added the CONFIG_LOCKDEP_CROSSRELEASE and CONFIG_LOCKDEP_COMPLETE
options. It makes little sense to enable them when PROVE_LOCKING is disabled.

Make them non-interactive options and part of PROVE_LOCKING to simplify the UI.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@lge.com
Link: http://lkml.kernel.org/r/1502960261-16206-1-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 lib/Kconfig.debug | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ebd40d3..1ad7f1b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1081,6 +1081,8 @@ config PROVE_LOCKING
 	select DEBUG_MUTEXES
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
+	select LOCKDEP_CROSSRELEASE
+	select LOCKDEP_COMPLETE
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1152,8 +1154,6 @@ config LOCK_STAT
 
 config LOCKDEP_CROSSRELEASE
 	bool "Lock debugging: make lockdep work for crosslocks"
-	depends on PROVE_LOCKING
-	default n
 	help
 	 This makes lockdep work for crosslock which is a lock allowed to
 	 be released in a different context from the acquisition context.
@@ -1164,9 +1164,6 @@ config LOCKDEP_CROSSRELEASE
 
 config LOCKDEP_COMPLETE
 	bool "Lock debugging: allow completions to use deadlock detector"
-	depends on PROVE_LOCKING
-	select LOCKDEP_CROSSRELEASE
-	default n
 	help
 	 A deadlock caused by wait_for_completion() and complete() can be
 	 detected by lockdep using crossrelease feature.

* [tip:locking/core] locking/lockdep: Reword title of LOCKDEP_CROSSRELEASE config
  2017-08-17  8:57 ` [PATCH v3 2/3] lockdep: Reword title of LOCKDEP_CROSSRELEASE config Byungchul Park
@ 2017-08-17 10:21   ` tip-bot for Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: tip-bot for Byungchul Park @ 2017-08-17 10:21 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: byungchul.park, peterz, mingo, hpa, linux-kernel, tglx, torvalds

Commit-ID:  0f0a22260d613b4ee3f483ee1ea6fa27f92a9e40
Gitweb:     http://git.kernel.org/tip/0f0a22260d613b4ee3f483ee1ea6fa27f92a9e40
Author:     Byungchul Park <byungchul.park@lge.com>
AuthorDate: Thu, 17 Aug 2017 17:57:40 +0900
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 17 Aug 2017 11:38:55 +0200

locking/lockdep: Reword title of LOCKDEP_CROSSRELEASE config

Lockdep doesn't have to be made to work with crosslocks; it just works with
them. Reword the title so that what the option does is clear.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@lge.com
Link: http://lkml.kernel.org/r/1502960261-16206-2-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1ad7f1b..8fb8a20 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1153,7 +1153,7 @@ config LOCK_STAT
 	 (CONFIG_LOCKDEP defines "acquire" and "release" events.)
 
 config LOCKDEP_CROSSRELEASE
-	bool "Lock debugging: make lockdep work for crosslocks"
+	bool "Lock debugging: enable cross-locking checks in lockdep"
 	help
 	 This makes lockdep work for crosslock which is a lock allowed to
 	 be released in a different context from the acquisition context.

* [tip:locking/core] locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS
  2017-08-17  8:57 ` [PATCH v3 3/3] lockdep: Rename LOCKDEP_COMPLETE config Byungchul Park
@ 2017-08-17 10:22   ` tip-bot for Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: tip-bot for Byungchul Park @ 2017-08-17 10:22 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: byungchul.park, hpa, tglx, linux-kernel, peterz, torvalds, mingo

Commit-ID:  ea3f2c0fdfbb180f142cdbc0d1f055c7b9a5e63e
Gitweb:     http://git.kernel.org/tip/ea3f2c0fdfbb180f142cdbc0d1f055c7b9a5e63e
Author:     Byungchul Park <byungchul.park@lge.com>
AuthorDate: Thu, 17 Aug 2017 17:57:41 +0900
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Thu, 17 Aug 2017 11:38:55 +0200

locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS

'complete' is an adjective and LOCKDEP_COMPLETE sounds like 'lockdep is complete',
so pick a better name that uses a noun.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@lge.com
Link: http://lkml.kernel.org/r/1502960261-16206-3-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/completion.h | 8 ++++----
 lib/Kconfig.debug          | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 9bcebf5..791f053 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -9,7 +9,7 @@
  */
 
 #include <linux/wait.h>
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 #include <linux/lockdep.h>
 #endif
 
@@ -28,12 +28,12 @@
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 	struct lockdep_map_cross map;
 #endif
 };
 
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 static inline void complete_acquire(struct completion *x)
 {
 	lock_acquire_exclusive((struct lockdep_map *)&x->map, 0, 0, NULL, _RET_IP_);
@@ -64,7 +64,7 @@ static inline void complete_release(struct completion *x) {}
 static inline void complete_release_commit(struct completion *x) {}
 #endif
 
-#ifdef CONFIG_LOCKDEP_COMPLETE
+#ifdef CONFIG_LOCKDEP_COMPLETIONS
 #define COMPLETION_INITIALIZER(work) \
 	{ 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \
 	STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8fb8a20..a0e60d5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1082,7 +1082,7 @@ config PROVE_LOCKING
 	select DEBUG_RT_MUTEXES if RT_MUTEXES
 	select DEBUG_LOCK_ALLOC
 	select LOCKDEP_CROSSRELEASE
-	select LOCKDEP_COMPLETE
+	select LOCKDEP_COMPLETIONS
 	select TRACE_IRQFLAGS
 	default n
 	help
@@ -1162,7 +1162,7 @@ config LOCKDEP_CROSSRELEASE
 	 such as page locks or completions can use the lock correctness
 	 detector, lockdep.
 
-config LOCKDEP_COMPLETE
+config LOCKDEP_COMPLETIONS
 	bool "Lock debugging: allow completions to use deadlock detector"
 	help
 	 A deadlock caused by wait_for_completion() and complete() can be

* Re: [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING
  2017-08-17 10:21 ` [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING tip-bot for Byungchul Park
@ 2017-08-17 10:45   ` Ingo Molnar
  2017-08-18  5:33     ` Byungchul Park
  0 siblings, 1 reply; 50+ messages in thread
From: Ingo Molnar @ 2017-08-17 10:45 UTC (permalink / raw)
  To: torvalds, peterz, tglx, linux-kernel, byungchul.park, hpa
  Cc: linux-tip-commits


* tip-bot for Byungchul Park <tipbot@zytor.com> wrote:

> Commit-ID:  d0541b0fa64b36665d6261079974a26943c75009
> Gitweb:     http://git.kernel.org/tip/d0541b0fa64b36665d6261079974a26943c75009
> Author:     Byungchul Park <byungchul.park@lge.com>
> AuthorDate: Thu, 17 Aug 2017 17:57:39 +0900
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Thu, 17 Aug 2017 11:38:54 +0200
> 
> locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING
> 
> Crossrelease support added the CONFIG_LOCKDEP_CROSSRELEASE and CONFIG_LOCKDEP_COMPLETE
> options. It makes little sense to enable them when PROVE_LOCKING is disabled.
> 
> Make them non-interactive options and part of PROVE_LOCKING to simplify the UI.
> 
> Signed-off-by: Byungchul Park <byungchul.park@lge.com>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: kernel-team@lge.com
> Link: http://lkml.kernel.org/r/1502960261-16206-1-git-send-email-byungchul.park@lge.com
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> ---
>  lib/Kconfig.debug | 7 ++-----
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index ebd40d3..1ad7f1b 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -1081,6 +1081,8 @@ config PROVE_LOCKING
>  	select DEBUG_MUTEXES
>  	select DEBUG_RT_MUTEXES if RT_MUTEXES
>  	select DEBUG_LOCK_ALLOC
> +	select LOCKDEP_CROSSRELEASE
> +	select LOCKDEP_COMPLETE
>  	select TRACE_IRQFLAGS
>  	default n
>  	help
> @@ -1152,8 +1154,6 @@ config LOCK_STAT
>  
>  config LOCKDEP_CROSSRELEASE
>  	bool "Lock debugging: make lockdep work for crosslocks"
> -	depends on PROVE_LOCKING
> -	default n
>  	help
>  	 This makes lockdep work for crosslock which is a lock allowed to
>  	 be released in a different context from the acquisition context.
> @@ -1164,9 +1164,6 @@ config LOCKDEP_CROSSRELEASE
>  
>  config LOCKDEP_COMPLETE
>  	bool "Lock debugging: allow completions to use deadlock detector"
> -	depends on PROVE_LOCKING
> -	select LOCKDEP_CROSSRELEASE
> -	default n
>  	help

Yeah, so I only noticed this after committing the patches, but this change does
not make the option non-interactive. The way to do that is to remove the ""
prompt text, i.e. make it a simple 'bool'.
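
Illustratively, that amounts to the following (paraphrasing what the
follow-up change would look like, not quoting it):

	 config LOCKDEP_CROSSRELEASE
	-	bool "Lock debugging: enable cross-locking checks in lockdep"
	+	bool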

I'll do that and re-push.

Thanks,

	Ingo

* Re: [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING
  2017-08-17 10:45   ` Ingo Molnar
@ 2017-08-18  5:33     ` Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-18  5:33 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: torvalds, peterz, tglx, linux-kernel, hpa, linux-tip-commits

On Thu, Aug 17, 2017 at 12:45:44PM +0200, Ingo Molnar wrote:
> > @@ -1164,9 +1164,6 @@ config LOCKDEP_CROSSRELEASE
> >  
> >  config LOCKDEP_COMPLETE
> >  	bool "Lock debugging: allow completions to use deadlock detector"
> > -	depends on PROVE_LOCKING
> > -	select LOCKDEP_CROSSRELEASE
> > -	default n
> >  	help
> 
> Yeah, so I only noticed this after committing the patches, but this change does 
> not make the option non-interactive. The way to do that is to remove the "" 
> prompt text, i.e. make it a simple 'bool'.
> 
> I'll do that and re-push.

I am sorry for making this harder to get done.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-17  8:57 [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Byungchul Park
                   ` (2 preceding siblings ...)
  2017-08-17 10:21 ` [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING tip-bot for Byungchul Park
@ 2017-08-21 15:46 ` Peter Zijlstra
  2017-08-22  5:14   ` Byungchul Park
  2017-08-22  5:46   ` Dave Chinner
  3 siblings, 2 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-21 15:46 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo



Booting the very latest -tip on my test machine gets me the below splat.

Dave, TJ, FYI, lockdep grew annotations for completions; it remembers
which locks were taken before we complete() and checks none of those are
held while we wait_for_completion().

That is, something like:

					mutex_lock(&A);
					mutex_unlock(&A);
					complete(&C);

	mutex_lock(&A);
	wait_for_completion(&C);

Is now considered a problem. Note that in order for C to link to A it
needs to have observed the complete() thread acquiring it after a
wait_for_completion(), something like:

	wait_for_completion(&C)
					mutex_lock(&A);
					mutex_unlock(&A);
					complete(&C);

That is, only locks observed taken between C's wait_for_completion() and
its complete() are considered.
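
As a minimal, self-contained illustration of that rule (a hypothetical
test module, not code from this thread; only the locking APIs are real):

	#include <linux/module.h>
	#include <linux/mutex.h>
	#include <linux/completion.h>
	#include <linux/workqueue.h>

	static DEFINE_MUTEX(A);
	static DECLARE_COMPLETION(C);

	/* pass 1: A is taken between C's wait and complete: records C -> A */
	static void lock_then_complete(struct work_struct *w)
	{
		mutex_lock(&A);
		mutex_unlock(&A);
		complete(&C);
	}
	static DECLARE_WORK(pass1_work, lock_then_complete);

	/* pass 2: the completer needs no lock at all */
	static void complete_only(struct work_struct *w)
	{
		complete(&C);
	}
	static DECLARE_WORK(pass2_work, complete_only);

	static int __init xrel_demo_init(void)
	{
		schedule_work(&pass1_work);
		wait_for_completion(&C);	/* A not held: no deadlock */

		reinit_completion(&C);
		schedule_work(&pass2_work);
		mutex_lock(&A);			/* A -> C closes the cycle: */
		wait_for_completion(&C);	/* lockdep should now report */
		mutex_unlock(&A);
		return 0;
	}
	module_init(xrel_demo_init);
	MODULE_LICENSE("GPL");

Neither pass can hang on its own, yet together they form the A -> C -> A
circle; and if the pass-1 work happens to finish before the wait even
starts, nothing links C to A and no report fires, which is the observance
rule above in action.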

Now given the above observance rule and the fact that the below report
is from the complete, the thing that happened appears to be:


	lockdep_map_acquire(&work->lockdep_map)
	down_write(&A)

			down_write(&A)
			wait_for_completion(&C)

					lockdep_map_acquire(&work_lockdep_map);
					complete(&C)

Which lockdep then puked over because both sides saw the same work
class.

Byungchul; should we not exclude the work class itself, it appears to me
the workqueue code is explicitly parallel, or am I confused again?


[   38.882358] ======================================================
[   38.889256] WARNING: possible circular locking dependency detected
[   38.896155] 4.13.0-rc6-00609-g645737361ab6-dirty #3 Not tainted
[   38.902752] ------------------------------------------------------
[   38.909650] kworker/0:0/3 is trying to acquire lock:
[   38.915189]  ((&ioend->io_work)){+.+.}, at: [<ffffffff8110ed5f>] process_one_work+0x1ef/0x6b0
[   38.924715] 
[   38.924715] but now in release context of a crosslock acquired at the following:
[   38.934618]  ((complete)&bp->b_iowait){+.+.}, at: [<ffffffff8151ce42>] xfs_buf_submit_wait+0xb2/0x590
[   38.944919] 
[   38.944919] which lock already depends on the new lock.
[   38.944919] 
[   38.954046] 
[   38.954046] the existing dependency chain (in reverse order) is:
[   38.962397] 
[   38.962397] -> #2 ((complete)&bp->b_iowait){+.+.}:
[   38.969401]        __lock_acquire+0x10a1/0x1100
[   38.974460]        lock_acquire+0xea/0x1f0
[   38.979036]        wait_for_completion+0x3b/0x130
[   38.984285]        xfs_buf_submit_wait+0xb2/0x590
[   38.989535]        _xfs_buf_read+0x23/0x30
[   38.994108]        xfs_buf_read_map+0x14f/0x320
[   38.999169]        xfs_trans_read_buf_map+0x170/0x5d0
[   39.004812]        xfs_read_agf+0x97/0x1d0
[   39.009386]        xfs_alloc_read_agf+0x60/0x240
[   39.014541]        xfs_alloc_fix_freelist+0x32a/0x3d0
[   39.020180]        xfs_free_extent_fix_freelist+0x6b/0xa0
[   39.026206]        xfs_free_extent+0x48/0x120
[   39.031068]        xfs_trans_free_extent+0x57/0x200
[   39.036512]        xfs_extent_free_finish_item+0x26/0x40
[   39.042442]        xfs_defer_finish+0x174/0x770
[   39.047494]        xfs_itruncate_extents+0x126/0x470
[   39.053035]        xfs_setattr_size+0x2a1/0x310
[   39.058093]        xfs_vn_setattr_size+0x57/0x160
[   39.063342]        xfs_vn_setattr+0x50/0x70
[   39.068015]        notify_change+0x2ee/0x420
[   39.072785]        do_truncate+0x5d/0x90
[   39.077165]        path_openat+0xab2/0xc00
[   39.081737]        do_filp_open+0x8a/0xf0
[   39.086213]        do_sys_open+0x123/0x200
[   39.090787]        SyS_open+0x1e/0x20
[   39.094876]        entry_SYSCALL_64_fastpath+0x23/0xc2
[   39.100611] 
[   39.100611] -> #1 (&xfs_nondir_ilock_class){++++}:
[   39.107612]        __lock_acquire+0x10a1/0x1100
[   39.112660]        lock_acquire+0xea/0x1f0
[   39.117224]        down_write_nested+0x26/0x60
[   39.122184]        xfs_ilock+0x166/0x220
[   39.126563]        __xfs_setfilesize+0x30/0x200
[   39.131612]        xfs_setfilesize_ioend+0x7f/0xb0
[   39.136958]        xfs_end_io+0x49/0xf0
[   39.141240]        process_one_work+0x273/0x6b0
[   39.146288]        worker_thread+0x48/0x3f0
[   39.150960]        kthread+0x147/0x180
[   39.155146]        ret_from_fork+0x2a/0x40
[   39.159708] 
[   39.159708] -> #0 ((&ioend->io_work)){+.+.}:
[   39.166126]        process_one_work+0x244/0x6b0
[   39.171184]        worker_thread+0x48/0x3f0
[   39.175845]        kthread+0x147/0x180
[   39.180020]        ret_from_fork+0x2a/0x40
[   39.184593]        0xffffffffffffffff
[   39.188677] 
[   39.188677] other info that might help us debug this:
[   39.188677] 
[   39.197611] Chain exists of:
[   39.197611]   (&ioend->io_work) --> &xfs_nondir_ilock_class --> (complete)&bp->b_iowait
[   39.197611] 
[   39.211399]  Possible unsafe locking scenario by crosslock:
[   39.211399] 
[   39.219268]        CPU0                    CPU1
[   39.224321]        ----                    ----
[   39.229377]   lock(&xfs_nondir_ilock_class);
[   39.234135]   lock((complete)&bp->b_iowait);
[   39.238902]                                lock((&ioend->io_work));
[   39.245889]                                unlock((complete)&bp->b_iowait);
[   39.253660] 
[   39.253660]  *** DEADLOCK ***
[   39.253660] 
[   39.260267] 3 locks held by kworker/0:0/3:
[   39.264838]  #0:  ("xfs-buf/%s"mp->m_fsname){++++}, at: [<ffffffff8110ed5f>] process_one_work+0x1ef/0x6b0
[   39.275523]  #1:  ((&bp->b_ioend_work)){+.+.}, at: [<ffffffff8110ed5f>] process_one_work+0x1ef/0x6b0
[   39.285721]  #2:  (&x->wait#17){....}, at: [<ffffffff8113daed>] complete+0x1d/0x60
[   39.294176] 
[   39.294176] stack backtrace:
[   39.299039] CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.13.0-rc6-00609-g645737361ab6-dirty #3
[   39.308749] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[   39.320209] Workqueue: xfs-buf/sdb1 xfs_buf_ioend_work
[   39.325945] Call Trace:
[   39.328677]  dump_stack+0x86/0xcf
[   39.332377]  print_circular_bug+0x203/0x2f0
[   39.337048]  check_prev_add+0x3be/0x700
[   39.341331]  ? print_bfs_bug+0x40/0x40
[   39.345518]  lock_commit_crosslock+0x40d/0x5a0
[   39.350479]  ? lock_commit_crosslock+0x40d/0x5a0
[   39.355633]  ? xfs_buf_ioend_work+0x15/0x20
[   39.360304]  complete+0x29/0x60
[   39.363810]  xfs_buf_ioend+0x15e/0x470
[   39.367987]  xfs_buf_ioend_work+0x15/0x20
[   39.372463]  process_one_work+0x273/0x6b0
[   39.376931]  worker_thread+0x48/0x3f0
[   39.381021]  kthread+0x147/0x180
[   39.384624]  ? process_one_work+0x6b0/0x6b0
[   39.389294]  ? kthread_create_on_node+0x40/0x40
[   39.394351]  ret_from_fork+0x2a/0x40

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-21 15:46 ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
@ 2017-08-22  5:14   ` Byungchul Park
  2017-08-22  7:52     ` Peter Zijlstra
  2017-08-22  5:46   ` Dave Chinner
  1 sibling, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-22  5:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Mon, Aug 21, 2017 at 05:46:00PM +0200, Peter Zijlstra wrote:
> Now given the above observance rule and the fact that the below report
> is from the complete, the thing that happened appears to be:
> 
> 
> 	lockdep_map_acquire(&work->lockdep_map)
> 	down_write(&A)
> 
> 			down_write(&A)
> 			wait_for_completion(&C)
> 
> 					lockdep_map_acquire(&work_lockdep_map);
> 					complete(&C)
> 
> Which lockdep then puked over because both sides saw the same work
> class.
> 
> Byungchul; should we not exclude the work class itself, it appears to me
> the workqueue code is explicitly parallel, or am I confused again?

Do you mean the lockdep_map_acquire(&work->lockdep_map) used manually?

That was introduced by Johannes:

commit 4e6045f134784f4b158b3c0f7a282b04bd816887
"workqueue: debug flushing deadlocks with lockdep"

I am not sure but, for that purpose, IMHO, we can use a
lockdep_map_acquire_read() instead, in process_one_work(), can't we?
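
For reference, that suggestion amounts to something like the following in
process_one_work() (a sketch: the real code acquires a local copy of the
map, so this paraphrases rather than quotes any patch):

	-	lock_map_acquire(&work->lockdep_map);
	+	lock_map_acquire_read(&work->lockdep_map);
		...
		worker->current_func(work);
		...
		lock_map_release(&work->lockdep_map);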

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-21 15:46 ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
  2017-08-22  5:14   ` Byungchul Park
@ 2017-08-22  5:46   ` Dave Chinner
  2017-08-22  9:06     ` Peter Zijlstra
  2017-08-23  1:56     ` Byungchul Park
  1 sibling, 2 replies; 50+ messages in thread
From: Dave Chinner @ 2017-08-22  5:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo

On Mon, Aug 21, 2017 at 05:46:00PM +0200, Peter Zijlstra wrote:
> 
> 
> Booting the very latest -tip on my test machine gets me the below splat.
> 
> Dave, TJ, FYI, lockdep grew annotations for completions; it remembers
> which locks were taken before we complete() and checks none of those are
> held while we wait_for_completion().
> 
> That is, something like:
> 
> 					mutex_lock(&A);
> 					mutex_unlock(&A);
> 					complete(&C);
> 
> 	mutex_lock(&A);
> 	wait_for_completion(&C);

Ok, that seems reasonable...

> Is now considered a problem. Note that in order for C to link to A it
> needs to have observed the complete() thread acquiring it after a
> wait_for_completion(), something like:
> 
> 	wait_for_completion(&C)
> 					mutex_lock(&A);
> 					mutex_unlock(&A);
> 					complete(&C);

Sure.

> 
> That is, only locks observed taken between C's wait_for_completion() and
> its complete() are considered.
> 
> Now given the above observance rule and the fact that the below report
> is from the complete, the thing that happened appears to be:
> 
> 
> 	lockdep_map_acquire(&work->lockdep_map)
> 	down_write(&A)
> 
> 			down_write(&A)
> 			wait_for_completion(&C)
> 
> 					lockdep_map_acquire(&work_lockdep_map);
> 					complete(&C)
> 
> Which lockdep then puked over because both sides saw the same work
> class.

Let me get this straight, first. If we:

	1. take a lock in a work queue context; and
	2. in a separate context, hold that lock while we wait for a
	completion; and
	3. Run the completion from the original workqueue where we
	/once/ held that lock

lockdep will barf?

IIUC, that's a problem because XFS does this all over the place.
Let me pull the trace apart in reverse order to explain why XFS is
going to throw endless false positives on this:

> [   39.159708] -> #0 ((&ioend->io_work)){+.+.}:
> [   39.166126]        process_one_work+0x244/0x6b0
> [   39.171184]        worker_thread+0x48/0x3f0
> [   39.175845]        kthread+0x147/0x180
> [   39.180020]        ret_from_fork+0x2a/0x40
> [   39.184593]        0xffffffffffffffff

That's a data IO completion, which will take an inode lock if we
have to run a transaction to update inode size or convert an
unwritten extent.

> [   39.100611] -> #1 (&xfs_nondir_ilock_class){++++}:
> [   39.107612]        __lock_acquire+0x10a1/0x1100
> [   39.112660]        lock_acquire+0xea/0x1f0
> [   39.117224]        down_write_nested+0x26/0x60
> [   39.122184]        xfs_ilock+0x166/0x220
> [   39.126563]        __xfs_setfilesize+0x30/0x200
> [   39.131612]        xfs_setfilesize_ioend+0x7f/0xb0
> [   39.136958]        xfs_end_io+0x49/0xf0
> [   39.141240]        process_one_work+0x273/0x6b0
> [   39.146288]        worker_thread+0x48/0x3f0
> [   39.150960]        kthread+0x147/0x180
> [   39.155146]        ret_from_fork+0x2a/0x40

That's the data IO completion locking the inode inside a transaction
to update the inode size inside a workqueue.


> [   38.962397] -> #2 ((complete)&bp->b_iowait){+.+.}:
> [   38.969401]        __lock_acquire+0x10a1/0x1100
> [   38.974460]        lock_acquire+0xea/0x1f0
> [   38.979036]        wait_for_completion+0x3b/0x130
> [   38.984285]        xfs_buf_submit_wait+0xb2/0x590
> [   38.989535]        _xfs_buf_read+0x23/0x30
> [   38.994108]        xfs_buf_read_map+0x14f/0x320
> [   38.999169]        xfs_trans_read_buf_map+0x170/0x5d0
> [   39.004812]        xfs_read_agf+0x97/0x1d0
> [   39.009386]        xfs_alloc_read_agf+0x60/0x240
> [   39.014541]        xfs_alloc_fix_freelist+0x32a/0x3d0
> [   39.020180]        xfs_free_extent_fix_freelist+0x6b/0xa0
> [   39.026206]        xfs_free_extent+0x48/0x120
> [   39.031068]        xfs_trans_free_extent+0x57/0x200
> [   39.036512]        xfs_extent_free_finish_item+0x26/0x40
> [   39.042442]        xfs_defer_finish+0x174/0x770
> [   39.047494]        xfs_itruncate_extents+0x126/0x470
> [   39.053035]        xfs_setattr_size+0x2a1/0x310
> [   39.058093]        xfs_vn_setattr_size+0x57/0x160
> [   39.063342]        xfs_vn_setattr+0x50/0x70
> [   39.068015]        notify_change+0x2ee/0x420
> [   39.072785]        do_truncate+0x5d/0x90
> [   39.077165]        path_openat+0xab2/0xc00
> [   39.081737]        do_filp_open+0x8a/0xf0
> [   39.086213]        do_sys_open+0x123/0x200
> [   39.090787]        SyS_open+0x1e/0x20
> [   39.094876]        entry_SYSCALL_64_fastpath+0x23/0xc2

And that's waiting for a metadata read IO to complete during a
truncate transaction. This has no direct connection to the inode at
all - it's an allocation group header that we have to lock to do
block allocation. The completion it is waiting on doesn't even run
through the same workqueue as the ioends - ioends go through
mp->m_data_workqueue or mp->m_unwritten_workqueue, metadata buffers
go through mp->m_buf_workqueue or mp->m_log_workqueue.

So from that perspective, an ioend blocked on an inode lock should
not ever prevent metadata buffer completions from being run. Hence
I'd call this a false positive at best, though I think it really
indicates a bug in the new lockdep code because it isn't
discriminating between different workqueue contexts properly.

Even if I ignore the fact that buffer completions are run on
different workqueues, there seems to be a bigger problem with this
sort of completion checking.

That is, the trace looks plausible because we definitely hold an
inode locked deep inside a truncate operation when the completion
is flagged.  Indeed, some transactions that would flag like this
could be holding up to 5 inodes locked and have tens of other
metadata objects locked. There are potentially tens (maybe even
hundreds) of different paths into this IO wait point, and all have
different combinations of objects locked when it triggers. So
there's massive scope for potential deadlocks....

.... and so we must have some way of avoiding this whole class of
problems that lockdep is unaware of.

Indeed, I don't think we can get stuck here because of the IO
locking rules we have specifically to avoid this sort of issue.
Having IO in flight and running completions during a truncate
operation on the same inode is a Really Bad Thing to be doing. It's
actually a corruption and/or data exposure vector, neither of which
makes users happy, and so filesystems tend to have designed-in
mechanisms for avoiding these problems.

In the case of XFS, it's the data IO serialisation model that
prevents such problems, i.e. the inode locks we hold at this point
in the truncate process (the XFS_IOLOCK, a.k.a. i_rwsem) prevent
new IO from being run, and we don't start the truncate until we've
waited for all in-progress IO to complete.  Hence while the truncate
runs and blocks on metadata IO completions, no data IO can be in
progress on that inode, so there are no completions being run on that
inode in workqueues.

And therefore the IO completion deadlock path reported by lockdep
can not actually be executed during a truncate, and so it's a false
positive.
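
Condensed into code, that serialisation looks roughly like this (a sketch
using generic VFS helpers, not the actual XFS call chain):

	#include <linux/fs.h>
	#include <linux/mm.h>

	/* illustrative only: the ordering, not the real xfs_setattr_size() */
	static void truncate_with_io_barrier(struct inode *inode, loff_t size)
	{
		inode_lock(inode);	/* XFS_IOLOCK a.k.a. i_rwsem: no new data IO */
		inode_dio_wait(inode);	/* drain in-flight direct IO */
		filemap_write_and_wait(inode->i_mapping); /* drain buffered IO */
		/*
		 * From here on, no data IO completion (ioend) for this inode
		 * can be pending in any workqueue, so blocking on metadata
		 * buffer completions below cannot close the reported cycle.
		 */
		truncate_setsize(inode, size);
		inode_unlock(inode);
	}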

Back to the drawing board, I guess....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  5:14   ` Byungchul Park
@ 2017-08-22  7:52     ` Peter Zijlstra
  2017-08-22  8:51       ` Byungchul Park
  0 siblings, 1 reply; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22  7:52 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 02:14:38PM +0900, Byungchul Park wrote:
> On Mon, Aug 21, 2017 at 05:46:00PM +0200, Peter Zijlstra wrote:
> > Now given the above observance rule and the fact that the below report
> > is from the complete, the thing that happened appears to be:
> > 
> > 
> > 	lockdep_map_acquire(&work->lockdep_map)
> > 	down_write(&A)
> > 
> > 			down_write(&A)
> > 			wait_for_completion(&C)
> > 
> > 					lockdep_map_acquire(&work_lockdep_map);
> > 					complete(&C)
> > 
> > Which lockdep then puked over because both sides saw the same work
> > class.
> > 
> > Byungchul; should we not exclude the work class itself, it appears to me
> > the workqueue code is explicitly parallel, or am I confused again?
> 
> Do you mean the lockdep_map_acquire(&work->lockdep_map) used manually?
> 
> That was introduced by Johannes:
> 
> commit 4e6045f134784f4b158b3c0f7a282b04bd816887
> "workqueue: debug flushing deadlocks with lockdep"
> 
> I am not sure but, for that purpose, IMHO, we can use a
> lockdep_map_acquire_read() instead, in process_one_work(), can't we?

That wouldn't work. That annotation is to help find deadlocks like:


	mutex_lock(&A)
				<work>
				mutex_lock(&A)

	flush_work(&work)


The 'fake' lock connects the lock chain inside the work to the
lock-chain at flush_work() and thus detects these. That's perfectly
fine.
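
For reference, the annotation pair in question, roughly as commit
4e6045f13478 added it (paraphrased from kernel/workqueue.c, not an exact
quote):

	/* worker side, in process_one_work(): */
	lock_map_acquire(&work->lockdep_map);
	worker->current_func(work);
	lock_map_release(&work->lockdep_map);

	/* waiter side, in flush_work(): */
	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);
	/* ... then actually wait for the work item to finish */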

It just seems to confuse the completions stuff... Let me go read Dave's
email and see if I can come up with something.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  7:52     ` Peter Zijlstra
@ 2017-08-22  8:51       ` Byungchul Park
  2017-08-22  9:21         ` Peter Zijlstra
  0 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-22  8:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 09:52:38AM +0200, Peter Zijlstra wrote:
> That wouldn't work. That annotation is to help find deadlocks like:
> 
> 
> 	mutex_lock(&A)
> 				<work>
> 				mutex_lock(&A)
> 
> 	flush_work(&work)
> 

I meant:

 	mutex_lock(&A)
 				<work>
 				lockdep_map_acquire_read(&work)
 				mutex_lock(&A)

 	lockdep_map_acquire(&work)
 	flush_work(&work)

I mean it can still be detected with a read acquisition in work.
Am I wrong?

> The 'fake' lock connects the lock chain inside the work to the
> lock-chain at flush_work() and thus detects these. That's perfectly
> fine.
> 
> It just seems to confuse the completions stuff... Let me go read Dave's
> email and see if I can come up with something.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  5:46   ` Dave Chinner
@ 2017-08-22  9:06     ` Peter Zijlstra
  2017-08-22  9:22       ` Byungchul Park
                         ` (2 more replies)
  2017-08-23  1:56     ` Byungchul Park
  1 sibling, 3 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22  9:06 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Tue, Aug 22, 2017 at 03:46:03PM +1000, Dave Chinner wrote:
> On Mon, Aug 21, 2017 at 05:46:00PM +0200, Peter Zijlstra wrote:

> Let me get this straight, first. If we:
> 
> 	1. take a lock in a work queue context; and
> 	2. in a separate context, hold that lock while we wait for a
> 	completion; and
> 	3. Run the completion from the original workqueue where we
> 	/once/ held that lock
> 
> lockdep will barf?
> 
> IIUC, that's a problem because XFS does this all over the place.
> Let me pull the trace apart in reverse order to explain why XFS is
> going to throw endless false positives on this:

Yeah, and I agree that's not right or desired. Just trying to figure out
how to go about fixing that.

Because, with the possible exception of single-threaded workqueues,
there is no actual deadlock there unless it's the very same work
instance. It's the classic instance vs class thing.

And splitting up work classes is going to be a nightmare too.

> > [   39.159708] -> #0 ((&ioend->io_work)){+.+.}:
> > [   39.166126]        process_one_work+0x244/0x6b0
> > [   39.171184]        worker_thread+0x48/0x3f0
> > [   39.175845]        kthread+0x147/0x180
> > [   39.180020]        ret_from_fork+0x2a/0x40
> > [   39.184593]        0xffffffffffffffff
> 
> That's a data IO completion, which will take an inode lock if we
> have to run a transaction to update inode size or convert an
> unwritten extent.
> 
> > [   39.100611] -> #1 (&xfs_nondir_ilock_class){++++}:
> > [   39.107612]        __lock_acquire+0x10a1/0x1100
> > [   39.112660]        lock_acquire+0xea/0x1f0
> > [   39.117224]        down_write_nested+0x26/0x60
> > [   39.122184]        xfs_ilock+0x166/0x220
> > [   39.126563]        __xfs_setfilesize+0x30/0x200
> > [   39.131612]        xfs_setfilesize_ioend+0x7f/0xb0
> > [   39.136958]        xfs_end_io+0x49/0xf0
> > [   39.141240]        process_one_work+0x273/0x6b0
> > [   39.146288]        worker_thread+0x48/0x3f0
> > [   39.150960]        kthread+0x147/0x180
> > [   39.155146]        ret_from_fork+0x2a/0x40
> 
> That's the data IO completion locking the inode inside a transaction
> to update the inode size inside a workqueue.
> 
> 
> > [   38.962397] -> #2 ((complete)&bp->b_iowait){+.+.}:
> > [   38.969401]        __lock_acquire+0x10a1/0x1100
> > [   38.974460]        lock_acquire+0xea/0x1f0
> > [   38.979036]        wait_for_completion+0x3b/0x130
> > [   38.984285]        xfs_buf_submit_wait+0xb2/0x590
> > [   38.989535]        _xfs_buf_read+0x23/0x30
> > [   38.994108]        xfs_buf_read_map+0x14f/0x320
> > [   38.999169]        xfs_trans_read_buf_map+0x170/0x5d0
> > [   39.004812]        xfs_read_agf+0x97/0x1d0
> > [   39.009386]        xfs_alloc_read_agf+0x60/0x240
> > [   39.014541]        xfs_alloc_fix_freelist+0x32a/0x3d0
> > [   39.020180]        xfs_free_extent_fix_freelist+0x6b/0xa0
> > [   39.026206]        xfs_free_extent+0x48/0x120
> > [   39.031068]        xfs_trans_free_extent+0x57/0x200
> > [   39.036512]        xfs_extent_free_finish_item+0x26/0x40
> > [   39.042442]        xfs_defer_finish+0x174/0x770
> > [   39.047494]        xfs_itruncate_extents+0x126/0x470
> > [   39.053035]        xfs_setattr_size+0x2a1/0x310
> > [   39.058093]        xfs_vn_setattr_size+0x57/0x160
> > [   39.063342]        xfs_vn_setattr+0x50/0x70
> > [   39.068015]        notify_change+0x2ee/0x420
> > [   39.072785]        do_truncate+0x5d/0x90
> > [   39.077165]        path_openat+0xab2/0xc00
> > [   39.081737]        do_filp_open+0x8a/0xf0
> > [   39.086213]        do_sys_open+0x123/0x200
> > [   39.090787]        SyS_open+0x1e/0x20
> > [   39.094876]        entry_SYSCALL_64_fastpath+0x23/0xc2
> 
> And that's waiting for a metadata read IO to complete during a
> truncate transaction. This has no direct connection to the inode at
> all - it's an allocation group header that we have to lock to do
> block allocation. The completion it is waiting on doesn't even run
> through the same workqueue as the ioends - ioends go through
> mp->m_data_workqueue or mp->m_unwritten_workqueue, metadata buffers
> go through mp->m_buf_workqueue or mp->m_log_workqueue.
> 
> So from that perspective, an ioend blocked on an inode lock should
> not ever prevent metadata buffer completions from being run. Hence
> I'd call this a false positive at best, though I think it really
> indicates a bug in the new lockdep code because it isn't
> discriminating between different workqueue contexts properly.

Agreed. The interaction with workqueues is buggered.

> Even if I ignore the fact that buffer completions are run on
> different workqueues, there seems to be a bigger problem with this
> sort of completion checking.
> 
> That is, the trace looks plausible because we definitely hold an
> inode locked deep inside a truncate operation when the completion
> is flagged.  Indeed, some transactions that would flag like this
> could be holding up to 5 inodes locked and have tens of other
> metadata objects locked. There are potentially tens (maybe even
> hundreds) of different paths into this IO wait point, and all have
> different combinations of objects locked when it triggers. So
> there's massive scope for potential deadlocks....
> 
> .... and so we must have some way of avoiding this whole class of
> problems that lockdep is unaware of.


So I did the below little hack, which basically wipes the entire lock
history when we start a work and thereby disregards/loses the
dependency on the work 'lock'.

It makes my test box able to boot and build a kernel on XFS, so while I
see what you're saying (I think), it doesn't appear to instantly show.

Should I run xfstests or something to further verify things are OK? Does
that need a scratch partition (I keep forgetting how to run that stuff
:/).

---
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 66011c9f5df3..de91cdce9460 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4756,10 +4756,14 @@ void crossrelease_hist_start(enum xhlock_context_t c)
 {
 	struct task_struct *cur = current;
 
-	if (cur->xhlocks) {
-		cur->xhlock_idx_hist[c] = cur->xhlock_idx;
-		cur->hist_id_save[c] = cur->hist_id;
-	}
+	if (!cur->xhlocks)
+		return;
+
+	if (c == XHLOCK_PROC)
+		invalidate_xhlock(&xhlock(cur->xhlock_idx));
+
+	cur->xhlock_idx_hist[c] = cur->xhlock_idx;
+	cur->hist_id_save[c] = cur->hist_id;
 }
 
 void crossrelease_hist_end(enum xhlock_context_t c)

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  8:51       ` Byungchul Park
@ 2017-08-22  9:21         ` Peter Zijlstra
  2017-08-22  9:33           ` Byungchul Park
  0 siblings, 1 reply; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22  9:21 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 05:51:00PM +0900, Byungchul Park wrote:
> On Tue, Aug 22, 2017 at 09:52:38AM +0200, Peter Zijlstra wrote:
> > That wouldn't work. That annotation is to help find deadlocks like:
> > 
> > 
> > 	mutex_lock(&A)
> > 				<work>
> > 				mutex_lock(&A)
> > 
> > 	flush_work(&work)
> > 
> 
> I meant:
> 
>  	mutex_lock(&A)
>  				<work>
>  				lockdep_map_acquire_read(&work)
>  				mutex_lock(&A)
> 
>  	lockdep_map_acquire(&work)
>  	flush_work(&work)
> 
> I mean it can still be detected with a read acquisition in work.
> Am I wrong?

Think so, although there's something weird with read locks that I keep
forgetting. But I'm not sure it'll actually solve the problem. But I can
try, I suppose.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:06     ` Peter Zijlstra
@ 2017-08-22  9:22       ` Byungchul Park
  2017-08-22  9:37         ` Peter Zijlstra
  2017-08-22 21:19       ` Dave Chinner
  2017-08-23  2:31       ` Byungchul Park
  2 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-22  9:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:
> So I did the below little hack, which basically wipes the entire lock
> history when we start a work and thereby disregards/loses the
> dependency on the work 'lock'.
> 
> It makes my test box able to boot and build a kernel on XFS, so while I
> see what you're saying (I think), it doesn't appear to instantly show.
> 
> Should I run xfstests or something to further verify things are OK? Does
> that need a scratch partition (I keep forgetting how to run that stuff
> :/).
> 
> ---
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 66011c9f5df3..de91cdce9460 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -4756,10 +4756,14 @@ void crossrelease_hist_start(enum xhlock_context_t c)
>  {
>  	struct task_struct *cur = current;
>  
> -	if (cur->xhlocks) {
> -		cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> -		cur->hist_id_save[c] = cur->hist_id;
> -	}
> +	if (!cur->xhlocks)
> +		return;
> +
> +	if (c == XHLOCK_PROC)
> +		invalidate_xhlock(&xhlock(cur->xhlock_idx));

We have to detect dependencies if they exist, even in the following case:

oooooooiiiiiiiiiiiiiiiiiii.........
  |<- range for commit ->|

  where
  o: acquisition outside of each work,
  i: acquisition inside of each work,

With yours, we can never detect dependencies wrt 'o'.

We have to remove false dependencies if we established them incorrectly.

I will also think about it more.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:21         ` Peter Zijlstra
@ 2017-08-22  9:33           ` Byungchul Park
  2017-08-22 10:08             ` Peter Zijlstra
  0 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-22  9:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 11:21:41AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 22, 2017 at 05:51:00PM +0900, Byungchul Park wrote:
> > On Tue, Aug 22, 2017 at 09:52:38AM +0200, Peter Zijlstra wrote:
> > > That wouldn't work. That annotation is to help find deadlocks like:
> > > 
> > > 
> > > 	mutex_lock(&A)
> > > 				<work>
> > > 				mutex_lock(&A)
> > > 
> > > 	flush_work(&work)
> > > 
> > 
> > I meant:
> > 
> >  	mutex_lock(&A)
> >  				<work>
> >  				lockdep_map_acquire_read(&work)
> >  				mutex_lock(&A)
> > 
> >  	lockdep_map_acquire(&work)
> >  	flush_work(&work)
> > 
> > I mean it can still be detected with a read acquisition in work.
> > Am I wrong?
> 
> Think so, although there's something weird with read locks that I keep
> forgetting. But I'm not sure it'll actually solve the problem. But I can

I mean, read acquisitions are nothing but ones that allow other read
acquisitions to be nested legally; IOW, we want to check between
flush_work() and works, not between works. That's why I suggested
using read acquisitions in the work in that case.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:22       ` Byungchul Park
@ 2017-08-22  9:37         ` Peter Zijlstra
  2017-08-22  9:42           ` Peter Zijlstra
  2017-08-23  2:12           ` Byungchul Park
  0 siblings, 2 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22  9:37 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Tue, Aug 22, 2017 at 06:22:36PM +0900, Byungchul Park wrote:
> On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:
> > So I did the below little hack, which basically wipes the entire lock
> > history when we start a work and thereby disregards/loses the
> > dependency on the work 'lock'.
> > 
> > It makes my test box able to boot and build a kernel on XFS, so while I
> > see what you're saying (I think), it doesn't appear to instantly show.
> > 
> > Should I run xfstests or something to further verify things are OK? Does
> > that need a scratch partition (I keep forgetting how to run that stuff
> > :/).
> > 
> > ---
> > diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> > index 66011c9f5df3..de91cdce9460 100644
> > --- a/kernel/locking/lockdep.c
> > +++ b/kernel/locking/lockdep.c
> > @@ -4756,10 +4756,14 @@ void crossrelease_hist_start(enum xhlock_context_t c)
> >  {
> >  	struct task_struct *cur = current;
> >  
> > -	if (cur->xhlocks) {
> > -		cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> > -		cur->hist_id_save[c] = cur->hist_id;
> > -	}
> > +	if (!cur->xhlocks)
> > +		return;
> > +
> > +	if (c == XHLOCK_PROC)
> > +		invalidate_xhlock(&xhlock(cur->xhlock_idx));
> 
> We have to detect dependencies if they exist, even in the following case:
> 
> oooooooiiiiiiiiiiiiiiiiiii.........
>   |<- range for commit ->|
> 
>   where
>   o: acquisition outside of each work,
>   i: acquisition inside of each work,
> 
> With yours, we can never detect dependencies wrt 'o'.

There really shouldn't be any o's when you call
crossrelease_hist_start(XHLOCK_PROC); it should denote the bottom of a
context, see:

  https://lkml.kernel.org/r/20170301104328.GD6515@twins.programming.kicks-ass.net

And in that respect you placed the calls wrongly in process_one_work(),
except it now works out nicely to disregard these fake locks.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:37         ` Peter Zijlstra
@ 2017-08-22  9:42           ` Peter Zijlstra
  2017-08-23  2:12           ` Byungchul Park
  1 sibling, 0 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22  9:42 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng


TJ, afaict the workqueue stuff doesn't in fact use signals, right?

Should we do the below? That is, the only reason it appears to use
INTERRUPTIBLE is to avoid increasing the loadavg, and we've introduced
IDLE for that a while back:

  80ed87c8a9ca ("sched/wait: Introduce TASK_NOLOAD and TASK_IDLE")
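
For reference, that commit defines, in include/linux/sched.h:

	#define TASK_IDLE		(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

so the switch below keeps the sleep uninterruptible without it counting
towards loadavg.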

---
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f128b3becfe1..737a079480e8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2249,7 +2249,7 @@ static int worker_thread(void *__worker)
 	 * event.
 	 */
 	worker_enter_idle(worker);
-	__set_current_state(TASK_INTERRUPTIBLE);
+	__set_current_state(TASK_IDLE);
 	spin_unlock_irq(&pool->lock);
 	schedule();
 	goto woke_up;
@@ -2291,7 +2291,7 @@ static int rescuer_thread(void *__rescuer)
 	 */
 	rescuer->task->flags |= PF_WQ_WORKER;
 repeat:
-	set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_IDLE);
 
 	/*
 	 * By the time the rescuer is requested to stop, the workqueue

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:33           ` Byungchul Park
@ 2017-08-22 10:08             ` Peter Zijlstra
  2017-08-22 13:49               ` Peter Zijlstra
  0 siblings, 1 reply; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22 10:08 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 06:33:37PM +0900, Byungchul Park wrote:
> On Tue, Aug 22, 2017 at 11:21:41AM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 22, 2017 at 05:51:00PM +0900, Byungchul Park wrote:
> > > On Tue, Aug 22, 2017 at 09:52:38AM +0200, Peter Zijlstra wrote:
> > > > That wouldn't work. That annotation is to help find deadlocks like:
> > > > 
> > > > 
> > > > 	mutex_lock(&A)
> > > > 				<work>
> > > > 				mutex_lock(&A)
> > > > 
> > > > 	flush_work(&work)
> > > > 
> > > 
> > > I meant:
> > > 
> > >  	mutex_lock(&A)
> > >  				<work>
> > >  				lockdep_map_acquire_read(&work)
> > >  				mutex_lock(&A)
> > > 
> > >  	lockdep_map_acquire(&work)
> > >  	flush_work(&work)
> > > 
> > > I mean it can still be detected with a read acquisition in work.
> > > Am I wrong?
> > 
> > Think so, although there's something weird with read locks that I keep
> > forgetting. But I'm not sure it'll actually solve the problem. But I can
> 
> I mean, read acquisitions are nothing but ones that allow other read
> acquisitions to be nested legally; IOW, we want to check between
> flush_work() and works, not between works. That's why I suggested
> using read acquisitions in the work in that case.

Does seem to work.

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 10:08             ` Peter Zijlstra
@ 2017-08-22 13:49               ` Peter Zijlstra
  2017-08-22 14:46                 ` Peter Zijlstra
  2017-08-23  2:43                 ` Byungchul Park
  0 siblings, 2 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22 13:49 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 12:08:40PM +0200, Peter Zijlstra wrote:

> > > > I meant:
> > > > 
> > > >  	mutex_lock(&A)
> > > >  				<work>
> > > >  				lockdep_map_acquire_read(&work)
> > > >  				mutex_lock(&A)
> > > > 
> > > >  	lockdep_map_acquire(&work)
> > > >  	flush_work(&work)
> > > > 
> > > > I mean it can still be detected with a read acquisition in work.
> > > > Am I wrong?
> > > 
> > > Think so, although there's something weird with read locks that I keep
> > > forgetting. But I'm not sure it'll actually solve the problem. But I can
> > 
> > I mean, read acquisitions are nothing but ones allowing read ones to be
> > re-acquired legally, IOW, we want to check entrance of flush_work() and
> > works, not between works. That's why I suggested to use read ones in work
> > in that case.
> 
> Does seem to work.

So I think we'll end up hitting a lockdep deficiency and not trigger the
splat on flush_work(), see also:

  https://lwn.net/Articles/332801/

lock_map_acquire_read() is read-recursive and will not in fact create
any dependencies because of this issue.
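
For reference, the numbers in parentheses below are each acquisition's
'read' state, i.e. the 'read' argument to lock_acquire(); the wrappers
involved expand roughly as follows in include/linux/lockdep.h:

	/* read: 0 = exclusive, 1 = shared, 2 = shared-recursive */
	#define lock_map_acquire(l)	  lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
	#define lock_map_acquire_read(l)  lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)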

Specifically, check_prev_add() has:

	if (next->read == 2 || prev->read == 2)
		return 1;

This means that for:

	lock_map_acquire_read(W)(2)
	down_write(A)		(0)

			down_write(A)		(0)
			wait_for_completion(C)	(0)

					lock_map_acquire_read(W)(2)
					complete(C)		(0)

All the (2) effectively go away and 'solve' our current issue, but:

	lock_map_acquire_read(W)(2)
	mutex_lock(A)		(0)

			mutex_lock(A)		(0)
			lock_map_acquire(W)	(0)

as done by flush_work() will not in fact trigger anymore either.
See also the locking-selftest changes below.


Now, this means I also have to consider the existing
lock_map_acquire_read() users and whether they really wanted to be recursive
or not. When I change lock_map_acquire_read() to use
lock_acquire_shared() this annotation no longer suffices and the splat
comes back.


Also, the acquire_read() annotation will (obviously) no longer work to
cure this problem when we switch to normal read (1), because then the
generated chain:

	W(1) -> A(0) -> C(0) -> W(1)

spells deadlock, since W isn't allowed to recurse.


/me goes dig through commit:

  e159489baa71 ("workqueue: relax lockdep annotation on flush_work()")

to figure out wth the existing users really want.


[    0.000000] ----------------------------------------------------------------------------
[    0.000000]                                  | spin |wlock |rlock |mutex | wsem | rsem |
[    0.000000]   --------------------------------------------------------------------------

[    0.000000]   --------------------------------------------------------------------------
[    0.000000]               recursive read-lock:             |  ok  |             |  ok  |
[    0.000000]            recursive read-lock #2:             |  ok  |             |  ok  |
[    0.000000]             mixed read-write-lock:             |  ok  |             |  ok  |
[    0.000000]             mixed write-read-lock:             |  ok  |             |  ok  |
[    0.000000]   mixed read-lock/lock-write ABBA:             |FAILED|             |  ok  |
[    0.000000]    mixed read-lock/lock-read ABBA:             |  ok  |             |  ok  |
[    0.000000]  mixed write-lock/lock-write ABBA:             |  ok  |             |  ok  |
[    0.000000]   --------------------------------------------------------------------------
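
For reference, the selftest macros used in the patch below expand roughly
as follows, per lib/locking-selftest.c (the *U counterparts are the
matching unlocks):

	#define L(x)	raw_spin_lock(&lock_##x)
	#define RL(x)	read_lock(&rwlock_##x)
	#define WL(x)	write_lock(&rwlock_##x)
	#define ML(x)	mutex_lock(&mutex_##x)
	#define RSL(x)	down_read(&rwsem_##x)
	#define WSL(x)	down_write(&rwsem_##x)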

---
 lib/locking-selftest.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 116 insertions(+), 1 deletion(-)

diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 6f2b135dc5e8..b99d365cf399 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -363,6 +363,103 @@ static void rsem_AA3(void)
 }
 
 /*
+ * read_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		write_lock(A)
+ */
+static void rlock_ABBA1(void)
+{
+	RL(X1);
+	L(Y1);
+	U(Y1);
+	RU(X1);
+
+	L(Y1);
+	WL(X1);
+	WU(X1);
+	U(Y1); // should fail
+}
+
+static void rwsem_ABBA1(void)
+{
+	RSL(X1);
+	ML(Y1);
+	MU(Y1);
+	RSU(X1);
+
+	ML(Y1);
+	WSL(X1);
+	WSU(X1);
+	MU(Y1); // should fail
+}
+
+/*
+ * read_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		read_lock(A)
+ */
+static void rlock_ABBA2(void)
+{
+	RL(X1);
+	L(Y1);
+	U(Y1);
+	RU(X1);
+
+	L(Y1);
+	RL(X1);
+	RU(X1);
+	U(Y1); // should NOT fail
+}
+
+static void rwsem_ABBA2(void)
+{
+	RSL(X1);
+	ML(Y1);
+	MU(Y1);
+	RSU(X1);
+
+	ML(Y1);
+	RSL(X1);
+	RSU(X1);
+	MU(Y1); // should fail
+}
+
+
+/*
+ * write_lock(A)
+ * spin_lock(B)
+ *		spin_lock(B)
+ *		write_lock(A)
+ */
+static void rlock_ABBA3(void)
+{
+	WL(X1);
+	L(Y1);
+	U(Y1);
+	WU(X1);
+
+	L(Y1);
+	WL(X1);
+	WU(X1);
+	U(Y1); // should fail
+}
+
+static void rwsem_ABBA3(void)
+{
+	WSL(X1);
+	ML(Y1);
+	MU(Y1);
+	WSU(X1);
+
+	ML(Y1);
+	WSL(X1);
+	WSU(X1);
+	MU(Y1); // should fail
+}
+
+/*
  * ABBA deadlock:
  */
 
@@ -1057,7 +1154,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
 		unexpected_testcase_failures++;
 		pr_cont("FAILED|");
 
-		dump_stack();
+//		dump_stack();
 	} else {
 		testcase_successes++;
 		pr_cont("  ok  |");
@@ -1933,6 +2030,24 @@ void locking_selftest(void)
 	dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
 	pr_cont("\n");
 
+	print_testname("mixed read-lock/lock-write ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM);
+
+	print_testname("mixed read-lock/lock-read ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM);
+
+	print_testname("mixed write-lock/lock-write ABBA");
+	pr_cont("             |");
+	dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK);
+	pr_cont("             |");
+	dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM);
+
 	printk("  --------------------------------------------------------------------------\n");
 
 	/*

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 13:49               ` Peter Zijlstra
@ 2017-08-22 14:46                 ` Peter Zijlstra
  2017-08-22 15:10                   ` Peter Zijlstra
                                     ` (2 more replies)
  2017-08-23  2:43                 ` Byungchul Park
  1 sibling, 3 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22 14:46 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes, Oleg Nesterov

On Tue, Aug 22, 2017 at 03:49:22PM +0200, Peter Zijlstra wrote:
> Now, this means I also have to consider the existing
> lock_map_acquire_read() users and if they really wanted to be recursive
> or not. When I change lock_map_acquire_read() to use
> lock_acquire_shared() this annotation no longer suffices and the splat
> comes back.
> 
> 
> Also, the acquire_read() annotation will (obviously) no longer work to
> cure this problem when we switch to normal read (1), because then the
> generated chain:
> 
> 	W(1) -> A(0) -> C(0) -> W(1)
> 
> spells deadlock, since W isn't allowed to recurse.
> 
> 
> /me goes dig through commit:
> 
>   e159489baa71 ("workqueue: relax lockdep annotation on flush_work()")
> 
> to figure out wth the existing users really want.

Yep, they really want the recursion; the pattern there is one work flushing
another on the same workqueue, which ends up being (A/R = acquire/release
of the named lockdep_map, AR = recursive read acquire, M = mutex_lock/unlock):

Work W1:		Work W2:	Task:

AR(Q)			AR(Q)		M(A)
A(W1)			A(W2)		flush_workqueue(Q)
  flush_work(W2)	  M(A)		  A(Q)
    A(W2)		R(W2)		  R(Q)
    R(W2)		R(Q)
    AR(Q)
    R(Q)
R(W1)
R(Q)

should spell deadlock (AQ-QA), and W1 takes Q recursively.
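
A minimal C sketch of that pattern (names hypothetical; both works queued
on the same workqueue Q):

	static DEFINE_MUTEX(A);
	static struct workqueue_struct *Q;
	static struct work_struct w1, w2;	/* both queued on Q */

	static void w2_func(struct work_struct *work)
	{
		mutex_lock(&A);		/* blocks: the task below holds A */
		mutex_unlock(&A);
	}

	static void w1_func(struct work_struct *work)
	{
		flush_work(&w2);	/* one work flushing another on the same queue */
	}

	static void task_path(void)
	{
		mutex_lock(&A);
		flush_workqueue(Q);	/* waits for W1/W2, which wait for A: AQ-QA */
		mutex_unlock(&A);
	}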

I am, however, slightly puzzled by the need for flush_work() to take Q;
what deadlock potential is there?

Task:			Work-W1:	Work-W2:

M(A)			AR(Q)		AR(Q)
flush_work(W1)		A(W1)		A(W2)
 A(W1)					  M(A)
 R(W1)
 AR(Q)
 R(Q)

Spells deadlock on AQ-QA, but why? Why is flush_work() linked to any lock
taken inside random other works? If we can get rid of flush_work()'s
usage of Q, we can drop the recursive nature.

It was added by Oleg in commit:

  a67da70dc095 ("workqueues: lockdep annotations for flush_work()")

Which has a distinct lack of Changelog. However, that is still very much
the old workqueue code, where I think the annotation makes sense because
that was a single thread running the works consecutively. But I don't
see it making sense for the current workqueue that runs works
concurrently.

TJ, Oleg, can we agree flush_work() no longer needs the dependency on Q?


Also, TJ, what protects @pwq in start_flush_work() at the time of
lock_map_*() ?


Also^2, TJ, what's the purpose of using atomic_long_t for work->data?
All it ever seems to do is atomic_long_read() and atomic_long_set(),
neither of which provides anything stronger than
READ_ONCE()/WRITE_ONCE() respectively.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 14:46                 ` Peter Zijlstra
@ 2017-08-22 15:10                   ` Peter Zijlstra
  2017-08-22 15:59                   ` Oleg Nesterov
  2017-08-23 16:39                   ` Oleg Nesterov
  2 siblings, 0 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22 15:10 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes, Oleg Nesterov

On Tue, Aug 22, 2017 at 04:46:02PM +0200, Peter Zijlstra wrote:
> I am, however, slightly puzzled by the need for flush_work() to take Q;
> what deadlock potential is there?
> 
> Task:			Work-W1:	Work-W2:
> 
> M(A)			AR(Q)		AR(Q)
> flush_work(W1)		A(W1)		A(W2)
>  A(W1)					  M(A)
>  R(W1)
>  AR(Q)
>  R(Q)
> 
> Spells deadlock on AQ-QA, but why? Why is flush_work() linked to any lock
> taken inside random other works? If we can get rid of flush_work()'s
> usage of Q, we can drop the recursive nature.
> 
> It was added by Oleg in commit:
> 
>   a67da70dc095 ("workqueues: lockdep annotations for flush_work()")
> 
> Which has a distinct lack of Changelog. However, that is still very much
> the old workqueue code, where I think the annotation makes sense because
> that was a single thread running the works consecutively. But I don't
> see it making sense for the current workqueue that runs works
> concurrently.
> 
> TJ, Oleg, can we agree flush_work() no longer needs the dependency on Q?

So we still need it in case of max_active==1 and that rescuer thing, and
then we still need to support calling it from a work, which then
recurses. How about the below, its a bit icky, but should work (boots
and builds a kernel).

---
 kernel/workqueue.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e86733a8b344..c37b761f60b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2091,7 +2091,7 @@ __acquires(&pool->lock)
 
 	spin_unlock_irq(&pool->lock);
 
-	lock_map_acquire_read(&pwq->wq->lockdep_map);
+	lock_map_acquire(&pwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	crossrelease_hist_start(XHLOCK_PROC);
 	trace_workqueue_execute_start(work);
@@ -2783,6 +2783,32 @@ void drain_workqueue(struct workqueue_struct *wq)
 }
 EXPORT_SYMBOL_GPL(drain_workqueue);
 
+static bool need_wq_lock(struct pool_workqueue *pwq)
+{
+	struct worker *worker = current_wq_worker();
+
+	/*
+	 * If current is running a work of the same pwq, we already hold
+	 * pwq->wq->lockdep_map, no need to take it again.
+	 */
+	if (worker && worker->current_pwq == pwq)
+		return false;
+
+	/*
+	 * If @max_active is 1 or rescuer is in use, flushing another work
+	 * item on the same workqueue may lead to deadlock.  Make sure the
+	 * flusher is not running on the same workqueue by verifying write
+	 * access.
+	 */
+	if (pwq->wq->saved_max_active == 1)
+		return true;
+
+	if (pwq->wq->rescuer)
+		return true;
+
+	return false;
+}
+
 static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 {
 	struct worker *worker = NULL;
@@ -2816,17 +2842,10 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
 	insert_wq_barrier(pwq, barr, work, worker);
 	spin_unlock_irq(&pool->lock);
 
-	/*
-	 * If @max_active is 1 or rescuer is in use, flushing another work
-	 * item on the same workqueue may lead to deadlock.  Make sure the
-	 * flusher is not running on the same workqueue by verifying write
-	 * access.
-	 */
-	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
+	if (need_wq_lock(pwq)) {
 		lock_map_acquire(&pwq->wq->lockdep_map);
-	else
-		lock_map_acquire_read(&pwq->wq->lockdep_map);
-	lock_map_release(&pwq->wq->lockdep_map);
+		lock_map_release(&pwq->wq->lockdep_map);
+	}
 
 	return true;
 already_gone:

^ permalink raw reply related	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 14:46                 ` Peter Zijlstra
  2017-08-22 15:10                   ` Peter Zijlstra
@ 2017-08-22 15:59                   ` Oleg Nesterov
  2017-08-22 16:35                     ` Peter Zijlstra
  2017-08-23 16:39                   ` Oleg Nesterov
  2 siblings, 1 reply; 50+ messages in thread
From: Oleg Nesterov @ 2017-08-22 15:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

Peter, I'll read your email tomorrow, just one note...

On 08/22, Peter Zijlstra wrote:
>
> Also^2, TJ, what's the purpose of using atomic_long_t for work->data?
> All it ever seems to do is atomic_long_read() and atomic_long_set(),

plus set/clear bit, for example

	test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work));
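
For context, work_data_bits() just reinterprets the atomic_long_t as an
unsigned long for those bitops; from include/linux/workqueue.h:

	#define work_data_bits(work) ((unsigned long *)(&(work)->data))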

commit a08727bae727fc2ca3a6ee9506d77786b71070b3
Author: Linus Torvalds <torvalds@woody.osdl.org>
Date:   Sat Dec 16 09:53:50 2006 -0800

    Make workqueue bit operations work on "atomic_long_t"

    On architectures where the atomicity of the bit operations is handled by
    external means (ie a separate spinlock to protect concurrent accesses),
    just doing a direct assignment on the workqueue data field (as done by
    commit 4594bf159f1962cec3b727954b7c598b07e2e737) can cause the
    assignment to be lost due to lack of serialization with the bitops on
    the same word.

    So we need to serialize the assignment with the locks on those
    architectures (notably older ARM chips, PA-RISC and sparc32).

    So rather than using an "unsigned long", let's use "atomic_long_t",
    which already has a safe assignment operation (atomic_long_set()) on
    such architectures.

    This requires that the atomic operations use the same atomicity locks as
    the bit operations do, but that is largely the case anyway.  Sparc32
    will probably need fixing.

    Architectures (including modern ARM with LL/SC) that implement sane
    atomic operations for SMP won't see any of this matter.

Oleg.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 15:59                   ` Oleg Nesterov
@ 2017-08-22 16:35                     ` Peter Zijlstra
  0 siblings, 0 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-22 16:35 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 05:59:06PM +0200, Oleg Nesterov wrote:
> Peter, I'll read your email tomorrow, just one note...
> 
> On 08/22, Peter Zijlstra wrote:
> >
> > Also^2, TJ, what's the purpose of using atomic_long_t for work->data?
> > All it ever seems to do is atomic_long_read() and atomic_long_set(),
> 
> plus set/clear bit, for example
> 
> 	test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work));

Ooh, that's unintuitive. That could certainly use a comment.
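
Something along these lines would do, going by the commit message above
(only a sketch):

	/*
	 * work->data is atomic_long_t, not because of atomic_long_read()
	 * and atomic_long_set(), but because it is also modified with
	 * bitops (e.g. test_and_set_bit(WORK_STRUCT_PENDING_BIT)).  On
	 * architectures where atomicity is provided by external locks, a
	 * plain assignment could be lost against a concurrent bitop, so
	 * atomic_long_set() must be used; see commit a08727bae727.
	 */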

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:06     ` Peter Zijlstra
  2017-08-22  9:22       ` Byungchul Park
@ 2017-08-22 21:19       ` Dave Chinner
  2017-08-23  2:31       ` Byungchul Park
  2 siblings, 0 replies; 50+ messages in thread
From: Dave Chinner @ 2017-08-22 21:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 22, 2017 at 03:46:03PM +1000, Dave Chinner wrote:
> > Even if I ignore the fact that buffer completions are run on
> > different workqueues, there seems to be a bigger problem with this
> > sort of completion checking.
> > 
> > That is, the trace looks plausible because we are definitely hold an
> > inode locked deep inside a truncate operation where the completion
> > if flagged.  Indeed, some transactions that would flag like this
> > could be holding up to 5 inodes locked and have tens of other
> > metadata objects locked. There are potentially tens (maybe even
> > hundreds) of different paths into this IO wait point, and all have
> > different combinations of objects locked when it triggers. So
> > there's massive scope for potential deadlocks....
> > 
> > .... and so we must have some way of avoiding this whole class of
> > problems that lockdep is unaware of.
> 
> So I did the below little hack, which basically wipes the entire lock
> history when we start a work and thereby disregards/looses the
> dependency on the work 'lock'.

Ok, so now it treats workqueue worker threads like any other
process?

> It makes my test box able to boot and build a kernel on XFS, so while I
> see what you're saying (I think), it doesn't appear to instantly show.
> 
> Should I run xfstests or something to further verify things are OK? Does
> that need a scratch partition (I keep forgetting how to run that stuff
> :/).

A couple of 4-8GB ramdisks/fake pmem regions is all you need. Put
this in the configs/<hostname>.config file, modifying the devices
to suit:

[xfs]
FSTYP=xfs
TEST_DIR=/mnt/test
TEST_DEV=/dev/pmem0
SCRATCH_MNT=/mnt/scratch
SCRATCH_DEV=/dev/pmem1

and run "./check -s xfs -g auto" from the root of the xfstests
source tree.
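
(If you don't have real pmem, fake regions can be carved out of RAM on
x86 with the memmap= boot parameter; something like:

	memmap=8G!16G memmap=8G!24G

reserves two 8G ranges, at 16G and 24G physical, as emulated pmem.)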

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  5:46   ` Dave Chinner
  2017-08-22  9:06     ` Peter Zijlstra
@ 2017-08-23  1:56     ` Byungchul Park
  1 sibling, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  1:56 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Peter Zijlstra, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo

On Tue, Aug 22, 2017 at 03:46:03PM +1000, Dave Chinner wrote:
> On Mon, Aug 21, 2017 at 05:46:00PM +0200, Peter Zijlstra wrote:
> > 
> > 
> > Booting the very latest -tip on my test machine gets me the below splat.
> > 
> > Dave, TJ, FYI, lockdep grew annotations for completions; it remembers
> > which locks were taken before we complete() and checks none of those are
> > held while we wait_for_completion().
> > 
> > That is, something like:
> > 
> > 					mutex_lock(&A);
> > 					mutex_unlock(&A);
> > 					complete(&C);
> > 
> > 	mutex_lock(&A);
> > 	wait_for_completion(&C);
> 
> Ok, that seems reasonable...
> 
> > Is now considered a problem. Note that in order for C to link to A it
> > needs to have observed the complete() thread acquiring it after a
> > wait_for_completion(), something like:
> > 
> > 	wait_for_completion(&C)
> > 					mutex_lock(&A);
> > 					mutex_unlock(&A);
> > 					complete(&C);
> 
> Sure.
> 
> > 
> > That is, only locks observed taken between C's wait_for_completion() and
> > it's complete() are considered.
> > 
> > Now given the above observance rule and the fact that the below report
> > is from the complete, the thing that happened appears to be:
> > 
> > 
> > 	lockdep_map_acquire(&work->lockdep_map)
> > 	down_write(&A)
> > 
> > 			down_write(&A)
> > 			wait_for_completion(&C)
> > 
> > 					lockdep_map_acquire(&work_lockdep_map);
> > 					complete(&C)
> > 
> > Which lockdep then puked over because both sides saw the same work
> > class.
> 
> Let me get this straight, first. If we:
> 
> 	1. take a lock in a work queue context; and
> 	2. in a separate context, hold that lock while we wait for a
> 	completion; and
> 	3. Run the completion from the original workqueue where we
> 	/once/ held that lock
> 
> lockdep will barf?

Do you mean the following?

For example:

A work in a workqueue
---------------------
mutex_lock(&A)

A task
------
mutex_lock(&A)
wait_for_completion(&B)

Another work in the workqueue
-----------------------------
complete(&B)

Do you mean this?

In this case, lockdep, including crossrelease, does not barf if all the
manual lockdep_map acquisitions are used correctly.

> IIUC, that's a problem because XFS does this all over the place.
> Let me pull the trace apart in reverse order to explain why XFS is
> going to throw endless false positives on this:
> 
> > [   39.159708] -> #0 ((&ioend->io_work)){+.+.}:
> > [   39.166126]        process_one_work+0x244/0x6b0
> > [   39.171184]        worker_thread+0x48/0x3f0
> > [   39.175845]        kthread+0x147/0x180
> > [   39.180020]        ret_from_fork+0x2a/0x40
> > [   39.184593]        0xffffffffffffffff
> 
> That's a data IO completion, which will take an inode lock if we
> have to run a transaction to update inode size or convert an
> unwritten extent.
> 
> > [   39.100611] -> #1 (&xfs_nondir_ilock_class){++++}:
> > [   39.107612]        __lock_acquire+0x10a1/0x1100
> > [   39.112660]        lock_acquire+0xea/0x1f0
> > [   39.117224]        down_write_nested+0x26/0x60
> > [   39.122184]        xfs_ilock+0x166/0x220
> > [   39.126563]        __xfs_setfilesize+0x30/0x200
> > [   39.131612]        xfs_setfilesize_ioend+0x7f/0xb0
> > [   39.136958]        xfs_end_io+0x49/0xf0
> > [   39.141240]        process_one_work+0x273/0x6b0
> > [   39.146288]        worker_thread+0x48/0x3f0
> > [   39.150960]        kthread+0x147/0x180
> > [   39.155146]        ret_from_fork+0x2a/0x40
> 
> That's the data IO completion locking the inode inside a transaction
> to update the inode size inside a workqueue.
> 
> 
> > [   38.962397] -> #2 ((complete)&bp->b_iowait){+.+.}:
> > [   38.969401]        __lock_acquire+0x10a1/0x1100
> > [   38.974460]        lock_acquire+0xea/0x1f0
> > [   38.979036]        wait_for_completion+0x3b/0x130
> > [   38.984285]        xfs_buf_submit_wait+0xb2/0x590
> > [   38.989535]        _xfs_buf_read+0x23/0x30
> > [   38.994108]        xfs_buf_read_map+0x14f/0x320
> > [   38.999169]        xfs_trans_read_buf_map+0x170/0x5d0
> > [   39.004812]        xfs_read_agf+0x97/0x1d0
> > [   39.009386]        xfs_alloc_read_agf+0x60/0x240
> > [   39.014541]        xfs_alloc_fix_freelist+0x32a/0x3d0
> > [   39.020180]        xfs_free_extent_fix_freelist+0x6b/0xa0
> > [   39.026206]        xfs_free_extent+0x48/0x120
> > [   39.031068]        xfs_trans_free_extent+0x57/0x200
> > [   39.036512]        xfs_extent_free_finish_item+0x26/0x40
> > [   39.042442]        xfs_defer_finish+0x174/0x770
> > [   39.047494]        xfs_itruncate_extents+0x126/0x470
> > [   39.053035]        xfs_setattr_size+0x2a1/0x310
> > [   39.058093]        xfs_vn_setattr_size+0x57/0x160
> > [   39.063342]        xfs_vn_setattr+0x50/0x70
> > [   39.068015]        notify_change+0x2ee/0x420
> > [   39.072785]        do_truncate+0x5d/0x90
> > [   39.077165]        path_openat+0xab2/0xc00
> > [   39.081737]        do_filp_open+0x8a/0xf0
> > [   39.086213]        do_sys_open+0x123/0x200
> > [   39.090787]        SyS_open+0x1e/0x20
> > [   39.094876]        entry_SYSCALL_64_fastpath+0x23/0xc2
> 
> And that's waiting for a metadata read IO to complete during a
> truncate transaction. This has no direct connection to the inode at
> all - it's a allocation group header that we have to lock to do
> block allocation. The completion it is waiting on doesn't even run
> through the same workqueue as the ioends - ioends go through
> mp->m_data_workqueue or mp->m_unwritten_workqueue, metadata buffers
> go through mp->m_buf_workqueue or mp->m_log_workqueue.

In this case, the lockdep_map class for the former and the one for the
latter must be different from each other. Not only the workqueue classes
but also the work classes must differ, while currently they can be the
same; IMHO, that's wrong. That is what we should fix.

Or, if the workqueue code intends this, the xfs code should be fixed so
that different works, not only different workqueues, are used for
different jobs.

> So from that perspective, an ioend blocked on an inode lock should
> not ever prevent metadata buffer completions from being run. Hence
> I'd call this a false positive at best, though I think it really
> indicates a bug in the new lockdep code because it isn't

IMHO, it's not a bug in lockdep but in the lockdep_map_acquire users.

> discriminating between different workqueue contexts properly.

We have to choose one of the following:

   1. Make the lock classes of works in different workqueues different
   2. Make xfs use different works, and different workqueues, for
      different jobs

IMHO, we should choose 1, but I'm not sure since I'm not familiar with
the workqueue code.

> Even if I ignore the fact that buffer completions are run on
> different workqueues, there seems to be a bigger problem with this
> sort of completion checking.

It does not work the way you're concerned it does.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:37         ` Peter Zijlstra
  2017-08-22  9:42           ` Peter Zijlstra
@ 2017-08-23  2:12           ` Byungchul Park
  2017-08-23  6:03             ` Byungchul Park
  2017-08-23 10:20             ` Peter Zijlstra
  1 sibling, 2 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  2:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Tue, Aug 22, 2017 at 11:37:03AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 22, 2017 at 06:22:36PM +0900, Byungchul Park wrote:
> > On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:
> > > So I did the below little hack, which basically wipes the entire lock
> > > history when we start a work and thereby disregards/looses the
> > > dependency on the work 'lock'.
> > > 
> > > It makes my test box able to boot and build a kernel on XFS, so while I
> > > see what you're saying (I think), it doesn't appear to instantly show.
> > > 
> > > Should I run xfstests or something to further verify things are OK? Does
> > > that need a scratch partition (I keep forgetting how to run that stuff
> > > :/).
> > > 
> > > ---
> > > diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> > > index 66011c9f5df3..de91cdce9460 100644
> > > --- a/kernel/locking/lockdep.c
> > > +++ b/kernel/locking/lockdep.c
> > > @@ -4756,10 +4756,14 @@ void crossrelease_hist_start(enum xhlock_context_t c)
> > >  {
> > >  	struct task_struct *cur = current;
> > >  
> > > -	if (cur->xhlocks) {
> > > -		cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> > > -		cur->hist_id_save[c] = cur->hist_id;
> > > -	}
> > > +	if (!cur->xhlocks)
> > > +		return;
> > > +
> > > +	if (c == XHLOCK_PROC)
> > > +		invalidate_xhlock(&xhlock(cur->xhlock_idx));
> > 
> > We have to detect dependecies if it exists, even in the following case:
> > 
> > oooooooiiiiiiiiiiiiiiiiiii.........
> >   |<- range for commit ->|
> > 
> >   where
> >   o: acquisition outside of each work,
> >   i: acquisition inside of each work,
> > 
> > With yours, we can never detect dependecies wrt 'o'.
> 
> There really shouldn't be any o's when you call

There can be any o's.

> crossrelease_hist_start(XHLOCK_PROC), it should denote the bottom of a

No, I don't think so. It can be either the bottom or not.

hist_start() and hist_end() are only for special contexts that need to be
rolled back on exit, e.g. irq, work and so on. Normal kernel context
should work well w/o hist_start() or hist_end().
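
For reference, the context types in question, roughly as defined in this
series in include/linux/lockdep.h:

	enum xhlock_context_t {
		XHLOCK_HARD,	/* hardirq context */
		XHLOCK_SOFT,	/* softirq context */
		XHLOCK_PROC,	/* process context: works, syscalls, ... */
		XHLOCK_CTX_NR,
	};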

> context, see:
> 
>   https://lkml.kernel.org/r/20170301104328.GD6515@twins.programming.kicks-ass.net

Actually, I don't agree with that.

> And in that respect you placed the calls wrongly in process_one_work(),

Why is it wrong? It's intended. Could you tell me why?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22  9:06     ` Peter Zijlstra
  2017-08-22  9:22       ` Byungchul Park
  2017-08-22 21:19       ` Dave Chinner
@ 2017-08-23  2:31       ` Byungchul Park
  2017-08-23  6:11         ` Byungchul Park
  2017-08-23 10:46         ` Peter Zijlstra
  2 siblings, 2 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  2:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 22, 2017 at 03:46:03PM +1000, Dave Chinner wrote:
> > On Mon, Aug 21, 2017 at 05:46:00PM +0200, Peter Zijlstra wrote:
> 
> > Let me get this straight, first. If we:
> > 
> > 	1. take a lock in a work queue context; and
> > 	2. in a separate context, hold that lock while we wait for a
> > 	completion; and
> > 	3. Run the completion from the original workqueue where we
> > 	/once/ held that lock
> > 
> > lockdep will barf?
> > 
> > IIUC, that's a problem because XFS does this all over the place.
> > Let me pull the trace apart in reverse order to explain why XFS is
> > going to throw endless false positives on this:
> 
> Yeah, and I agree that's not right or desired. Just trying to figure out
> how to go about fixing that.
> 
> Because, with possible exception for the single threaded workqueues,
> there is no actual deadlock there unless its the very same work
> instance. Its the classic instance vs class thing.

Currently, we do the following in process_one_work(),

lockdep_map_acquire for a workqueue
lockdep_map_acquire for a work

But IMHO it should be,

lockdep_map_acquire for a pair of workqueue and work.

Right?
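
For context, the relevant part of process_one_work() currently looks
roughly like:

	lock_map_acquire_read(&pwq->wq->lockdep_map);	/* the workqueue's map */
	lock_map_acquire(&lockdep_map);			/* the work's map */
	crossrelease_hist_start(XHLOCK_PROC);

	worker->current_func(work);

	crossrelease_hist_end(XHLOCK_PROC);
	lock_map_release(&lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);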

> And splitting up work classes is going to be a nightmare too.
> 
> > > [   39.159708] -> #0 ((&ioend->io_work)){+.+.}:
> > > [   39.166126]        process_one_work+0x244/0x6b0
> > > [   39.171184]        worker_thread+0x48/0x3f0
> > > [   39.175845]        kthread+0x147/0x180
> > > [   39.180020]        ret_from_fork+0x2a/0x40
> > > [   39.184593]        0xffffffffffffffff
> > 
> > That's a data IO completion, which will take an inode lock if we
> > have to run a transaction to update inode size or convert an
> > unwritten extent.
> > 
> > > [   39.100611] -> #1 (&xfs_nondir_ilock_class){++++}:
> > > [   39.107612]        __lock_acquire+0x10a1/0x1100
> > > [   39.112660]        lock_acquire+0xea/0x1f0
> > > [   39.117224]        down_write_nested+0x26/0x60
> > > [   39.122184]        xfs_ilock+0x166/0x220
> > > [   39.126563]        __xfs_setfilesize+0x30/0x200
> > > [   39.131612]        xfs_setfilesize_ioend+0x7f/0xb0
> > > [   39.136958]        xfs_end_io+0x49/0xf0
> > > [   39.141240]        process_one_work+0x273/0x6b0
> > > [   39.146288]        worker_thread+0x48/0x3f0
> > > [   39.150960]        kthread+0x147/0x180
> > > [   39.155146]        ret_from_fork+0x2a/0x40
> > 
> > That's the data IO completion locking the inode inside a transaction
> > to update the inode size inside a workqueue.
> > 
> > 
> > > [   38.962397] -> #2 ((complete)&bp->b_iowait){+.+.}:
> > > [   38.969401]        __lock_acquire+0x10a1/0x1100
> > > [   38.974460]        lock_acquire+0xea/0x1f0
> > > [   38.979036]        wait_for_completion+0x3b/0x130
> > > [   38.984285]        xfs_buf_submit_wait+0xb2/0x590
> > > [   38.989535]        _xfs_buf_read+0x23/0x30
> > > [   38.994108]        xfs_buf_read_map+0x14f/0x320
> > > [   38.999169]        xfs_trans_read_buf_map+0x170/0x5d0
> > > [   39.004812]        xfs_read_agf+0x97/0x1d0
> > > [   39.009386]        xfs_alloc_read_agf+0x60/0x240
> > > [   39.014541]        xfs_alloc_fix_freelist+0x32a/0x3d0
> > > [   39.020180]        xfs_free_extent_fix_freelist+0x6b/0xa0
> > > [   39.026206]        xfs_free_extent+0x48/0x120
> > > [   39.031068]        xfs_trans_free_extent+0x57/0x200
> > > [   39.036512]        xfs_extent_free_finish_item+0x26/0x40
> > > [   39.042442]        xfs_defer_finish+0x174/0x770
> > > [   39.047494]        xfs_itruncate_extents+0x126/0x470
> > > [   39.053035]        xfs_setattr_size+0x2a1/0x310
> > > [   39.058093]        xfs_vn_setattr_size+0x57/0x160
> > > [   39.063342]        xfs_vn_setattr+0x50/0x70
> > > [   39.068015]        notify_change+0x2ee/0x420
> > > [   39.072785]        do_truncate+0x5d/0x90
> > > [   39.077165]        path_openat+0xab2/0xc00
> > > [   39.081737]        do_filp_open+0x8a/0xf0
> > > [   39.086213]        do_sys_open+0x123/0x200
> > > [   39.090787]        SyS_open+0x1e/0x20
> > > [   39.094876]        entry_SYSCALL_64_fastpath+0x23/0xc2
> > 
> > And that's waiting for a metadata read IO to complete during a
> > truncate transaction. This has no direct connection to the inode at
> > all - it's a allocation group header that we have to lock to do
> > block allocation. The completion it is waiting on doesn't even run
> > through the same workqueue as the ioends - ioends go through
> > mp->m_data_workqueue or mp->m_unwritten_workqueue, metadata buffers
> > go through mp->m_buf_workqueue or mp->m_log_workqueue.
> > 
> > So from that perspective, an ioend blocked on an inode lock should
> > not ever prevent metadata buffer completions from being run. Hence
> > I'd call this a false positive at best, though I think it really
> > indicates a bug in the new lockdep code because it isn't
> > discriminating between different workqueue contexts properly.
> 
> Agreed. The interaction with workqueues is buggered.

I think the original uses of lockdep_map were already wrong. I mean, it's
not a problem with the newly introduced code.

> > Even if I ignore the fact that buffer completions are run on
> > different workqueues, there seems to be a bigger problem with this
> > sort of completion checking.
> > 
> > That is, the trace looks plausible because we are definitely hold an
> > inode locked deep inside a truncate operation where the completion
> > if flagged.  Indeed, some transactions that would flag like this
> > could be holding up to 5 inodes locked and have tens of other
> > metadata objects locked. There are potentially tens (maybe even
> > hundreds) of different paths into this IO wait point, and all have
> > different combinations of objects locked when it triggers. So
> > there's massive scope for potential deadlocks....
> > 
> > .... and so we must have some way of avoiding this whole class of
> > problems that lockdep is unaware of.
> 
> 
> So I did the below little hack, which basically wipes the entire lock
> history when we start a work and thereby disregards/looses the
> dependency on the work 'lock'.

We should not do the following. Why should we give up valuable
dependencies?

> 
> It makes my test box able to boot and build a kernel on XFS, so while I
> see what you're saying (I think), it doesn't appear to instantly show.
> 
> Should I run xfstests or something to further verify things are OK? Does
> that need a scratch partition (I keep forgetting how to run that stuff
> :/).
> 
> ---
> diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> index 66011c9f5df3..de91cdce9460 100644
> --- a/kernel/locking/lockdep.c
> +++ b/kernel/locking/lockdep.c
> @@ -4756,10 +4756,14 @@ void crossrelease_hist_start(enum xhlock_context_t c)
>  {
>  	struct task_struct *cur = current;
>  
> -	if (cur->xhlocks) {
> -		cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> -		cur->hist_id_save[c] = cur->hist_id;
> -	}
> +	if (!cur->xhlocks)
> +		return;
> +
> +	if (c == XHLOCK_PROC)
> +		invalidate_xhlock(&xhlock(cur->xhlock_idx));
> +
> +	cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> +	cur->hist_id_save[c] = cur->hist_id;
>  }
>  
>  void crossrelease_hist_end(enum xhlock_context_t c)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 13:49               ` Peter Zijlstra
  2017-08-22 14:46                 ` Peter Zijlstra
@ 2017-08-23  2:43                 ` Byungchul Park
  2017-08-23  6:31                   ` Byungchul Park
  2017-08-23 10:26                   ` Peter Zijlstra
  1 sibling, 2 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  2:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Tue, Aug 22, 2017 at 03:49:22PM +0200, Peter Zijlstra wrote:
> On Tue, Aug 22, 2017 at 12:08:40PM +0200, Peter Zijlstra wrote:
> 
> > > > > I meant:
> > > > > 
> > > > >  	mutex_lock(&A)
> > > > >  				<work>
> > > > >  				lockdep_map_acquire_read(&work)
> > > > >  				mutex_lock(&A)
> > > > > 
> > > > >  	lockdep_map_acquire(&work)
> > > > >  	flush_work(&work)
> > > > > 
> > > > > I mean it can still be detected with a read acquisition in work.
> > > > > Am I wrong?
> > > > 
> > > > Think so, although there's something weird with read locks that I keep
> > > > forgetting. But I'm not sure it'll actually solve the problem. But I can
> > > 
> > > I mean, read acquisitions are nothing but ones allowing read ones to be
> > > re-acquired legally, IOW, we want to check entrance of flush_work() and
> > > works, not between works. That's why I suggested to use read ones in work
> > > in that case.
> > 
> > Does seem to work.
> 
> So I think we'll end up hitting a lockdep deficiency and not trigger the
> splat on flush_work(), see also:
> 
>   https://lwn.net/Articles/332801/
> 
> lock_map_acquire_read() is a read-recursive and will not in fact create
> any dependencies because of this issue.
> 
> In specific, check_prev_add() has:
> 
> 	if (next->read == 2 || prev->read == 2)
> 		return 1;
> 
> This means that for:
> 
> 	lock_map_acquire_read(W)(2)
> 	down_write(A)		(0)
> 
> 			down_write(A)		(0)
> 			wait_for_completion(C)	(0)
> 
> 					lock_map_acquire_read(W)(2)
> 					complete(C)		(0)
> 
> All the (2) effectively go away and 'solve' our current issue, but:
> 
> 	lock_map_acquire_read(W)(2)
> 	mutex_lock(A)		(0)
> 
> 			mutex_lock(A)		(0)
> 			lock_map_acquire(W)	(0)
> 
> as per flush_work() will not in fact trigger anymore either.

It should be triggered. The lockdep code should be fixed so that it is.

> See also the below locking-selftest changes.
> 
> 
> Now, this means I also have to consider the existing
> lock_map_acquire_read() users and if they really wanted to be recursive
> or not. When I change lock_map_acquire_read() to use
> lock_acquire_shared() this annotation no longer suffices and the splat
> comes back.
> 
> 
> Also, the acquire_read() annotation will (obviously) no longer work to
> cure this problem when we switch to normal read (1), because then the
> generated chain:
> 
> 	W(1) -> A(0) -> C(0) -> W(1)

Please explain what W/A/C stand for.

> 
> spells deadlock, since W isn't allowed to recurse.
> 
> 
> /me goes dig through commit:
> 
>   e159489baa71 ("workqueue: relax lockdep annotation on flush_work()")
> 
> to figure out wth the existing users really want.
> 
> 
> [    0.000000] ----------------------------------------------------------------------------
> [    0.000000]                                  | spin |wlock |rlock |mutex | wsem | rsem |
> [    0.000000]   --------------------------------------------------------------------------
> 
> [    0.000000]   --------------------------------------------------------------------------
> [    0.000000]               recursive read-lock:             |  ok  |             |  ok  |
> [    0.000000]            recursive read-lock #2:             |  ok  |             |  ok  |
> [    0.000000]             mixed read-write-lock:             |  ok  |             |  ok  |
> [    0.000000]             mixed write-read-lock:             |  ok  |             |  ok  |
> [    0.000000]   mixed read-lock/lock-write ABBA:             |FAILED|             |  ok  |
> [    0.000000]    mixed read-lock/lock-read ABBA:             |  ok  |             |  ok  |
> [    0.000000]  mixed write-lock/lock-write ABBA:             |  ok  |             |  ok  |
> [    0.000000]   --------------------------------------------------------------------------
> 
> ---
>  lib/locking-selftest.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 116 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
> index 6f2b135dc5e8..b99d365cf399 100644
> --- a/lib/locking-selftest.c
> +++ b/lib/locking-selftest.c
> @@ -363,6 +363,103 @@ static void rsem_AA3(void)
>  }
>  
>  /*
> + * read_lock(A)
> + * spin_lock(B)
> + *		spin_lock(B)
> + *		write_lock(A)
> + */
> +static void rlock_ABBA1(void)
> +{
> +	RL(X1);
> +	L(Y1);
> +	U(Y1);
> +	RU(X1);
> +
> +	L(Y1);
> +	WL(X1);
> +	WU(X1);
> +	U(Y1); // should fail
> +}
> +
> +static void rwsem_ABBA1(void)
> +{
> +	RSL(X1);
> +	ML(Y1);
> +	MU(Y1);
> +	RSU(X1);
> +
> +	ML(Y1);
> +	WSL(X1);
> +	WSU(X1);
> +	MU(Y1); // should fail
> +}
> +
> +/*
> + * read_lock(A)
> + * spin_lock(B)
> + *		spin_lock(B)
> + *		read_lock(A)
> + */
> +static void rlock_ABBA2(void)
> +{
> +	RL(X1);
> +	L(Y1);
> +	U(Y1);
> +	RU(X1);
> +
> +	L(Y1);
> +	RL(X1);
> +	RU(X1);
> +	U(Y1); // should NOT fail
> +}
> +
> +static void rwsem_ABBA2(void)
> +{
> +	RSL(X1);
> +	ML(Y1);
> +	MU(Y1);
> +	RSU(X1);
> +
> +	ML(Y1);
> +	RSL(X1);
> +	RSU(X1);
> +	MU(Y1); // should fail
> +}
> +
> +
> +/*
> + * write_lock(A)
> + * spin_lock(B)
> + *		spin_lock(B)
> + *		write_lock(A)
> + */
> +static void rlock_ABBA3(void)
> +{
> +	WL(X1);
> +	L(Y1);
> +	U(Y1);
> +	WU(X1);
> +
> +	L(Y1);
> +	WL(X1);
> +	WU(X1);
> +	U(Y1); // should fail
> +}
> +
> +static void rwsem_ABBA3(void)
> +{
> +	WSL(X1);
> +	ML(Y1);
> +	MU(Y1);
> +	WSU(X1);
> +
> +	ML(Y1);
> +	WSL(X1);
> +	WSU(X1);
> +	MU(Y1); // should fail
> +}
> +
> +/*
>   * ABBA deadlock:
>   */
>  
> @@ -1057,7 +1154,7 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask)
>  		unexpected_testcase_failures++;
>  		pr_cont("FAILED|");
>  
> -		dump_stack();
> +//		dump_stack();
>  	} else {
>  		testcase_successes++;
>  		pr_cont("  ok  |");
> @@ -1933,6 +2030,24 @@ void locking_selftest(void)
>  	dotest(rsem_AA3, FAILURE, LOCKTYPE_RWSEM);
>  	pr_cont("\n");
>  
> +	print_testname("mixed read-lock/lock-write ABBA");
> +	pr_cont("             |");
> +	dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK);
> +	pr_cont("             |");
> +	dotest(rwsem_ABBA1, FAILURE, LOCKTYPE_RWSEM);
> +
> +	print_testname("mixed read-lock/lock-read ABBA");
> +	pr_cont("             |");
> +	dotest(rlock_ABBA2, SUCCESS, LOCKTYPE_RWLOCK);
> +	pr_cont("             |");
> +	dotest(rwsem_ABBA2, FAILURE, LOCKTYPE_RWSEM);
> +
> +	print_testname("mixed write-lock/lock-write ABBA");
> +	pr_cont("             |");
> +	dotest(rlock_ABBA3, FAILURE, LOCKTYPE_RWLOCK);
> +	pr_cont("             |");
> +	dotest(rwsem_ABBA3, FAILURE, LOCKTYPE_RWSEM);
> +
>  	printk("  --------------------------------------------------------------------------\n");
>  
>  	/*

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23  2:12           ` Byungchul Park
@ 2017-08-23  6:03             ` Byungchul Park
  2017-08-23 10:20             ` Peter Zijlstra
  1 sibling, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  6:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Wed, Aug 23, 2017 at 11:12:17AM +0900, Byungchul Park wrote:
> > > We have to detect dependecies if it exists, even in the following case:
> > > 
> > > oooooooiiiiiiiiiiiiiiiiiii.........
> > >   |<- range for commit ->|
> > > 
> > >   where
> > >   o: acquisition outside of each work,
> > >   i: acquisition inside of each work,
> > > 
> > > With yours, we can never detect dependecies wrt 'o'.
> > 
> > There really shouldn't be any o's when you call
> 
> There can be any o's.

I meant it's very possible for 'o's to exist. And we have to, of course,
consider them wrt dependencies. No reason we should ignore them.

> > crossrelease_hist_start(XHLOCK_PROC), it should denote the bottom of a

Honestly, I'd prefer another name like XHLOCK_WORK or XHLOCK_SYSCALL over
XHLOCK_PROC, since the functions are for special contexts, e.g. works.
But I thought XHLOCK_PROC was not bad, because XHLOCK_WORK and
XHLOCK_SYSCALL would never conflict with each other and they anyway run
in normal process context.

Remember, they are for special contexts.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23  2:31       ` Byungchul Park
@ 2017-08-23  6:11         ` Byungchul Park
  2017-08-23 10:46         ` Peter Zijlstra
  1 sibling, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  6:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Wed, Aug 23, 2017 at 11:31:18AM +0900, Byungchul Park wrote:
> > > IIUC, that's a problem because XFS does this all over the place.
> > > Let me pull the trace apart in reverse order to explain why XFS is
> > > going to throw endless false positives on this:
> > 
> > Yeah, and I agree that's not right or desired. Just trying to figure out
> > how to go about fixing that.
> > 
> > Because, with possible exception for the single threaded workqueues,
> > there is no actual deadlock there unless its the very same work
> > instance. Its the classic instance vs class thing.
> 
> Currently, we do the following in process_one_work(),
> 
> lockdep_map_acquire for a workqueue
> lockdep_map_acquire for a work
> 
> But IMHO it should be,
> 
> lockdep_map_acquire for a pair of workqueue and work.

IMHO, we should remove either the workqueue's lockdep_map or the work's,
and make the remaining one consider the other, so that it works as if
lockdep_map_acquire were used on a pair of workqueue and work, where
needed.

> Right?
> 
> > And splitting up work classes is going to be a nightmare too.
> > 
> > > > [   39.159708] -> #0 ((&ioend->io_work)){+.+.}:
> > > > [   39.166126]        process_one_work+0x244/0x6b0
> > > > [   39.171184]        worker_thread+0x48/0x3f0
> > > > [   39.175845]        kthread+0x147/0x180
> > > > [   39.180020]        ret_from_fork+0x2a/0x40
> > > > [   39.184593]        0xffffffffffffffff
> > > 
> > > That's a data IO completion, which will take an inode lock if we
> > > have to run a transaction to update inode size or convert an
> > > unwritten extent.
> > > 
> > > > [   39.100611] -> #1 (&xfs_nondir_ilock_class){++++}:
> > > > [   39.107612]        __lock_acquire+0x10a1/0x1100
> > > > [   39.112660]        lock_acquire+0xea/0x1f0
> > > > [   39.117224]        down_write_nested+0x26/0x60
> > > > [   39.122184]        xfs_ilock+0x166/0x220
> > > > [   39.126563]        __xfs_setfilesize+0x30/0x200
> > > > [   39.131612]        xfs_setfilesize_ioend+0x7f/0xb0
> > > > [   39.136958]        xfs_end_io+0x49/0xf0
> > > > [   39.141240]        process_one_work+0x273/0x6b0
> > > > [   39.146288]        worker_thread+0x48/0x3f0
> > > > [   39.150960]        kthread+0x147/0x180
> > > > [   39.155146]        ret_from_fork+0x2a/0x40
> > > 
> > > That's the data IO completion locking the inode inside a transaction
> > > to update the inode size inside a workqueue.
> > > 
> > > 
> > > > [   38.962397] -> #2 ((complete)&bp->b_iowait){+.+.}:
> > > > [   38.969401]        __lock_acquire+0x10a1/0x1100
> > > > [   38.974460]        lock_acquire+0xea/0x1f0
> > > > [   38.979036]        wait_for_completion+0x3b/0x130
> > > > [   38.984285]        xfs_buf_submit_wait+0xb2/0x590
> > > > [   38.989535]        _xfs_buf_read+0x23/0x30
> > > > [   38.994108]        xfs_buf_read_map+0x14f/0x320
> > > > [   38.999169]        xfs_trans_read_buf_map+0x170/0x5d0
> > > > [   39.004812]        xfs_read_agf+0x97/0x1d0
> > > > [   39.009386]        xfs_alloc_read_agf+0x60/0x240
> > > > [   39.014541]        xfs_alloc_fix_freelist+0x32a/0x3d0
> > > > [   39.020180]        xfs_free_extent_fix_freelist+0x6b/0xa0
> > > > [   39.026206]        xfs_free_extent+0x48/0x120
> > > > [   39.031068]        xfs_trans_free_extent+0x57/0x200
> > > > [   39.036512]        xfs_extent_free_finish_item+0x26/0x40
> > > > [   39.042442]        xfs_defer_finish+0x174/0x770
> > > > [   39.047494]        xfs_itruncate_extents+0x126/0x470
> > > > [   39.053035]        xfs_setattr_size+0x2a1/0x310
> > > > [   39.058093]        xfs_vn_setattr_size+0x57/0x160
> > > > [   39.063342]        xfs_vn_setattr+0x50/0x70
> > > > [   39.068015]        notify_change+0x2ee/0x420
> > > > [   39.072785]        do_truncate+0x5d/0x90
> > > > [   39.077165]        path_openat+0xab2/0xc00
> > > > [   39.081737]        do_filp_open+0x8a/0xf0
> > > > [   39.086213]        do_sys_open+0x123/0x200
> > > > [   39.090787]        SyS_open+0x1e/0x20
> > > > [   39.094876]        entry_SYSCALL_64_fastpath+0x23/0xc2
> > > 
> > > And that's waiting for a metadata read IO to complete during a
> > > truncate transaction. This has no direct connection to the inode at
> > > all - it's a allocation group header that we have to lock to do
> > > block allocation. The completion it is waiting on doesn't even run
> > > through the same workqueue as the ioends - ioends go through
> > > mp->m_data_workqueue or mp->m_unwritten_workqueue, metadata buffers
> > > go through mp->m_buf_workqueue or mp->m_log_workqueue.
> > > 
> > > So from that perspective, an ioend blocked on an inode lock should
> > > not ever prevent metadata buffer completions from being run. Hence
> > > I'd call this a false positive at best, though I think it really
> > > indicates a bug in the new lockdep code because it isn't
> > > discriminating between different workqueue contexts properly.
> > 
> > Agreed. The interaction with workqueues is buggered.
> 
> I think the original uses of lockdep_map were already wrong. I mean, it's

I meant 'uses in workqueue'.

> not a problem with the newly introduced code.
> 
> > > Even if I ignore the fact that buffer completions are run on
> > > different workqueues, there seems to be a bigger problem with this
> > > sort of completion checking.
> > > 
> > > That is, the trace looks plausible because we are definitely hold an
> > > inode locked deep inside a truncate operation where the completion
> > > if flagged.  Indeed, some transactions that would flag like this
> > > could be holding up to 5 inodes locked and have tens of other
> > > metadata objects locked. There are potentially tens (maybe even
> > > hundreds) of different paths into this IO wait point, and all have
> > > different combinations of objects locked when it triggers. So
> > > there's massive scope for potential deadlocks....
> > > 
> > > .... and so we must have some way of avoiding this whole class of
> > > problems that lockdep is unaware of.
> > 
> > 
> > So I did the below little hack, which basically wipes the entire lock
> > history when we start a work and thereby disregards/looses the
> > dependency on the work 'lock'.
> 
> We should not do the following. Why should we give up valuable
> dependencies?
> 
> > 
> > It makes my test box able to boot and build a kernel on XFS, so while I
> > see what you're saying (I think), it doesn't appear to instantly show.
> > 
> > Should I run xfstests or something to further verify things are OK? Does
> > that need a scratch partition (I keep forgetting how to run that stuff
> > :/).
> > 
> > ---
> > diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
> > index 66011c9f5df3..de91cdce9460 100644
> > --- a/kernel/locking/lockdep.c
> > +++ b/kernel/locking/lockdep.c
> > @@ -4756,10 +4756,14 @@ void crossrelease_hist_start(enum xhlock_context_t c)
> >  {
> >  	struct task_struct *cur = current;
> >  
> > -	if (cur->xhlocks) {
> > -		cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> > -		cur->hist_id_save[c] = cur->hist_id;
> > -	}
> > +	if (!cur->xhlocks)
> > +		return;
> > +
> > +	if (c == XHLOCK_PROC)
> > +		invalidate_xhlock(&xhlock(cur->xhlock_idx));
> > +
> > +	cur->xhlock_idx_hist[c] = cur->xhlock_idx;
> > +	cur->hist_id_save[c] = cur->hist_id;
> >  }
> >  
> >  void crossrelease_hist_end(enum xhlock_context_t c)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23  2:43                 ` Byungchul Park
@ 2017-08-23  6:31                   ` Byungchul Park
  2017-08-23 10:26                   ` Peter Zijlstra
  1 sibling, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-23  6:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Wed, Aug 23, 2017 at 11:43:23AM +0900, Byungchul Park wrote:
> On Tue, Aug 22, 2017 at 03:49:22PM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 22, 2017 at 12:08:40PM +0200, Peter Zijlstra wrote:
> > 
> > > > > > I meant:
> > > > > > 
> > > > > >  	mutex_lock(&A)
> > > > > >  				<work>
> > > > > >  				lockdep_map_acquire_read(&work)
> > > > > >  				mutex_lock(&A)
> > > > > > 
> > > > > >  	lockdep_map_acquire(&work)
> > > > > >  	flush_work(&work)
> > > > > > 
> > > > > > I mean it can still be detected with a read acquisition in work.
> > > > > > Am I wrong?
> > > > > 
> > > > > Think so, although there's something weird with read locks that I keep
> > > > > forgetting. But I'm not sure it'll actually solve the problem. But I can
> > > > 
> > > > I mean, read acquisitions are nothing but ones allowing read ones to be
> > > > re-acquired legally, IOW, we want to check entrance of flush_work() and
> > > > works, not between works. That's why I suggested to use read ones in work
> > > > in that case.
> > > 
> > > Does seem to work.
> > 
> > So I think we'll end up hitting a lockdep deficiency and not trigger the
> > splat on flush_work(), see also:
> > 
> >   https://lwn.net/Articles/332801/
> > 
> > lock_map_acquire_read() is a read-recursive and will not in fact create
> > any dependencies because of this issue.
> > 
> > In specific, check_prev_add() has:
> > 
> > 	if (next->read == 2 || prev->read == 2)
> > 		return 1;
> > 
> > This means that for:
> > 
> > 	lock_map_acquire_read(W)(2)
> > 	down_write(A)		(0)
> > 
> > 			down_write(A)		(0)
> > 			wait_for_completion(C)	(0)
> > 
> > 					lock_map_acquire_read(W)(2)
> > 					complete(C)		(0)
> > 
> > All the (2) effectively go away and 'solve' our current issue, but:
> > 
> > 	lock_map_acquire_read(W)(2)
> > 	mutex_lock(A)		(0)
> > 
> > 			mutex_lock(A)		(0)
> > 			lock_map_acquire(W)	(0)
> > 
> > as per flush_work() will not in fact trigger anymore either.
> 
> It should be triggered. The lockdep code should be fixed so that it is.
> 
> > See also the below locking-selftest changes.
> > 
> > 
> > Now, this means I also have to consider the existing
> > lock_map_acquire_read() users and if they really wanted to be recursive
> > or not. When I change lock_map_acquire_read() to use
> > lock_acquire_shared() this annotation no longer suffices and the splat
> > comes back.
> > 
> > 
> > Also, the acquire_read() annotation will (obviously) no longer work to
> > cure this problem when we switch to normal read (1), because then the
> > generated chain:
> > 
> > 	W(1) -> A(0) -> C(0) -> W(1)
> 
> Please explain what W/A/C stand for.

I eventually found them in your words. Let me read this again.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23  2:12           ` Byungchul Park
  2017-08-23  6:03             ` Byungchul Park
@ 2017-08-23 10:20             ` Peter Zijlstra
  2017-08-24  2:02               ` Byungchul Park
  1 sibling, 1 reply; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-23 10:20 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Wed, Aug 23, 2017 at 11:12:17AM +0900, Byungchul Park wrote:

> > > We have to detect dependecies if it exists, even in the following case:
> > > 
> > > oooooooiiiiiiiiiiiiiiiiiii.........
> > >   |<- range for commit ->|
> > > 
> > >   where
> > >   o: acquisition outside of each work,
> > >   i: acquisition inside of each work,
> > > 
> > > With yours, we can never detect dependencies wrt 'o'.
> > 
> > There really shouldn't be any o's when you call
> 
> There can be any o's.

Yes, but they _must_ be irrelevant, see below.

> > crossrelease_hist_start(XHLOCK_PROC), it should denote the bottom of a
> 
> No, I don't think so. It can be either the bottom or not.

Nope, wrong, _must_ be bottom.

> hist_start() and hist_end() are only for special contexts which need to
> roll back on exit, e.g. irq, work and so on. Normal kernel contexts should
> work well w/o hist_start() or hist_end().

The (soft) IRQ ones, yes, the PROC one is special though.

> > context, see:
> > 
> >   https://lkml.kernel.org/r/20170301104328.GD6515@twins.programming.kicks-ass.net
> 
> Actually, I don't agree with that.
> 
> > And in that respect you placed the calls wrongly in process_one_work(),
> 
> Why is it wrong? It's intended. Could you tell me why?

The purpose of the PROC thing is to annotate _independent_ execution,
like work's.

Either they should _all_ depend on a held lock, or they should not
depend on it at all. Either way this means we should start(PROC) before
taking any locks.

Similar with history, independence means to not depend on prior state,
so per definition we should not have history at this point.

So by this reasoning the workqueue annotation should be:


	crossrelease_hist_start(XHLOCK_PROC);

	lock_map_acquire(wq->lockdep_map);
	lock_map_acquire(lockdep_map);

	work->func(work); /* go hard */

	lock_map_release(lockdep_map);
	lock_map_release(wq->lockdep_map);

	crossrelease_hist_end(XHLOCK_PROC);


This way _all_ works are guaranteed the same context, and not only those
that didn't overflow the history stack.

Now, it so happens we have an unfortunate interaction with the
flush_work{,queue}() annotations. The read-recursive thing would work if
lockdep were fixed; however, I don't think there is a possible deadlock
between flush_work() and complete() (except for single-threaded
workqueues).

So until we fix lockdep for read-recursive things, I don't think it's a
problem if we (ab)use your wrong placement to kill the interaction
between these two annotations.

I'll send some patches..

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23  2:43                 ` Byungchul Park
  2017-08-23  6:31                   ` Byungchul Park
@ 2017-08-23 10:26                   ` Peter Zijlstra
  2017-08-24  5:07                     ` Byungchul Park
  1 sibling, 1 reply; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-23 10:26 UTC (permalink / raw)
  To: Byungchul Park
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Wed, Aug 23, 2017 at 11:43:23AM +0900, Byungchul Park wrote:
> On Tue, Aug 22, 2017 at 03:49:22PM +0200, Peter Zijlstra wrote:

> > So I think we'll end up hitting a lockdep deficiency and not trigger the
> > splat on flush_work(), see also:
> > 
> >   https://lwn.net/Articles/332801/
> > 
> > lock_map_acquire_read() is a read-recursive and will not in fact create
> > any dependencies because of this issue.
> > 
> > In specific, check_prev_add() has:
> > 
> > 	if (next->read == 2 || prev->read == 2)
> > 		return 1;
> > 
> > This means that for:
> > 
> > 	lock_map_acquire_read(W)(2)
> > 	down_write(A)		(0)
> > 
> > 			down_write(A)		(0)
> > 			wait_for_completion(C)	(0)
> > 
> > 					lock_map_acquire_read(W)(2)
> > 					complete(C)		(0)
> > 
> > All the (2) effectively go away and 'solve' our current issue, but:
> > 
> > 	lock_map_acquire_read(W)(2)
> > 	mutex_lock(A)		(0)
> > 
> > 			mutex_lock(A)		(0)
> > 			lock_map_acquire(W)	(0)
> > 
> > as per flush_work() will not in fact trigger anymore either.
> 
> It should be triggered. Lockdep code should be fixed so that it does.

Yeah, it's just something we never got around to. Once every so often I
get reminded of it, like now. But then it sits on the todo list and
never quite happens.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23  2:31       ` Byungchul Park
  2017-08-23  6:11         ` Byungchul Park
@ 2017-08-23 10:46         ` Peter Zijlstra
  2017-08-24  5:06           ` Byungchul Park
  1 sibling, 1 reply; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-23 10:46 UTC (permalink / raw)
  To: Byungchul Park
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Wed, Aug 23, 2017 at 11:31:18AM +0900, Byungchul Park wrote:
> On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:

> Currently, we do the following in process_one_work(),
> 
> lockdep_map_acquire for a workqueue
> lockdep_map_acquire for a work
> 
> But IMHO it should be,
> 
> lockdep_map_acquire for a pair of workqueue and work.
> 
> Right?

No, the current code is right. We need the two 'locks'.

The work one is for flush_work(), the workqueue one is for
flush_workqueue().

Just like how flush_work() must not depend on any lock taken inside the
work, flush_workqueue() callers must not hold any lock acquired inside
any work run inside the workqueue. This cannot be done with a single
'lock'.
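
For illustration, a minimal sketch of the flush_workqueue() side (the
names here are made up, not taken from workqueue.c):

	static DEFINE_MUTEX(A);
	static struct workqueue_struct *wq;

	static void w_fn(struct work_struct *work)
	{
		mutex_lock(&A);		/* lock taken inside a work */
		mutex_unlock(&A);
	}

	/* some task: */
	mutex_lock(&A);
	flush_workqueue(wq);	/* waits for w_fn(), which waits for A */
	mutex_unlock(&A);

With only per-work 'locks', the A -> wq dependency created by
flush_workqueue() would never be recorded.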

The reason flush_work() also depends on the wq 'lock' is because doing
flush_work() from inside work is a possible problem for single threaded
workqueues and workqueues with a rescuer.
 
> > Agreed. The interaction with workqueues is buggered.
> 
> I think the original uses of lockdep_map were already wrong. I mean it's
> not a problem with the newly introduced code.

Not wrong per se, the new code certainly places more constraints on it.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-22 14:46                 ` Peter Zijlstra
  2017-08-22 15:10                   ` Peter Zijlstra
  2017-08-22 15:59                   ` Oleg Nesterov
@ 2017-08-23 16:39                   ` Oleg Nesterov
  2017-08-23 17:47                     ` Peter Zijlstra
  2 siblings, 1 reply; 50+ messages in thread
From: Oleg Nesterov @ 2017-08-23 16:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

Peter, I am all confused and I am still trying to understand your email.
In particular, because I no longer understand the lockdep annotations in
workqueue.c, it turns out I forgot everything...

On 08/22, Peter Zijlstra wrote:
>
> I am however slightly puzzled by the need of flush_work() to take Q,
> what deadlock potential is there?

Do you really mean flush_work()? Or start_flush_work()?

> It was added by Oleg in commit:
>
>   a67da70dc095 ("workqueues: lockdep annotations for flush_work()")

No, these annotations were moved later into start_flush, iiuc...

This

	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);

was added by another commit 0976dfc1d0cd80a4e9dfaf87bd8744612bde475a
"workqueue: Catch more locking problems with flush_work()", and at
first glance it is fine.

At the same time, I have a vague feeling that (perhaps) we can remove
these 2 annotations if we change process_one_work() and start_flush_work(),
not sure...

Oleg.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23 16:39                   ` Oleg Nesterov
@ 2017-08-23 17:47                     ` Peter Zijlstra
  2017-08-24  6:11                       ` Byungchul Park
  2017-08-29 15:52                       ` Oleg Nesterov
  0 siblings, 2 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-23 17:47 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Wed, Aug 23, 2017 at 06:39:03PM +0200, Oleg Nesterov wrote:
> Peter, I am all confused and I am still trying to understand your email.
> In particular, because I no longer understand the lockdep annotations in
> workqueue.c, it turns out I forgot everything...

Yeah, that happens :/

> On 08/22, Peter Zijlstra wrote:
> >
> > I am however slightly puzzled by the need of flush_work() to take Q,
> > what deadlock potential is there?
> 
> Do you really mean flush_work()? Or start_flush_work()?

Same thing, start_flush_work() has exactly one caller: flush_work().

> > It was added by Oleg in commit:
> >
> >   a67da70dc095 ("workqueues: lockdep annotations for flush_work()")
> 
> No, these annotations were moved later into start_flush, iiuc...
> 
> This
> 
> 	lock_map_acquire(&work->lockdep_map);
> 	lock_map_release(&work->lockdep_map);
> 
> was added by another commit 0976dfc1d0cd80a4e9dfaf87bd8744612bde475a
> "workqueue: Catch more locking problems with flush_work()", and at
> first glance it is fine.

Those are fine and are indeed the flush_work() vs work inversion.

The two straightforward annotations are:

flush_work(work)	process_one_work(wq, work)
  A(work)		  A(work)
  R(work)		  work->func(work);
			  R(work)

Which catches:

Task-1:			work:

  mutex_lock(&A);	mutex_lock(&A);
  flush_work(work);


And the analogous:

flush_workqueue(wq)	process_one_work(wq, work)
  A(wq)			  A(wq)
  R(wq)			  work->func(work);
			  R(wq)


The thing I puzzled over was flush_work() (really start_flush_work())
doing:

        if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
                lock_map_acquire(&pwq->wq->lockdep_map);
        else
                lock_map_acquire_read(&pwq->wq->lockdep_map);
        lock_map_release(&pwq->wq->lockdep_map);

Why does flush_work() care about the wq->lockdep_map?

The answer is that, for single-threaded workqueues, doing
flush_work() from a work is a potential deadlock:

workqueue-thread:

	work-n:
	  flush_work(work-n+1);

	work-n+1:


Will not be going anywhere fast..
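
In code, a minimal sketch of that scenario (all names illustrative):

	static struct workqueue_struct *st_wq;	/* single-threaded */
	static struct work_struct w1, w2;

	static void w2_fn(struct work_struct *work) { }

	static void w1_fn(struct work_struct *work)
	{
		/* w2 can only run on the one thread, which is busy
		 * running us, so this never returns: */
		flush_work(&w2);
	}

	/* at init: */
	st_wq = create_singlethread_workqueue("st");
	INIT_WORK(&w1, w1_fn);
	INIT_WORK(&w2, w2_fn);
	queue_work(st_wq, &w1);
	queue_work(st_wq, &w2);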

And by taking the wq->lockdep_map from flush_work(), but _only_ when
single-threaded (or with a rescuer, see other emails), it forces a
recursive lock deadlock message like:

  process_one_work(wq, work)
   A(wq)
   A(work)

   work->func(work)
     flush_work(work2)
       A(work2)
       R(work2)

       A(wq) <-- recursive lock deadlock




Make sense?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23 10:20             ` Peter Zijlstra
@ 2017-08-24  2:02               ` Byungchul Park
  2017-08-24  7:30                 ` Byungchul Park
  0 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  2:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Wed, Aug 23, 2017 at 12:20:48PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 23, 2017 at 11:12:17AM +0900, Byungchul Park wrote:
> 
> > > > We have to detect dependencies if they exist, even in the following case:
> > > > 
> > > > oooooooiiiiiiiiiiiiiiiiiii.........
> > > >   |<- range for commit ->|
> > > > 
> > > >   where
> > > >   o: acquisition outside of each work,
> > > >   i: acquisition inside of each work,
> > > > 
> > > > With yours, we can never detect dependencies wrt 'o'.
> > > 
> > > There really shouldn't be any o's when you call
> > 
> > There can be any o's.
> 
> Yes, but they _must_ be irrelevant, see below.

No, they can be relevant, see below.

> > > crossrelease_hist_start(XHLOCK_PROC), it should denote the bottom of a
> > 
> > No, I don't think so. It can be either the bottom or not.
> 
> Nope, wrong, _must_ be bottom.

No, wrong, it can be either one.

> > hist_start() and hist_end() are only for special contexts which need to
> > roll back on exit, e.g. irq, work and so on. Normal kernel contexts should
> > work well w/o hist_start() or hist_end().
> 
> The (soft) IRQ ones, yes, the PROC one is special though.
> 
> > > context, see:
> > > 
> > >   https://lkml.kernel.org/r/20170301104328.GD6515@twins.programming.kicks-ass.net
> > 
> > Actually, I don't agree with that.
> > 
> > > And in that respect you placed the calls wrongly in process_one_work(),
> > 
> > Why is it wrong? It's intended. Could you tell me why?
> 
> The purpose of the PROC thing is to annotate _independent_ execution,
> like work's.

The purpose of the PROC thing is to annotate independent execution of
the work _itself_.

The 'o's in my example can be, and should be, relevant to each
execution, that is, to the 'i's in my example.

> Either they should _all_ depend on a held lock, or they should not
> depend on it at all. Either way this means we should start(PROC) before
> taking any locks.

No, as I said, we don't have to start(PROC) for normal kernel contexts
before taking any locks. They should be able to do it w/o start(PROC).
That is necessary only for special contexts such as work.

> Similar with history, independence means to not depend on prior state,

Independence means not depending on other contexts, i.e. other works.
IOW, we don't have to make the 'o's and 'i's independent of each other.
The reason is simple: the dependencies can actually exist.

> so per definition we should not have history at this point.
> 
> So by this reasoning the workqueue annotation should be:
> 
> 
> 	crossrelease_hist_start(XHLOCK_PROC);
> 
> 	lock_map_acquire(wq->lockdep_map);
> 	lock_map_acquire(lockdep_map);
> 
> 	work->func(work); /* go hard */
> 
> 	lock_map_release(lockdep_map);
> 	lock_map_release(wq->lockdep_map);
> 
> 	crossrelease_hist_end(XHLOCK_PROC);
>

No matter whether the lock_map_acquire()s are 'o' or 'i', it works
depending on what we want. If we want to make each lock_map_acquire()
here independent, then we should choose your way. But if we should not,
then we should keep my way unchanged.

> This way _all_ works are guaranteed the same context, and not only those

I don't understand what you intended here. This way all works become
independent of each other.

> Now, it so happens we have an unfortunate interaction with the
> flush_work{,queue}() annotations. The read-recursive thing would work if
> lockdep were fixed; however, I don't think there is a possible deadlock
> between flush_work() and complete() (except for single-threaded
> workqueues).

I got it. I want to work on it and make the read-recursive thing in
lockdep work soon.

> So until we fix lockdep for read-recursive things, I don't think it's a
> problem if we (ab)use your wrong placement to kill the interaction
> between these two annotations.

I think you are getting confused about start(PROC)/end(PROC) now. Or am
I the one confused about what you intend?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23 10:46         ` Peter Zijlstra
@ 2017-08-24  5:06           ` Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  5:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Wed, Aug 23, 2017 at 12:46:48PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 23, 2017 at 11:31:18AM +0900, Byungchul Park wrote:
> > On Tue, Aug 22, 2017 at 11:06:03AM +0200, Peter Zijlstra wrote:
> 
> > Currently, we do the following in process_one_work(),
> > 
> > lockdep_map_acquire for a workqueue
> > lockdep_map_acquire for a work
> > 
> > But IMHO it should be,
> > 
> > lockdep_map_acquire for a pair of workqueue and work.
> > 
> > Right?
> 
> No, the current code is right. We need the two 'locks'.
> 
> The work one is for flush_work(), the workqueue one is for
> flush_workqueue().
> 
> Just like how flush_work() must not depend on any lock taken inside the
> work, flush_workqueue() callers must not hold any lock acquired inside
> any work run inside the workqueue. This cannot be done with a single
> 'lock'.

Thank you for explanation.

> The reason flush_work() also depends on the wq 'lock' is because doing
> flush_work() from inside work is a possible problem for single threaded
> workqueues and workqueues with a rescuer.
>  
> > > Agreed. The interaction with workqueues is buggered.
> > 
> > I think the original uses of lockdep_map were already wrong. I mean it's
> > not a problem with the newly introduced code.
> 
> Not wrong per se, the new code certainly places more constraints on it.

"the new code places more constraints on it" is just the right expression.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23 10:26                   ` Peter Zijlstra
@ 2017-08-24  5:07                     ` Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  5:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, linux-kernel, kernel-team, Arnaldo Carvalho de Melo,
	Dave Chinner, Tejun Heo, johannes

On Wed, Aug 23, 2017 at 12:26:52PM +0200, Peter Zijlstra wrote:
> On Wed, Aug 23, 2017 at 11:43:23AM +0900, Byungchul Park wrote:
> > On Tue, Aug 22, 2017 at 03:49:22PM +0200, Peter Zijlstra wrote:
> 
> > > So I think we'll end up hitting a lockdep deficiency and not trigger the
> > > splat on flush_work(), see also:
> > > 
> > >   https://lwn.net/Articles/332801/
> > > 
> > > lock_map_acquire_read() is a read-recursive and will not in fact create
> > > any dependencies because of this issue.
> > > 
> > > In specific, check_prev_add() has:
> > > 
> > > 	if (next->read == 2 || prev->read == 2)
> > > 		return 1;
> > > 
> > > This means that for:
> > > 
> > > 	lock_map_acquire_read(W)(2)
> > > 	down_write(A)		(0)
> > > 
> > > 			down_write(A)		(0)
> > > 			wait_for_completion(C)	(0)
> > > 
> > > 					lock_map_acquire_read(W)(2)
> > > 					complete(C)		(0)
> > > 
> > > All the (2) effectively go away and 'solve' our current issue, but:
> > > 
> > > 	lock_map_acquire_read(W)(2)
> > > 	mutex_lock(A)		(0)
> > > 
> > > 			mutex_lock(A)		(0)
> > > 			lock_map_acquire(W)	(0)
> > > 
> > > as per flush_work() will not in fact trigger anymore either.
> > 
> > It should be triggered. Lockdep code should be fixed so that it does.
> 
> Yeah, it's just something we never got around to. Once every so often I
> get reminded of it, like now. But then it sits on the todo list and
> never quite happens.

I want to try it.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23 17:47                     ` Peter Zijlstra
@ 2017-08-24  6:11                       ` Byungchul Park
  2017-08-24  7:37                         ` Byungchul Park
  2017-08-29 15:52                       ` Oleg Nesterov
  1 sibling, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  6:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Wed, Aug 23, 2017 at 07:47:14PM +0200, Peter Zijlstra wrote:
> Those are fine and are indeed the flush_work() vs work inversion.
> 
> The two straightforward annotations are:
> 
> flush_work(work)	process_one_work(wq, work)
>   A(work)		  A(work)
>   R(work)		  work->func(work);
> 			  R(work)
> 
> Which catches:
> 
> Task-1:			work:
> 
>   mutex_lock(&A);	mutex_lock(&A);
>   flush_work(work);

I'm not sure but, with LOCKDEP_COMPLETE enabled, this issue would
automatically be covered w/o additional A(work)/R(work). Right?

A(work)/R(work) seem to be used for preventing wait_for_completion()
in flush_work() from waiting for the completion forever because of the
work using mutex_lock(&A). Am I understanding correctly?

If yes, we can use just LOCKDEP_COMPLETE for that purpose.
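
To be explicit about what I mean, a minimal sketch of the pattern
LOCKDEP_COMPLETE catches on its own (names are illustrative):

	static DEFINE_MUTEX(A);
	static DECLARE_COMPLETION(done);

	/* the work: */
	mutex_lock(&A);			/* blocks behind the waiter */
	mutex_unlock(&A);
	complete(&done);		/* never reached */

	/* the waiter: */
	mutex_lock(&A);
	wait_for_completion(&done);	/* A -> done -> A: deadlock */
	mutex_unlock(&A);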

> And the analogous:
> 
> flush_workqueue(wq)	process_one_work(wq, work)
>   A(wq)			  A(wq)
>   R(wq)			  work->func(work);
> 			  R(wq)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-24  2:02               ` Byungchul Park
@ 2017-08-24  7:30                 ` Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  7:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Dave Chinner, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Tejun Heo, Boqun Feng

On Thu, Aug 24, 2017 at 11:02:36AM +0900, Byungchul Park wrote:
> On Wed, Aug 23, 2017 at 12:20:48PM +0200, Peter Zijlstra wrote:
> > On Wed, Aug 23, 2017 at 11:12:17AM +0900, Byungchul Park wrote:
> > 
> > > > > We have to detect dependencies if they exist, even in the following case:
> > > > > 
> > > > > oooooooiiiiiiiiiiiiiiiiiii.........
> > > > >   |<- range for commit ->|
> > > > > 
> > > > >   where
> > > > >   o: acquisition outside of each work,
> > > > >   i: acquisition inside of each work,
> > > > > 
> > > > > With yours, we can never detect dependencies wrt 'o'.
> > > > 
> > > > There really shouldn't be any o's when you call
> > > 
> > > There can be any o's.
> > 
> > Yes, but they _must_ be irrelevant, see below.
> 
> No, they can be relevant, see below.

I meant we have to detect problems like this one, just as an example:

A worker:

   acquire(A)
   process_one_work()
      acquire(B)

      crossrelease_hist_start(PROC)
      work_x->func()
         acquire(C)
         release(C)
         complete(D)
      crossrelease_hist_end(PROC)

      release(B)
   release(A)

A task:

   acquire(A)
   acquire(B)
   initiate work_x
   wait_for_completion(D)

In this case, I want to detect a problem by creating true dependencies,
which are:

A -> B
B -> C
D -> A // You are currently trying to invalidate this unnecessarily.
D -> B // You are currently trying to invalidate this unnecessarily.
D -> C

in the worker,

B -> D

in the task.

Crossrelease should detect the problem with the following chain:

A -> B -> D -> A

or

B -> D -> B

So, please keep my original code unchanged conceptually.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-24  6:11                       ` Byungchul Park
@ 2017-08-24  7:37                         ` Byungchul Park
  2017-08-24  8:11                           ` Byungchul Park
  0 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  7:37 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Thu, Aug 24, 2017 at 03:11:53PM +0900, Byungchul Park wrote:
> On Wed, Aug 23, 2017 at 07:47:14PM +0200, Peter Zijlstra wrote:
> > Those are fine and are indeed the flush_work() vs work inversion.
> > 
> > The two straightforward annotations are:
> > 
> > flush_work(work)	process_one_work(wq, work)
> >   A(work)		  A(work)
> >   R(work)		  work->func(work);
> > 			  R(work)
> > 
> > Which catches:
> > 
> > Task-1:			work:
> > 
> >   mutex_lock(&A);	mutex_lock(&A);
> >   flush_work(work);
> 
> I'm not sure but, with LOCKDEP_COMPLETE enabled, this issue would
> automatically be covered w/o additional A(work)/R(work). Right?
> 
> A(work)/R(work) seem to be used for preventing wait_for_completion()
> in flush_work() from waiting for the completion forever because of the
> work using mutex_lock(&A). Am I understanding correctly?
> 
> If yes, we can use just LOCKDEP_COMPLETE for that purpose.

I'm not familiar with workqueue, but the manual lockdep_map_acquire() in
the workqueue code seems to have been introduced to do what
LOCKDEP_COMPLETE does for wait_for_completion() and complete().

Wrong?

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-24  7:37                         ` Byungchul Park
@ 2017-08-24  8:11                           ` Byungchul Park
  2017-08-25  1:14                             ` Byungchul Park
  0 siblings, 1 reply; 50+ messages in thread
From: Byungchul Park @ 2017-08-24  8:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Thu, Aug 24, 2017 at 04:37:13PM +0900, Byungchul Park wrote:
> On Thu, Aug 24, 2017 at 03:11:53PM +0900, Byungchul Park wrote:
> > On Wed, Aug 23, 2017 at 07:47:14PM +0200, Peter Zijlstra wrote:
> > > Those are fine and are indeed the flush_work() vs work inversion.
> > > 
> > > The two straightforward annotations are:
> > > 
> > > flush_work(work)	process_one_work(wq, work)
> > >   A(work)		  A(work)
> > >   R(work)		  work->func(work);
> > > 			  R(work)
> > > 
> > > Which catches:
> > > 
> > > Task-1:			work:
> > > 
> > >   mutex_lock(&A);	mutex_lock(&A);
> > >   flush_work(work);
> > 
> > I'm not sure but, with LOCKDEP_COMPLETE enabled, this issue would
> > automatically be covered w/o additional A(work)/R(work). Right?
> > 
> > A(work)/R(work) seem to be used for preventing wait_for_completion()
> > in flush_work() from waiting for the completion forever because of the
> > work using mutex_lock(&A). Am I understanding correctly?
> > 
> > If yes, we can use just LOCKDEP_COMPLETE for that purpose.
> 
> I'm not familiar with workqueue, but the manual lockdep_map_acquire() in
> the workqueue code seems to have been introduced to do what
> LOCKDEP_COMPLETE does for wait_for_completion() and complete().
> 
> Wrong?

As I understand more about how the workqueue code works, thanks to
Peterz, I am getting convinced. What they want to detect with
acquire(w/wq) is a deadlock caused by wait_for_completion() mixed with
typical locks.

We have to detect it with the _variables_ it actually waits for, i.e.
the completion variables, not the _work_ or _workqueue_ we are
currently using.
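
Concretely, what flush_work() actually waits on is, roughly and
simplified from kernel/workqueue.c, a barrier work carrying a
completion:

	struct wq_barrier {
		struct work_struct	work;
		struct completion	done;
	};

	/* insert_wq_barrier() queues barr.work right behind the target
	 * work; flush_work() then does: */
	wait_for_completion(&barr.done);

So annotating the completion 'done' directly, as LOCKDEP_COMPLETE does
for every wait_for_completion()/complete() pair, covers the wait itself.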

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-24  8:11                           ` Byungchul Park
@ 2017-08-25  1:14                             ` Byungchul Park
  0 siblings, 0 replies; 50+ messages in thread
From: Byungchul Park @ 2017-08-25  1:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Thu, Aug 24, 2017 at 05:11:01PM +0900, Byungchul Park wrote:
> On Thu, Aug 24, 2017 at 04:37:13PM +0900, Byungchul Park wrote:
> > On Thu, Aug 24, 2017 at 03:11:53PM +0900, Byungchul Park wrote:
> > > On Wed, Aug 23, 2017 at 07:47:14PM +0200, Peter Zijlstra wrote:
> > > > Those are fine and are indeed the flush_work() vs work inversion.
> > > > 
> > > > The two straightforward annotations are:
> > > > 
> > > > flush_work(work)	process_one_work(wq, work)
> > > >   A(work)		  A(work)
> > > >   R(work)		  work->func(work);
> > > > 			  R(work)
> > > > 
> > > > Which catches:
> > > > 
> > > > Task-1:			work:
> > > > 
> > > >   mutex_lock(&A);	mutex_lock(&A);
> > > >   flush_work(work);
> > > 
> > > I'm not sure but, with LOCKDEP_COMPLETE enabled, this issue would
> > > automatically be covered w/o additional A(work)/R(work). Right?
> > > 
> > > A(work)/R(work) seem to be used for preventing wait_for_completion()
> > > in flush_work() from waiting for the completion forever because of the
> > > work using mutex_lock(&A). Am I understanding correctly?
> > > 
> > > If yes, we can use just LOCKDEP_COMPLETE for that purpose.
> > 
> > I'm not familiar with workqueue, but the manual lockdep_map_acquire() in
> > the workqueue code seems to have been introduced to do what
> > LOCKDEP_COMPLETE does for wait_for_completion() and complete().
> > 
> > Wrong?
> 
> As I understand more about how the workqueue code works, thanks to
> Peterz, I am getting convinced. What they want to detect with
> acquire(w/wq) is a deadlock caused by wait_for_completion() mixed with
> typical locks.
> 
> We have to detect it with the _variables_ it actually waits for, i.e.
> the completion variables, not the _work_ or _workqueue_ we are
> currently using.

Please read this more _carefully_.

I would be sorry if I am wrong,

_BUT_,

it could be a key to solving the workqueue issue in the right way, IIUC.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-23 17:47                     ` Peter Zijlstra
  2017-08-24  6:11                       ` Byungchul Park
@ 2017-08-29 15:52                       ` Oleg Nesterov
  2017-08-29 17:07                         ` lockdep && recursive-read Oleg Nesterov
  2017-08-29 17:51                         ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
  1 sibling, 2 replies; 50+ messages in thread
From: Oleg Nesterov @ 2017-08-29 15:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

Peter, sorry for the delay, I didn't have a chance to return to this discussion...

On 08/23, Peter Zijlstra wrote:
>
> > > It was added by Oleg in commit:
> > >
> > >   a67da70dc095 ("workqueues: lockdep annotations for flush_work()")
> >
> > No, these annotations were moved later into start_flush, iiuc...
> >
> > This
> >
> > 	lock_map_acquire(&work->lockdep_map);
> > 	lock_map_release(&work->lockdep_map);
> >
> > was added by another commit 0976dfc1d0cd80a4e9dfaf87bd8744612bde475a
> > "workqueue: Catch more locking problems with flush_work()", and at
> > first glance it is fine.
>
> Those are fine and are indeed the flush_work() vs work inversion.
>
> The two straight forward annotations are:
>
> flush_work(work)	process_one_work(wq, work)
>   A(work)		  A(work)
>   R(work)		  work->func(work);
> 			  R(work)
>
> Which catches:
>
> Task-1:			work:
>
>   mutex_lock(&A);	mutex_lock(&A);
>   flush_work(work);

Yes, yes, this is clear.

But if we ignore the multithreaded workqueues, in this particular case
we could rely on A(wq)/R(wq) in start_flush() and process_one_work().

The problem is that start_flush_work() does not do acquire/release
unconditionally; it does this only if it is going to wait, and I am not
sure this is right...

Plus process_one_work() does lock_map_acquire_read(), I don't really
understand this either.


> And the analogous:
>
> flush_workqueue(wq)	process_one_work(wq, work)
>   A(wq)			  A(wq)
>   R(wq)			  work->func(work);
> 			  R(wq)
>
>
> The thing I puzzled over was flush_work() (really start_flush_work())
> doing:
>
>         if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
>                 lock_map_acquire(&pwq->wq->lockdep_map);
>         else
>                 lock_map_acquire_read(&pwq->wq->lockdep_map);
>         lock_map_release(&pwq->wq->lockdep_map);
>
> Why does flush_work() care about the wq->lockdep_map?
>
> > The answer is that, for single-threaded workqueues, doing
> flush_work() from a work is a potential deadlock:

Yes, but the simple answer is that flush_work() doesn't really differ
from flush_workqueue() in this respect?

If nothing else, if some WORK is the last queued work on WQ, then
flush_work(WORK) is the same thing as flush_workqueue(WQ), more or less.
Again, I am talking about single-threaded workqueues.

> workqueue-thread:
>
> 	work-n:
> 	  flush_work(work-n+1);
>
> 	work-n+1:
>
>
> Will not be going anywhere fast..

Or another example,

	lock(LOCK);
	flush_work(WORK);
	unlock(LOCK);

	workqueue-thread:
		another_pending_work:
			LOCK(LOCK);
			UNLOCK(LOCK);

		WORK:

In this case we do not care about WORK->lockdep_map, but
taking the wq->lockdep_map from flush_work() (if single-threaded) allows
the deadlock to be reported.

Again, this is just like flush_workqueue().

Oleg.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* lockdep && recursive-read
  2017-08-29 15:52                       ` Oleg Nesterov
@ 2017-08-29 17:07                         ` Oleg Nesterov
  2017-08-29 17:30                           ` Peter Zijlstra
  2017-08-29 17:51                         ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
  1 sibling, 1 reply; 50+ messages in thread
From: Oleg Nesterov @ 2017-08-29 17:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On 08/29, Oleg Nesterov wrote:
>
> Plus process_one_work() does lock_map_acquire_read(), I don't really
> understand this either.

and in fact I don't understand lock_map_acquire_read() itself. I mean, read == 2
and this code in check_prevs_add()

		/*
		 * Only non-recursive-read entries get new dependencies
		 * added:
		 */
		if (hlock->read != 2 && hlock->check) {
			if (!check_prev_add(curr, hlock, next,
						distance, &stack_saved))


Well, I forgot everything I ever knew about lockdep, so it's unlikely I
understand what the code above actually does. But I verified that this code

		static DEFINE_SPINLOCK(exlk);
		static DEFINE_RWLOCK(rwlk);

		/* records the dependency exlk -> rwlk */
		spin_lock(&exlk);
		write_lock(&rwlk);
		write_unlock(&rwlk);
		spin_unlock(&exlk);

		/* rwlk -> exlk should close the cycle, but read_lock()
		 * is recursive-read (read == 2), so no dependency is
		 * ever added */
		read_lock(&rwlk);
		spin_lock(&exlk);
		spin_unlock(&exlk);
		read_unlock(&rwlk);

runs without any warning from lockdep. Doesn't look right, or I am totally
confused...

Oleg.

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: lockdep && recursive-read
  2017-08-29 17:07                         ` lockdep && recursive-read Oleg Nesterov
@ 2017-08-29 17:30                           ` Peter Zijlstra
  0 siblings, 0 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-29 17:30 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Tue, Aug 29, 2017 at 07:07:57PM +0200, Oleg Nesterov wrote:
> On 08/29, Oleg Nesterov wrote:
> >
> > Plus process_one_work() does lock_map_acquire_read(), I don't really
> > understand this either.
> 
> and in fact I don't understand lock_map_acquire_read() itself. I mean, read == 2
> and this code in check_prevs_add()
> 
> 		/*
> 		 * Only non-recursive-read entries get new dependencies
> 		 * added:
> 		 */
> 		if (hlock->read != 2 && hlock->check) {
> 			if (!check_prev_add(curr, hlock, next,
> 						distance, &stack_saved))
> 
> 
> Well, I forgot everything I ever knew about lockdep, so it's unlikely I
> understand what the code above actually does. But I verified that this code
> 
> 		static DEFINE_SPINLOCK(exlk);
> 		static DEFINE_RWLOCK(rwlk);
> 
> 		spin_lock(&exlk);
> 		write_lock(&rwlk);
> 		write_unlock(&rwlk);
> 		spin_unlock(&exlk);
> 
> 		read_lock(&rwlk);
> 		spin_lock(&exlk);
> 		spin_unlock(&exlk);
> 		read_unlock(&rwlk);
> 
> runs without any warning from lockdep. Doesn't look right, or I am totally
> confused...

Long-standing lockdep issue that you used to know about ;-)

Boqun recently started working on it:

https://lkml.kernel.org/r/20170828151608.19636-1-boqun.feng@gmail.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING
  2017-08-29 15:52                       ` Oleg Nesterov
  2017-08-29 17:07                         ` lockdep && recursive-read Oleg Nesterov
@ 2017-08-29 17:51                         ` Peter Zijlstra
  1 sibling, 0 replies; 50+ messages in thread
From: Peter Zijlstra @ 2017-08-29 17:51 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Byungchul Park, mingo, linux-kernel, kernel-team,
	Arnaldo Carvalho de Melo, Dave Chinner, Tejun Heo, johannes

On Tue, Aug 29, 2017 at 05:52:05PM +0200, Oleg Nesterov wrote:

> The problem is that start_flush_work() does not do acquire/release
> unconditionally; it does this only if it is going to wait, and I am not
> sure this is right...

Right, I think you're right, we can move it earlier, once we have the
pwq.

> Plus process_one_work() does lock_map_acquire_read(), I don't really
> understand this too.

Right, so aside from recursive-reads being broken, I think that was a
mistake.

> > And the analogous:
> >
> > flush_workqueue(wq)	process_one_work(wq, work)
> >   A(wq)			  A(wq)
> >   R(wq)			  work->func(work);
> > 			  R(wq)
> >
> >
> > The thing I puzzled over was flush_work() (really start_flush_work())
> > doing:
> >
> >         if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
> >                 lock_map_acquire(&pwq->wq->lockdep_map);
> >         else
> >                 lock_map_acquire_read(&pwq->wq->lockdep_map);
> >         lock_map_release(&pwq->wq->lockdep_map);
> >
> > Why does flush_work() care about the wq->lockdep_map?
> >
> > The answer is that, for single-threaded workqueues, doing
> > flush_work() from a work is a potential deadlock:
> 
> Yes, but the simple answer is that flush_work() doesn't really differ
> from flush_workqueue() in this respect?

Right, and I think that the new code (aside from maybe placing it
earlier) does that. If single-threaded we use wq->lockdep_map, otherwise
we (also) use work->lockdep_map.

> If nothing else, if some WORK is the last queued work on WQ, then
> flush_work(WORK) is the same thing as flush_workqueue(WQ), more or less.
> Again, I am talking about single-threaded workqueues.

Right, so single-threaded workqueues are special and are what we need
this extra bit for, but only for single-threaded.

> > workqueue-thread:
> >
> > 	work-n:
> > 	  flush_work(work-n+1);
> >
> > 	work-n+1:
> >
> >
> > Will not be going anywhere fast..
> 
> Or another example,
> 
> 	lock(LOCK);
> 	flush_work(WORK);
> 	unlock(LOCK);
> 
> 	workqueue-thread:
> 		another_pending_work:
> 			LOCK(LOCK);
> 			UNLOCK(LOCK);
> 
> 		WORK:
> 
> In this case we do not care about WORK->lockdep_map, but
> taking the wq->lockdep_map from flush_work() (if single-threaded) allows
> the deadlock to be reported.

Right. And the new code does so.
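
Putting the points agreed on in this thread together, start_flush_work()
would presumably end up doing something like this (a sketch of the
direction discussed above, not the actual patch):

	/* once we have the pwq, before waiting: */
	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) {
		lock_map_acquire(&pwq->wq->lockdep_map);
		lock_map_release(&pwq->wq->lockdep_map);
	}
	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);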

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2017-08-29 17:51 UTC | newest]

Thread overview: 50+ messages
2017-08-17  8:57 [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Byungchul Park
2017-08-17  8:57 ` [PATCH v3 2/3] lockdep: Reword title of LOCKDEP_CROSSRELEASE config Byungchul Park
2017-08-17 10:21   ` [tip:locking/core] locking/lockdep: " tip-bot for Byungchul Park
2017-08-17  8:57 ` [PATCH v3 3/3] lockdep: Rename LOCKDEP_COMPLETE config Byungchul Park
2017-08-17 10:22   ` [tip:locking/core] locking/lockdep: Rename CONFIG_LOCKDEP_COMPLETE to CONFIG_LOCKDEP_COMPLETIONS tip-bot for Byungchul Park
2017-08-17 10:21 ` [tip:locking/core] locking/lockdep: Make CONFIG_LOCKDEP_CROSSRELEASE part of CONFIG_PROVE_LOCKING tip-bot for Byungchul Park
2017-08-17 10:45   ` Ingo Molnar
2017-08-18  5:33     ` Byungchul Park
2017-08-21 15:46 ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
2017-08-22  5:14   ` Byungchul Park
2017-08-22  7:52     ` Peter Zijlstra
2017-08-22  8:51       ` Byungchul Park
2017-08-22  9:21         ` Peter Zijlstra
2017-08-22  9:33           ` Byungchul Park
2017-08-22 10:08             ` Peter Zijlstra
2017-08-22 13:49               ` Peter Zijlstra
2017-08-22 14:46                 ` Peter Zijlstra
2017-08-22 15:10                   ` Peter Zijlstra
2017-08-22 15:59                   ` Oleg Nesterov
2017-08-22 16:35                     ` Peter Zijlstra
2017-08-23 16:39                   ` Oleg Nesterov
2017-08-23 17:47                     ` Peter Zijlstra
2017-08-24  6:11                       ` Byungchul Park
2017-08-24  7:37                         ` Byungchul Park
2017-08-24  8:11                           ` Byungchul Park
2017-08-25  1:14                             ` Byungchul Park
2017-08-29 15:52                       ` Oleg Nesterov
2017-08-29 17:07                         ` lockdep && recursive-read Oleg Nesterov
2017-08-29 17:30                           ` Peter Zijlstra
2017-08-29 17:51                         ` [PATCH v3 1/3] lockdep: Make LOCKDEP_CROSSRELEASE configs all part of PROVE_LOCKING Peter Zijlstra
2017-08-23  2:43                 ` Byungchul Park
2017-08-23  6:31                   ` Byungchul Park
2017-08-23 10:26                   ` Peter Zijlstra
2017-08-24  5:07                     ` Byungchul Park
2017-08-22  5:46   ` Dave Chinner
2017-08-22  9:06     ` Peter Zijlstra
2017-08-22  9:22       ` Byungchul Park
2017-08-22  9:37         ` Peter Zijlstra
2017-08-22  9:42           ` Peter Zijlstra
2017-08-23  2:12           ` Byungchul Park
2017-08-23  6:03             ` Byungchul Park
2017-08-23 10:20             ` Peter Zijlstra
2017-08-24  2:02               ` Byungchul Park
2017-08-24  7:30                 ` Byungchul Park
2017-08-22 21:19       ` Dave Chinner
2017-08-23  2:31       ` Byungchul Park
2017-08-23  6:11         ` Byungchul Park
2017-08-23 10:46         ` Peter Zijlstra
2017-08-24  5:06           ` Byungchul Park
2017-08-23  1:56     ` Byungchul Park
