[PATCH RFC] kernel/locking, fs/direct-io: Introduce and use down_write_nolockdep()

* [PATCH RFC] kernel/locking, fs/direct-io: Introduce and use down_write_nolockdep()
@ 2018-10-26 16:49 Bart Van Assche
  2018-10-26 17:43 ` Matthew Wilcox
  2018-10-27  5:37 ` Dave Chinner
  0 siblings, 2 replies; 9+ messages in thread
From: Bart Van Assche @ 2018-10-26 16:49 UTC (permalink / raw)
  To: Alexander Viro
  Cc: linux-fsdevel, Johannes Berg, Bart Van Assche, Peter Zijlstra,
	Ingo Molnar, Theodore Ts'o

The rwsem locking functions contain lockdep annotations (down_write(),
up_write(), down_read(), up_read(), ...). Unfortunately lockdep and
semaphores are not a good match because lockdep assumes that releasing
a synchronization object will happen from the same kernel thread that
acquired the synchronization object. This is the case for most but not
all semaphore locking calls. When a semaphore is released from another
context than the context from which it has been acquired lockdep may
report a false positive circular locking report. This patch avoids
that lockdep reports the false positive report shown below for the
direct I/O code. Please note that the lockdep complaint shown below is
not the same as a closely related lockdep complaint that has been
reported recently by syzbot. This patch has been tested on top of a
patch that was suggested by Ted and Tejun, namely to change a
destroy_workqueue() call in the direct-io code into a call to
destroy_workqueue_no_drain(). For the syzbot report and the discussion
of that report, see also https://lkml.org/lkml/2018/10/23/511.

======================================================
WARNING: possible circular locking dependency detected
4.19.0-dbg+ #14 Not tainted
------------------------------------------------------
kworker/3:1/56 is trying to acquire lock:
0000000076123785 (&sb->s_type->i_mutex_key#14){+.+.}, at:
__generic_file_fsync+0x77/0xf0

but task is already holding lock:
000000006a866e67 ((work_completion)(&dio->complete_work)){.+.+}, at:
process_one_work+0x3c9/0x9f0

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 ((work_completion)(&dio->complete_work)){.+.+}:
       process_one_work+0x44d/0x9f0
       worker_thread+0x63/0x5a0
       kthread+0x1cf/0x1f0
       ret_from_fork+0x24/0x30

-> #1 ((wq_completion)"dio/%s"sb->s_id){++++}:
       flush_workqueue+0xf3/0x970
       drain_workqueue+0xec/0x220
       destroy_workqueue+0x23/0x350
       sb_init_dio_done_wq+0x6a/0x80
       do_blockdev_direct_IO+0x1f33/0x4be0
       __blockdev_direct_IO+0x79/0x86
       ext4_direct_IO+0x5df/0xbb0
       generic_file_direct_write+0x119/0x220
       __generic_file_write_iter+0x131/0x2d0
       ext4_file_write_iter+0x3fa/0x710
       aio_write+0x235/0x330
       io_submit_one+0x510/0xeb0
       __x64_sys_io_submit+0x122/0x340
       do_syscall_64+0x71/0x220
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #0 (&sb->s_type->i_mutex_key#14){+.+.}:
       lock_acquire+0xc5/0x200
       down_write+0x3d/0x80
       __generic_file_fsync+0x77/0xf0
       ext4_sync_file+0x3c9/0x780
       vfs_fsync_range+0x66/0x100
       dio_complete+0x2f5/0x360
       dio_aio_complete_work+0x1c/0x20
       process_one_work+0x487/0x9f0
       worker_thread+0x63/0x5a0
       kthread+0x1cf/0x1f0
       ret_from_fork+0x24/0x30

other info that might help us debug this:

Chain exists of:
  &sb->s_type->i_mutex_key#14 --> (wq_completion)"dio/%s"sb->s_id -->
(work_completion)(&dio->complete_work)

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock((work_completion)(&dio->complete_work));
                               lock((wq_completion)"dio/%s"sb->s_id);
                               lock((work_completion)(&dio->complete_work));
  lock(&sb->s_type->i_mutex_key#14);

 *** DEADLOCK ***

2 locks held by kworker/3:1/56:
 #0: 000000005cbfa331 ((wq_completion)"dio/%s"sb->s_id){++++}, at:
process_one_work+0x3c9/0x9f0
 #1: 000000006a866e67 ((work_completion)(&dio->complete_work)){.+.+}, at:
process_one_work+0x3c9/0x9f0

stack backtrace:
CPU: 3 PID: 56 Comm: kworker/3:1 Not tainted 4.19.0-dbg+ #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1
04/01/2014
Workqueue: dio/dm-0 dio_aio_complete_work
Call Trace:
 dump_stack+0x86/0xc5
 print_circular_bug.isra.32+0x20a/0x218
 __lock_acquire+0x1c68/0x1cf0
 lock_acquire+0xc5/0x200
 down_write+0x3d/0x80
 __generic_file_fsync+0x77/0xf0
 ext4_sync_file+0x3c9/0x780
 vfs_fsync_range+0x66/0x100
 dio_complete+0x2f5/0x360
 dio_aio_complete_work+0x1c/0x20
 process_one_work+0x487/0x9f0
 worker_thread+0x63/0x5a0
 kthread+0x1cf/0x1f0
 ret_from_fork+0x24/0x30

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 fs/direct-io.c                  |  2 +-
 include/linux/fs.h              |  5 ++++
 include/linux/rwsem.h           |  5 ++++
 kernel/locking/rwsem-spinlock.c |  1 +
 kernel/locking/rwsem-xadd.c     |  1 +
 kernel/locking/rwsem.c          | 51 +++++++++++++++++++++++++++++----
 mm/rmap.c                       |  1 +
 7 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index b49f40594d3b..426ed5b4fe66 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1221,7 +1221,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 					iocb->ki_filp->f_mapping;
 
 			/* will be released by direct_io_worker */
-			inode_lock(inode);
+			inode_lock_nolockdep(inode);
 
 			retval = filemap_write_and_wait_range(mapping, offset,
 							      end - 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 897eae8faee1..0bbfde0af5b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -733,6 +733,11 @@ enum inode_i_mutex_lock_class
 	I_MUTEX_PARENT2,
 };
 
+static inline void inode_lock_nolockdep(struct inode *inode)
+{
+	down_write_nolockdep(&inode->i_rwsem);
+}
+
 static inline void inode_lock(struct inode *inode)
 {
 	down_write(&inode->i_rwsem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 67dbb57508b1..4354de397f1b 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -41,6 +41,10 @@ struct rw_semaphore {
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
+	/*
+	 * Number of up_write() calls that must skip rwsem_release().
+	 */
+	unsigned		nolockdep;
 #endif
 };
 
@@ -128,6 +132,7 @@ extern int down_read_trylock(struct rw_semaphore *sem);
 /*
  * lock for writing
  */
+extern void down_write_nolockdep(struct rw_semaphore *sem);
 extern void down_write(struct rw_semaphore *sem);
 extern int __must_check down_write_killable(struct rw_semaphore *sem);
 
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index a7ffb2a96ede..d6ca3c41681c 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -47,6 +47,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	 */
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
+	sem->nolockdep = 0;
 #endif
 	sem->count = 0;
 	raw_spin_lock_init(&sem->wait_lock);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 09b180063ee1..b7bab2ddf6c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,6 +82,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	 */
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
+	sem->nolockdep = 0;
 #endif
 	atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
 	raw_spin_lock_init(&sem->wait_lock);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..aed55ce92ec3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -61,18 +61,52 @@ int down_read_trylock(struct rw_semaphore *sem)
 
 EXPORT_SYMBOL(down_read_trylock);
 
-/*
- * lock for writing
- */
-void __sched down_write(struct rw_semaphore *sem)
+void down_write_impl(struct rw_semaphore *sem)
 {
 	might_sleep();
-	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
 
 	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
 	rwsem_set_owner(sem);
 }
 
+/*
+ * down_write - lock for writing without informing lockdep
+ * @sem: Semaphore that serializes writers against other readers and writers.
+ *
+ * Lockdep assumes that up_write() will be called from the same thread that
+ * calls down_write(). That can result in false positive lockdep complaints
+ * if another thread will call up_write().
+ */
+void __sched down_write_nolockdep(struct rw_semaphore *sem)
+{
+	down_write_impl(sem);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * It is assumed that up_write() will not be called (from another
+	 * thread) before down_write() returns.
+	 */
+	sem->nolockdep++;
+#endif
+}
+EXPORT_SYMBOL(down_write_nolockdep);
+
+/**
+ * down_write - lock for writing
+ * @sem: Semaphore that serializes writers against other readers and writers.
+ */
+void __sched down_write(struct rw_semaphore *sem)
+{
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+	down_write_impl(sem);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/*
+	 * Complain if down_write() is called without having called
+	 * init_rwsem() first.
+	 */
+	WARN_ON_ONCE(sem->nolockdep);
+#endif
+}
+
 EXPORT_SYMBOL(down_write);
 
 /*
@@ -130,7 +164,12 @@ EXPORT_SYMBOL(up_read);
  */
 void up_write(struct rw_semaphore *sem)
 {
-	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (sem->nolockdep == 0)
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+	else
+		sem->nolockdep--;
+#endif
 	DEBUG_RWSEMS_WARN_ON(sem->owner != current);
 
 	rwsem_clear_owner(sem);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1e79fac3186b..2a953d3b7431 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -81,6 +81,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
 
 	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 	if (anon_vma) {
+		init_rwsem(&anon_vma->rwsem);
 		atomic_set(&anon_vma->refcount, 1);
 		anon_vma->degree = 1;	/* Reference for first vma */
 		anon_vma->parent = anon_vma;
-- 
2.19.1.568.g152ad8e336-goog

^ permalink raw reply related	[flat|nested] 9+ messages in thread