linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v3 0/2] Refactor snapshot vs nocow writers locking
@ 2020-02-24 15:26 Nikolay Borisov
  2020-02-24 15:26 ` [PATCH v3 1/2] btrfs: convert snapshot/nocow exlcusion to drw lock Nikolay Borisov
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Nikolay Borisov @ 2020-02-24 15:26 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Nikolay Borisov

Here is v3 of the DRW locking patches.

Main changes in this verison:
 * Removed EXPORT_SYMBOL for function since I do not intend to submit the locktorture
 patch
 * Added high-level comment as per David's request.

V2:
* Fixed all checkpatch warnings (Andrea Parri)
* Properly call write_unlock in btrfs_drw_try_write_lock (Filipe Manana)
* Comment fix.
* Stress tested it via locktorture. Survived for 8 straight days on a 4 socket
48 thread machine.

Nikolay Borisov (2):
  btrfs: convert snapshot/nocow exlcusion to drw lock
  btrfs: Hook btrfs' DRW lock to locktorture infrastructure

 fs/btrfs/ctree.h             |  9 +----
 fs/btrfs/disk-io.c           | 46 ++++++---------------
 fs/btrfs/extent-tree.c       | 44 ---------------------
 fs/btrfs/file.c              | 11 +++---
 fs/btrfs/inode.c             |  8 ++--
 fs/btrfs/ioctl.c             | 10 ++---
 fs/btrfs/locking.c           |  5 +++
 fs/btrfs/locking.h           |  1 +
 kernel/locking/locktorture.c | 77 +++++++++++++++++++++++++++++++++++-
 9 files changed, 107 insertions(+), 104 deletions(-)

--
2.17.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v3 1/2] btrfs: convert snapshot/nocow exlcusion to drw lock
  2020-02-24 15:26 [PATCH v3 0/2] Refactor snapshot vs nocow writers locking Nikolay Borisov
@ 2020-02-24 15:26 ` Nikolay Borisov
  2020-02-24 15:26 ` [PATCH v3 2/2] btrfs: Hook btrfs' DRW lock to locktorture infrastructure Nikolay Borisov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Nikolay Borisov @ 2020-02-24 15:26 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Nikolay Borisov

This patch removes all haphazard code implementing nocow writers
exclusion from pending snapshot creation and switches to using the drw
lock to ensure this invariant still holds. "Readers" are snapshot
creators from create_snapshot and 'writers' are nocow writers from
buffered write path or btrfs_setsize. This locking scheme allows for
multiple snapshots to happen while any nocow writers are blocked, since
writes to page cache in the nocow path will make snapshots inconsistent.

So for performance reasons we'd like to have the ability to run multiple
concurrent snapshots and also favors readers in this case. And in case
there aren't pending snapshots (which will be the majority of the cases)
we rely on the percpu's writers counter to avoid cacheline contention.

The main gain from using the drw is it's now a lot easier to reason about
the guarantees of the locking scheme and whether there is some silent
breakage lurking.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/ctree.h       |  9 ++-------
 fs/btrfs/disk-io.c     | 46 ++++++++++--------------------------------
 fs/btrfs/extent-tree.c | 44 ----------------------------------------
 fs/btrfs/file.c        | 11 +++++-----
 fs/btrfs/inode.c       |  8 ++++----
 fs/btrfs/ioctl.c       | 10 +++------
 6 files changed, 25 insertions(+), 103 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index efc2cd147141..91cc25590040 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -957,11 +957,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
-struct btrfs_subvolume_writers {
-	struct percpu_counter	counter;
-	wait_queue_head_t	wait;
-};
-
 /*
  * The state of btrfs root
  */
@@ -1133,8 +1128,8 @@ struct btrfs_root {
 	 * root_item_lock.
 	 */
 	int dedupe_in_progress;
-	struct btrfs_subvolume_writers *subv_writers;
-	atomic_t will_be_snapshotted;
+	struct btrfs_drw_lock snapshot_lock;
+
 	atomic_t snapshot_force_cow;
 
 	/* For qgroup metadata reserved space */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 770d469e1d9c..e2576aa70960 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1104,32 +1104,6 @@ void btrfs_clean_tree_block(struct extent_buffer *buf)
 	}
 }
 
-static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
-{
-	struct btrfs_subvolume_writers *writers;
-	int ret;
-
-	writers = kmalloc(sizeof(*writers), GFP_NOFS);
-	if (!writers)
-		return ERR_PTR(-ENOMEM);
-
-	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
-	if (ret < 0) {
-		kfree(writers);
-		return ERR_PTR(ret);
-	}
-
-	init_waitqueue_head(&writers->wait);
-	return writers;
-}
-
-static void
-btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
-{
-	percpu_counter_destroy(&writers->counter);
-	kfree(writers);
-}
-
 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 			 u64 objectid)
 {
@@ -1178,7 +1152,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	atomic_set(&root->log_writers, 0);
 	atomic_set(&root->log_batch, 0);
 	refcount_set(&root->refs, 1);
-	atomic_set(&root->will_be_snapshotted, 0);
 	atomic_set(&root->snapshot_force_cow, 0);
 	atomic_set(&root->nr_swapfiles, 0);
 	root->log_transid = 0;
@@ -1450,7 +1423,7 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 static int btrfs_init_fs_root(struct btrfs_root *root)
 {
 	int ret;
-	struct btrfs_subvolume_writers *writers;
+	unsigned int nofs_flag;
 
 	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
 	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1460,12 +1433,16 @@ static int btrfs_init_fs_root(struct btrfs_root *root)
 		goto fail;
 	}
 
-	writers = btrfs_alloc_subvolume_writers();
-	if (IS_ERR(writers)) {
-		ret = PTR_ERR(writers);
+	/*
+	 * We might be called under a transaction (e.g. indirect
+	 * backref resolution) which could deadlock if it triggers memory
+	 * reclaim
+	 */
+	nofs_flag = memalloc_nofs_save();
+	ret = btrfs_drw_lock_init(&root->snapshot_lock);
+	memalloc_nofs_restore(nofs_flag);
+	if (ret)
 		goto fail;
-	}
-	root->subv_writers = writers;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
@@ -3961,8 +3938,7 @@ void btrfs_free_fs_root(struct btrfs_root *root)
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
-	if (root->subv_writers)
-		btrfs_free_subvolume_writers(root->subv_writers);
+	btrfs_drw_lock_destroy(&root->snapshot_lock);
 	free_extent_buffer(root->node);
 	free_extent_buffer(root->commit_root);
 	kfree(root->free_ino_ctl);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 7eef91d6c2b6..9dcd70cc3ca3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5740,47 +5740,3 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
 		return bg_ret;
 	return dev_ret;
 }
-
-/*
- * btrfs_{start,end}_write_no_snapshotting() are similar to
- * mnt_{want,drop}_write(), they are used to prevent some tasks from writing
- * data into the page cache through nocow before the subvolume is snapshoted,
- * but flush the data into disk after the snapshot creation, or to prevent
- * operations while snapshotting is ongoing and that cause the snapshot to be
- * inconsistent (writes followed by expanding truncates for example).
- */
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
-{
-	percpu_counter_dec(&root->subv_writers->counter);
-	cond_wake_up(&root->subv_writers->wait);
-}
-
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
-{
-	if (atomic_read(&root->will_be_snapshotted))
-		return 0;
-
-	percpu_counter_inc(&root->subv_writers->counter);
-	/*
-	 * Make sure counter is updated before we check for snapshot creation.
-	 */
-	smp_mb();
-	if (atomic_read(&root->will_be_snapshotted)) {
-		btrfs_end_write_no_snapshotting(root);
-		return 0;
-	}
-	return 1;
-}
-
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
-{
-	while (true) {
-		int ret;
-
-		ret = btrfs_start_write_no_snapshotting(root);
-		if (ret)
-			break;
-		wait_var_event(&root->will_be_snapshotted,
-			       !atomic_read(&root->will_be_snapshotted));
-	}
-}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fd52ad00b6c8..1031a0b5002e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1553,8 +1553,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
 	u64 num_bytes;
 	int ret;
 
-	ret = btrfs_start_write_no_snapshotting(root);
-	if (!ret)
+	if (!btrfs_drw_try_write_lock(&root->snapshot_lock))
 		return -EAGAIN;
 
 	lockstart = round_down(pos, fs_info->sectorsize);
@@ -1569,7 +1568,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
 			NULL, NULL, NULL);
 	if (ret <= 0) {
 		ret = 0;
-		btrfs_end_write_no_snapshotting(root);
+		btrfs_drw_write_unlock(&root->snapshot_lock);
 	} else {
 		*write_bytes = min_t(size_t, *write_bytes ,
 				     num_bytes - pos + lockstart);
@@ -1675,7 +1674,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 						data_reserved, pos,
 						write_bytes);
 			else
-				btrfs_end_write_no_snapshotting(root);
+				btrfs_drw_write_unlock(&root->snapshot_lock);
 			break;
 		}
 
@@ -1779,7 +1778,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 
 		release_bytes = 0;
 		if (only_release_metadata)
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drw_write_unlock(&root->snapshot_lock);
 
 		if (only_release_metadata && copied > 0) {
 			lockstart = round_down(pos,
@@ -1808,7 +1807,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 
 	if (release_bytes) {
 		if (only_release_metadata) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drw_write_unlock(&root->snapshot_lock);
 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
 					release_bytes, true);
 		} else {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 85d6b31ab51c..9a12bdf4969b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4726,16 +4726,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		 * truncation, it must capture all writes that happened before
 		 * this truncation.
 		 */
-		btrfs_wait_for_snapshot_creation(root);
+		btrfs_drw_write_lock(&root->snapshot_lock);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drw_write_unlock(&root->snapshot_lock);
 			return ret;
 		}
 
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans)) {
-			btrfs_end_write_no_snapshotting(root);
+			btrfs_drw_write_unlock(&root->snapshot_lock);
 			return PTR_ERR(trans);
 		}
 
@@ -4743,7 +4743,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		btrfs_inode_safe_disk_i_size_write(inode, 0);
 		pagecache_isize_extended(inode, oldsize, newsize);
 		ret = btrfs_update_inode(trans, root, inode);
-		btrfs_end_write_no_snapshotting(root);
+		btrfs_drw_write_unlock(&root->snapshot_lock);
 		btrfs_end_transaction(trans);
 	} else {
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e97fd0f91406..f4b1ed038378 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -790,11 +790,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	 * possible. This is to avoid later writeback (running dealloc) to
 	 * fallback to COW mode and unexpectedly fail with ENOSPC.
 	 */
-	atomic_inc(&root->will_be_snapshotted);
-	smp_mb__after_atomic();
-	/* wait for no snapshot writes */
-	wait_event(root->subv_writers->wait,
-		   percpu_counter_sum(&root->subv_writers->counter) == 0);
+	btrfs_drw_read_lock(&root->snapshot_lock);
 
 	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret)
@@ -875,8 +871,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 dec_and_free:
 	if (snapshot_force_cow)
 		atomic_dec(&root->snapshot_force_cow);
-	if (atomic_dec_and_test(&root->will_be_snapshotted))
-		wake_up_var(&root->will_be_snapshotted);
+	btrfs_drw_read_unlock(&root->snapshot_lock);
+
 free_pending:
 	kfree(pending_snapshot->root_item);
 	btrfs_free_path(pending_snapshot->path);
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH v3 2/2] btrfs: Hook btrfs' DRW lock to locktorture infrastructure
  2020-02-24 15:26 [PATCH v3 0/2] Refactor snapshot vs nocow writers locking Nikolay Borisov
  2020-02-24 15:26 ` [PATCH v3 1/2] btrfs: convert snapshot/nocow exlcusion to drw lock Nikolay Borisov
@ 2020-02-24 15:26 ` Nikolay Borisov
  2020-02-24 15:32   ` Nikolay Borisov
  2020-02-24 15:32 ` [PATCH 1/2] btrfs: Implement DRW lock Nikolay Borisov
  2020-02-24 16:02 ` [PATCH v3 0/2] Refactor snapshot vs nocow writers locking David Sterba
  3 siblings, 1 reply; 6+ messages in thread
From: Nikolay Borisov @ 2020-02-24 15:26 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Nikolay Borisov

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/locking.c           |  5 +++
 fs/btrfs/locking.h           |  1 +
 kernel/locking/locktorture.c | 77 +++++++++++++++++++++++++++++++++++-
 3 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d890833694c9..e7645f3fd9cd 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -614,6 +614,7 @@ bool btrfs_drw_try_write_lock(struct btrfs_drw_lock *lock)
 
 	return true;
 }
+EXPORT_SYMBOL(btrfs_drw_try_write_lock);
 
 void btrfs_drw_write_lock(struct btrfs_drw_lock *lock)
 {
@@ -623,12 +624,14 @@ void btrfs_drw_write_lock(struct btrfs_drw_lock *lock)
 		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
 	}
 }
+EXPORT_SYMBOL(btrfs_drw_write_lock);
 
 void btrfs_drw_write_unlock(struct btrfs_drw_lock *lock)
 {
 	percpu_counter_dec(&lock->writers);
 	cond_wake_up(&lock->pending_readers);
 }
+EXPORT_SYMBOL(btrfs_drw_write_unlock);
 
 void btrfs_drw_read_lock(struct btrfs_drw_lock *lock)
 {
@@ -645,6 +648,7 @@ void btrfs_drw_read_lock(struct btrfs_drw_lock *lock)
 	wait_event(lock->pending_readers,
 		   percpu_counter_sum(&lock->writers) == 0);
 }
+EXPORT_SYMBOL(btrfs_drw_read_lock);
 
 void btrfs_drw_read_unlock(struct btrfs_drw_lock *lock)
 {
@@ -655,3 +659,4 @@ void btrfs_drw_read_unlock(struct btrfs_drw_lock *lock)
 	if (atomic_dec_and_test(&lock->readers))
 		wake_up(&lock->pending_writers);
 }
+EXPORT_SYMBOL(btrfs_drw_read_unlock);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index ba60318c53d5..c56b0cf59357 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -10,6 +10,7 @@
 #include <linux/atomic.h>
 #include <linux/wait.h>
 #include <linux/percpu_counter.h>
+#include "extent_io.h"
 
 #define BTRFS_WRITE_LOCK 1
 #define BTRFS_READ_LOCK 2
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 99475a66c94f..921afbd58fff 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -29,6 +29,8 @@
 #include <linux/slab.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/torture.h>
+#include "../../fs/btrfs/ctree.h"
+#include "../../fs/btrfs/locking.h"
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
@@ -84,6 +86,7 @@ struct lock_torture_ops {
 
 	unsigned long flags; /* for irq spinlocks */
 	const char *name;
+	bool multiple;
 };
 
 struct lock_torture_cxt {
@@ -599,6 +602,7 @@ static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
 	percpu_up_read(&pcpu_rwsem);
 }
 
+
 static struct lock_torture_ops percpu_rwsem_lock_ops = {
 	.init		= torture_percpu_rwsem_init,
 	.writelock	= torture_percpu_rwsem_down_write,
@@ -611,6 +615,76 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
 	.name		= "percpu_rwsem_lock"
 };
 
+static struct btrfs_drw_lock torture_drw_lock;
+
+void torture_drw_init(void)
+{
+	BUG_ON(btrfs_drw_lock_init(&torture_drw_lock));
+}
+
+static int torture_drw_write_lock(void) __acquires(torture_drw_lock)
+{
+	btrfs_drw_write_lock(&torture_drw_lock);
+	return 0;
+}
+
+static void torture_drw_write_unlock(void) __releases(torture_drw_lock)
+{
+	btrfs_drw_write_unlock(&torture_drw_lock);
+}
+
+static int torture_drw_read_lock(void) __acquires(torture_drw_lock)
+{
+	btrfs_drw_read_lock(&torture_drw_lock);
+	return 0;
+}
+
+static void torture_drw_read_unlock(void) __releases(torture_drw_lock)
+{
+	btrfs_drw_read_unlock(&torture_drw_lock);
+}
+
+static void torture_drw_write_delay(struct torture_random_state *trsp)
+{
+	const unsigned long longdelay_ms = 100;
+
+	/* We want a long delay occasionally to force massive contention.  */
+	if (!(torture_random(trsp) %
+	      (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms * 10);
+	else
+		mdelay(longdelay_ms / 10);
+	if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
+		torture_preempt_schedule();  /* Allow test to be preempted. */
+}
+
+static void torture_drw_read_delay(struct torture_random_state *trsp)
+{
+	const unsigned long longdelay_ms = 100;
+
+	/* We want a long delay occasionally to force massive contention.  */
+	if (!(torture_random(trsp) %
+	      (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
+		mdelay(longdelay_ms * 2);
+	else
+		mdelay(longdelay_ms / 2);
+	if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
+		torture_preempt_schedule();  /* Allow test to be preempted. */
+}
+
+static struct lock_torture_ops btrfs_drw_lock_ops = {
+	.init		= torture_drw_init,
+	.writelock	= torture_drw_write_lock,
+	.write_delay	= torture_drw_write_delay,
+	.task_boost     = torture_boost_dummy,
+	.writeunlock	= torture_drw_write_unlock,
+	.readlock       = torture_drw_read_lock,
+	.read_delay     = torture_drw_read_delay, /* figure what to do with this */
+	.readunlock     = torture_drw_read_unlock,
+	.multiple = true,
+	.name		= "btrfs_drw_lock"
+};
+
 /*
  * Lock torture writer kthread.  Repeatedly acquires and releases
  * the lock, checking for duplicate acquisitions.
@@ -629,7 +703,7 @@ static int lock_torture_writer(void *arg)
 
 		cxt.cur_ops->task_boost(&rand);
 		cxt.cur_ops->writelock();
-		if (WARN_ON_ONCE(lock_is_write_held))
+		if (!cxt.cur_ops->multiple && WARN_ON_ONCE(lock_is_write_held))
 			lwsp->n_lock_fail++;
 		lock_is_write_held = 1;
 		if (WARN_ON_ONCE(lock_is_read_held))
@@ -851,6 +925,7 @@ static int __init lock_torture_init(void)
 #endif
 		&rwsem_lock_ops,
 		&percpu_rwsem_lock_ops,
+		&btrfs_drw_lock_ops
 	};
 
 	if (!torture_init_begin(torture_type, verbose))
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 1/2] btrfs: Implement DRW lock
  2020-02-24 15:26 [PATCH v3 0/2] Refactor snapshot vs nocow writers locking Nikolay Borisov
  2020-02-24 15:26 ` [PATCH v3 1/2] btrfs: convert snapshot/nocow exlcusion to drw lock Nikolay Borisov
  2020-02-24 15:26 ` [PATCH v3 2/2] btrfs: Hook btrfs' DRW lock to locktorture infrastructure Nikolay Borisov
@ 2020-02-24 15:32 ` Nikolay Borisov
  2020-02-24 16:02 ` [PATCH v3 0/2] Refactor snapshot vs nocow writers locking David Sterba
  3 siblings, 0 replies; 6+ messages in thread
From: Nikolay Borisov @ 2020-02-24 15:32 UTC (permalink / raw)
  To: linux-btrfs; +Cc: Nikolay Borisov

A (D)ouble (R)eader (W)riter lock is a locking primitive that allows
to have multiple readers or multiple writers but not multiple readers
and writers holding it concurrently. The code is factored out from
the existing open-coded locking scheme used to exclude pending
snapshots from nocow writers and vice-versa. Current implementation
actually favors Readers (that is snapshot creaters) to writers (nocow
writers of the filesystem).

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/locking.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/locking.h | 21 +++++++++++
 3 files changed, 112 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ad275d06e95f..efc2cd147141 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
 #include "extent_map.h"
 #include "async-thread.h"
 #include "block-rsv.h"
+#include "locking.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index e713900f96b6..d890833694c9 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -565,3 +565,93 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 	}
 	return eb;
 }
+
+/*
+ * DRW stands for double-reader-writer lock. It's used in situation where you
+ * want to provide A-B exclusion but not AA or BB. Currently implementation
+ * gives more priority to reader. If a reader and a writer both race to acquire
+ * their respective sides of the lock the writer would yield its lock as soon as
+ * it detects a concurrent reader. Additionally if there are pending readers no
+ * new writers would be allowed to come in and acquire the lock.
+ */
+int btrfs_drw_lock_init(struct btrfs_drw_lock *lock)
+{
+	int ret;
+
+	ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	atomic_set(&lock->readers, 0);
+	init_waitqueue_head(&lock->pending_readers);
+	init_waitqueue_head(&lock->pending_writers);
+
+	return 0;
+}
+
+void btrfs_drw_lock_destroy(struct btrfs_drw_lock *lock)
+{
+	percpu_counter_destroy(&lock->writers);
+}
+
+/* Return true if acquisition is successful, false otherwise */
+bool btrfs_drw_try_write_lock(struct btrfs_drw_lock *lock)
+{
+	if (atomic_read(&lock->readers))
+		return false;
+
+	percpu_counter_inc(&lock->writers);
+
+	/*
+	 * Ensure writers count is updated before we check for
+	 * pending readers
+	 */
+	smp_mb();
+	if (atomic_read(&lock->readers)) {
+		btrfs_drw_write_unlock(lock);
+		return false;
+	}
+
+	return true;
+}
+
+void btrfs_drw_write_lock(struct btrfs_drw_lock *lock)
+{
+	while (true) {
+		if (btrfs_drw_try_write_lock(lock))
+			return;
+		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
+	}
+}
+
+void btrfs_drw_write_unlock(struct btrfs_drw_lock *lock)
+{
+	percpu_counter_dec(&lock->writers);
+	cond_wake_up(&lock->pending_readers);
+}
+
+void btrfs_drw_read_lock(struct btrfs_drw_lock *lock)
+{
+	atomic_inc(&lock->readers);
+
+	/*
+	 * Ensure the pending reader count is perceieved BEFORE this reader
+	 * goes to sleep in case of active writers. This guarantees new writers
+	 * won't be allowed and that the current reader will be woken up when
+	 * the last active writer finishes its jobs.
+	 */
+	smp_mb__after_atomic();
+
+	wait_event(lock->pending_readers,
+		   percpu_counter_sum(&lock->writers) == 0);
+}
+
+void btrfs_drw_read_unlock(struct btrfs_drw_lock *lock)
+{
+	/*
+	 * atomic_dec_and_test implies a full barrier, so woken up writers
+	 * are guaranteed to see the decrement
+	 */
+	if (atomic_dec_and_test(&lock->readers))
+		wake_up(&lock->pending_writers);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 21a285883e89..ba60318c53d5 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -7,12 +7,17 @@
 #define BTRFS_LOCKING_H
 
 #include "extent_io.h"
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
 
 #define BTRFS_WRITE_LOCK 1
 #define BTRFS_READ_LOCK 2
 #define BTRFS_WRITE_LOCK_BLOCKING 3
 #define BTRFS_READ_LOCK_BLOCKING 4
 
+struct btrfs_path;
+
 void btrfs_tree_lock(struct extent_buffer *eb);
 void btrfs_tree_unlock(struct extent_buffer *eb);
 
@@ -48,4 +53,20 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
 		BUG();
 }
 
+
+struct btrfs_drw_lock {
+	atomic_t readers;
+	struct percpu_counter writers;
+	wait_queue_head_t pending_writers;
+	wait_queue_head_t pending_readers;
+};
+
+int btrfs_drw_lock_init(struct btrfs_drw_lock *lock);
+void btrfs_drw_lock_destroy(struct btrfs_drw_lock *lock);
+void btrfs_drw_write_lock(struct btrfs_drw_lock *lock);
+bool btrfs_drw_try_write_lock(struct btrfs_drw_lock *lock);
+void btrfs_drw_write_unlock(struct btrfs_drw_lock *lock);
+void btrfs_drw_read_lock(struct btrfs_drw_lock *lock);
+void btrfs_drw_read_unlock(struct btrfs_drw_lock *lock);
+
 #endif
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v3 2/2] btrfs: Hook btrfs' DRW lock to locktorture infrastructure
  2020-02-24 15:26 ` [PATCH v3 2/2] btrfs: Hook btrfs' DRW lock to locktorture infrastructure Nikolay Borisov
@ 2020-02-24 15:32   ` Nikolay Borisov
  0 siblings, 0 replies; 6+ messages in thread
From: Nikolay Borisov @ 2020-02-24 15:32 UTC (permalink / raw)
  To: linux-btrfs



On 24.02.20 г. 17:26 ч., Nikolay Borisov wrote:
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>

Ignore this one and instead replace it with the Implement DRW locking
that I just sent ...

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v3 0/2] Refactor snapshot vs nocow writers locking
  2020-02-24 15:26 [PATCH v3 0/2] Refactor snapshot vs nocow writers locking Nikolay Borisov
                   ` (2 preceding siblings ...)
  2020-02-24 15:32 ` [PATCH 1/2] btrfs: Implement DRW lock Nikolay Borisov
@ 2020-02-24 16:02 ` David Sterba
  3 siblings, 0 replies; 6+ messages in thread
From: David Sterba @ 2020-02-24 16:02 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: linux-btrfs

On Mon, Feb 24, 2020 at 05:26:35PM +0200, Nikolay Borisov wrote:
> Here is v3 of the DRW locking patches.

As discussed, let's rename it to DREW locks, where E stands for
exclusion: double reader/writer exclusion.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-02-24 16:03 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-24 15:26 [PATCH v3 0/2] Refactor snapshot vs nocow writers locking Nikolay Borisov
2020-02-24 15:26 ` [PATCH v3 1/2] btrfs: convert snapshot/nocow exlcusion to drw lock Nikolay Borisov
2020-02-24 15:26 ` [PATCH v3 2/2] btrfs: Hook btrfs' DRW lock to locktorture infrastructure Nikolay Borisov
2020-02-24 15:32   ` Nikolay Borisov
2020-02-24 15:32 ` [PATCH 1/2] btrfs: Implement DRW lock Nikolay Borisov
2020-02-24 16:02 ` [PATCH v3 0/2] Refactor snapshot vs nocow writers locking David Sterba

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).