* [PATCH 0/3] super: allow waiting without s_umount held
@ 2023-08-17 10:47 Christian Brauner
  2023-08-17 10:47 ` [PATCH 1/3] super: use super_{lock,unlock}() Christian Brauner
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 10:47 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig; +Cc: Al Viro, linux-fsdevel, Christian Brauner

Hey everyone,

This is an attempt to allow concurrent mounters and iterators to wait
on superblock state changes without having to hold s_umount. This is
made necessary by recent attempts to open block devices after superblock
creation and to fix deadlocks caused by blkdev_put() trying to acquire
s_umount while s_umount is already held.
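
To illustrate the inversion, here is a minimal userspace model (a sketch,
not kernel code: two pthread mutexes stand in for s_umount and open_mutex,
the function names are invented for the sketch, and the call chains are
only hinted at in the comments):

/*
 * Userspace model of the ABBA inversion described above. Two pthread
 * mutexes stand in for s_umount and open_mutex; the real locks are a
 * rw_semaphore and a mutex. Build with: cc -pthread abba.c
 */
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t s_umount = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t open_mutex = PTHREAD_MUTEX_INITIALIZER;

/* mounter: sget{_fc}() returned with s_umount held, now open the bdev */
static void *mounter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&s_umount);
	sleep(1);				/* widen the race window */
	pthread_mutex_lock(&open_mutex);	/* open the block device */
	pthread_mutex_unlock(&open_mutex);
	pthread_mutex_unlock(&s_umount);
	return NULL;
}

/* closer: blkdev_put() holds open_mutex, then wants s_umount */
static void *closer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&open_mutex);
	sleep(1);
	pthread_mutex_lock(&s_umount);
	pthread_mutex_unlock(&s_umount);
	pthread_mutex_unlock(&open_mutex);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, mounter, NULL);
	pthread_create(&b, NULL, closer, NULL);
	pthread_join(a, NULL);	/* never returns: ABBA deadlock by design */
	pthread_join(b, NULL);
	return 0;
}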

This is on top of Jan's and Christoph's work in vfs.super. Obviously not
for v6.6. I guess this is in between an RFC and meaning it. I hope I got
it right but this is intricate. 

It reliably survives xfstests for btrfs, ext4, and xfs while 7
concurrent processes hammer on @super_blocks via ustat() and a while
true loop hammers on sget{_fc}() by trying to mount a filesystem with
an invalid superblock.
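
For reference, the ustat() hammer can be sketched roughly like this
(assumptions, not the actual test harness: x86_64, a filesystem mounted
on device 8:1; glibc no longer ships a ustat() wrapper, hence the raw
syscall()):

/*
 * Rough sketch of the ustat() hammer. Each call walks @super_blocks
 * under sb_lock and takes s_umount via user_get_super(). Adjust the
 * device numbers to a filesystem mounted on the test box.
 */
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <unistd.h>

int main(void)
{
	unsigned long dev = makedev(8, 1);	/* assumption: sda1 */
	char buf[256];		/* big enough for struct ustat */

	for (;;)
		syscall(SYS_ustat, dev, buf);
}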

Thanks and don't hit me over the head with things.
Christian

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
Christian Brauner (3):
      super: use super_{lock,unlock}()
      super: wait for nascent superblocks
      super: wait until we passed kill super

 fs/super.c         | 327 ++++++++++++++++++++++++++++++++++++++++++-----------
 include/linux/fs.h |   2 +
 2 files changed, 261 insertions(+), 68 deletions(-)
---
base-commit: f3aeab61fb15edef1e81828da8dbf0814541e49b
change-id: 20230816-vfs-super-fixes-v3-f2cff6192a50



* [PATCH 1/3] super: use super_{lock,unlock}()
  2023-08-17 10:47 [PATCH 0/3] super: allow waiting without s_umount held Christian Brauner
@ 2023-08-17 10:47 ` Christian Brauner
  2023-08-17 11:39   ` Jan Kara
  2023-08-17 10:47 ` [PATCH 2/3] super: wait for nascent superblocks Christian Brauner
  2023-08-17 10:47 ` [PATCH 3/3] super: wait until we passed kill super Christian Brauner
  2 siblings, 1 reply; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 10:47 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig; +Cc: Al Viro, linux-fsdevel, Christian Brauner

Replace the open-coded {down,up}_{read,write}() calls with simple
wrappers. Follow-up patches will benefit from this as well.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c | 126 ++++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 78 insertions(+), 48 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index c878e7373f93..878675921bdc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -50,6 +50,42 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_internal",
 };
 
+static inline void super_lock(struct super_block *sb, bool excl)
+{
+	if (excl)
+		down_write(&sb->s_umount);
+	else
+		down_read(&sb->s_umount);
+}
+
+static inline void super_unlock(struct super_block *sb, bool excl)
+{
+	if (excl)
+		up_write(&sb->s_umount);
+	else
+		up_read(&sb->s_umount);
+}
+
+static inline void super_lock_write(struct super_block *sb)
+{
+	super_lock(sb, true);
+}
+
+static inline void super_lock_read(struct super_block *sb)
+{
+	super_lock(sb, false);
+}
+
+static inline void super_unlock_write(struct super_block *sb)
+{
+	super_unlock(sb, true);
+}
+
+static inline void super_unlock_read(struct super_block *sb)
+{
+	super_unlock(sb, false);
+}
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -110,7 +146,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
-	up_read(&sb->s_umount);
+	super_unlock_read(sb);
 	return freed;
 }
 
@@ -176,7 +212,7 @@ static void destroy_unused_super(struct super_block *s)
 {
 	if (!s)
 		return;
-	up_write(&s->s_umount);
+	super_unlock_write(s);
 	list_lru_destroy(&s->s_dentry_lru);
 	list_lru_destroy(&s->s_inode_lru);
 	security_sb_free(s);
@@ -340,7 +376,7 @@ void deactivate_locked_super(struct super_block *s)
 		put_filesystem(fs);
 		put_super(s);
 	} else {
-		up_write(&s->s_umount);
+		super_unlock_write(s);
 	}
 }
 
@@ -357,7 +393,7 @@ EXPORT_SYMBOL(deactivate_locked_super);
 void deactivate_super(struct super_block *s)
 {
 	if (!atomic_add_unless(&s->s_active, -1, 1)) {
-		down_write(&s->s_umount);
+		super_lock_write(s);
 		deactivate_locked_super(s);
 	}
 }
@@ -381,12 +417,12 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
 {
 	s->s_count++;
 	spin_unlock(&sb_lock);
-	down_write(&s->s_umount);
+	super_lock_write(s);
 	if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) {
 		put_super(s);
 		return 1;
 	}
-	up_write(&s->s_umount);
+	super_unlock_write(s);
 	put_super(s);
 	return 0;
 }
@@ -414,7 +450,7 @@ bool trylock_super(struct super_block *sb)
 		if (!hlist_unhashed(&sb->s_instances) &&
 		    sb->s_root && (sb->s_flags & SB_BORN))
 			return true;
-		up_read(&sb->s_umount);
+		super_unlock_read(sb);
 	}
 
 	return false;
@@ -439,13 +475,13 @@ bool trylock_super(struct super_block *sb)
 void retire_super(struct super_block *sb)
 {
 	WARN_ON(!sb->s_bdev);
-	down_write(&sb->s_umount);
+	super_lock_write(sb);
 	if (sb->s_iflags & SB_I_PERSB_BDI) {
 		bdi_unregister(sb->s_bdi);
 		sb->s_iflags &= ~SB_I_PERSB_BDI;
 	}
 	sb->s_iflags |= SB_I_RETIRED;
-	up_write(&sb->s_umount);
+	super_unlock_write(sb);
 }
 EXPORT_SYMBOL(retire_super);
 
@@ -521,7 +557,7 @@ void generic_shutdown_super(struct super_block *sb)
 	/* should be initialized for __put_super_and_need_restart() */
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
-	up_write(&sb->s_umount);
+	super_unlock_write(sb);
 	if (sb->s_bdi != &noop_backing_dev_info) {
 		if (sb->s_iflags & SB_I_PERSB_BDI)
 			bdi_unregister(sb->s_bdi);
@@ -685,7 +721,7 @@ EXPORT_SYMBOL(sget);
 
 void drop_super(struct super_block *sb)
 {
-	up_read(&sb->s_umount);
+	super_unlock_read(sb);
 	put_super(sb);
 }
 
@@ -693,7 +729,7 @@ EXPORT_SYMBOL(drop_super);
 
 void drop_super_exclusive(struct super_block *sb)
 {
-	up_write(&sb->s_umount);
+	super_unlock_write(sb);
 	put_super(sb);
 }
 EXPORT_SYMBOL(drop_super_exclusive);
@@ -739,10 +775,10 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 
-		down_read(&sb->s_umount);
+		super_lock_read(sb);
 		if (sb->s_root && (sb->s_flags & SB_BORN))
 			f(sb, arg);
-		up_read(&sb->s_umount);
+		super_unlock_read(sb);
 
 		spin_lock(&sb_lock);
 		if (p)
@@ -773,10 +809,10 @@ void iterate_supers_type(struct file_system_type *type,
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 
-		down_read(&sb->s_umount);
+		super_lock_read(sb);
 		if (sb->s_root && (sb->s_flags & SB_BORN))
 			f(sb, arg);
-		up_read(&sb->s_umount);
+		super_unlock_read(sb);
 
 		spin_lock(&sb_lock);
 		if (p)
@@ -813,7 +849,7 @@ struct super_block *get_active_super(struct block_device *bdev)
 		if (sb->s_bdev == bdev) {
 			if (!grab_super(sb))
 				goto restart;
-			up_write(&sb->s_umount);
+			super_unlock_write(sb);
 			return sb;
 		}
 	}
@@ -833,17 +869,11 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 		if (sb->s_dev ==  dev) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
-			if (excl)
-				down_write(&sb->s_umount);
-			else
-				down_read(&sb->s_umount);
+			super_lock(sb, excl);
 			/* still alive? */
 			if (sb->s_root && (sb->s_flags & SB_BORN))
 				return sb;
-			if (excl)
-				up_write(&sb->s_umount);
-			else
-				up_read(&sb->s_umount);
+			super_unlock(sb, excl);
 			/* nope, got unmounted */
 			spin_lock(&sb_lock);
 			__put_super(sb);
@@ -889,9 +919,9 @@ int reconfigure_super(struct fs_context *fc)
 
 	if (remount_ro) {
 		if (!hlist_empty(&sb->s_pins)) {
-			up_write(&sb->s_umount);
+			super_unlock_write(sb);
 			group_pin_kill(&sb->s_pins);
-			down_write(&sb->s_umount);
+			super_lock_write(sb);
 			if (!sb->s_root)
 				return 0;
 			if (sb->s_writers.frozen != SB_UNFROZEN)
@@ -954,7 +984,7 @@ int reconfigure_super(struct fs_context *fc)
 
 static void do_emergency_remount_callback(struct super_block *sb)
 {
-	down_write(&sb->s_umount);
+	super_lock_write(sb);
 	if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
 	    !sb_rdonly(sb)) {
 		struct fs_context *fc;
@@ -967,7 +997,7 @@ static void do_emergency_remount_callback(struct super_block *sb)
 			put_fs_context(fc);
 		}
 	}
-	up_write(&sb->s_umount);
+	super_unlock_write(sb);
 }
 
 static void do_emergency_remount(struct work_struct *work)
@@ -990,12 +1020,12 @@ void emergency_remount(void)
 
 static void do_thaw_all_callback(struct super_block *sb)
 {
-	down_write(&sb->s_umount);
+	super_lock_write(sb);
 	if (sb->s_root && sb->s_flags & SB_BORN) {
 		emergency_thaw_bdev(sb);
 		thaw_super_locked(sb);
 	} else {
-		up_write(&sb->s_umount);
+		super_unlock_write(sb);
 	}
 }
 
@@ -1182,10 +1212,10 @@ EXPORT_SYMBOL(get_tree_keyed);
  */
 static bool lock_active_super(struct super_block *sb)
 {
-	down_read(&sb->s_umount);
+	super_lock_read(sb);
 	if (!sb->s_root ||
 	    (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) {
-		up_read(&sb->s_umount);
+		super_unlock_read(sb);
 		return false;
 	}
 	return true;
@@ -1208,7 +1238,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 	if (sb->s_op->shutdown)
 		sb->s_op->shutdown(sb);
 
-	up_read(&sb->s_umount);
+	super_unlock_read(sb);
 }
 
 static void fs_bdev_sync(struct block_device *bdev)
@@ -1220,7 +1250,7 @@ static void fs_bdev_sync(struct block_device *bdev)
 	if (!lock_active_super(sb))
 		return;
 	sync_filesystem(sb);
-	up_read(&sb->s_umount);
+	super_unlock_read(sb);
 }
 
 const struct blk_holder_ops fs_holder_ops = {
@@ -1342,9 +1372,9 @@ int get_tree_bdev(struct fs_context *fc,
 		 * bdev_mark_dead()). It is safe because we have active sb
 		 * reference and SB_BORN is not set yet.
 		 */
-		up_write(&s->s_umount);
+		super_unlock_write(s);
 		error = setup_bdev_super(s, fc->sb_flags, fc);
-		down_write(&s->s_umount);
+		super_lock_write(s);
 		if (!error)
 			error = fill_super(s, fc);
 		if (error) {
@@ -1394,9 +1424,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		 * bdev_mark_dead()). It is safe because we have active sb
 		 * reference and SB_BORN is not set yet.
 		 */
-		up_write(&s->s_umount);
+		super_unlock_write(s);
 		error = setup_bdev_super(s, flags, NULL);
-		down_write(&s->s_umount);
+		super_lock_write(s);
 		if (!error)
 			error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
 		if (error) {
@@ -1685,29 +1715,29 @@ int freeze_super(struct super_block *sb)
 	int ret;
 
 	atomic_inc(&sb->s_active);
-	down_write(&sb->s_umount);
+	super_lock_write(sb);
 	if (sb->s_writers.frozen != SB_UNFROZEN) {
 		deactivate_locked_super(sb);
 		return -EBUSY;
 	}
 
 	if (!(sb->s_flags & SB_BORN)) {
-		up_write(&sb->s_umount);
+		super_unlock_write(sb);
 		return 0;	/* sic - it's "nothing to do" */
 	}
 
 	if (sb_rdonly(sb)) {
 		/* Nothing to do really... */
 		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
-		up_write(&sb->s_umount);
+		super_unlock_write(sb);
 		return 0;
 	}
 
 	sb->s_writers.frozen = SB_FREEZE_WRITE;
 	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
-	up_write(&sb->s_umount);
+	super_unlock_write(sb);
 	sb_wait_write(sb, SB_FREEZE_WRITE);
-	down_write(&sb->s_umount);
+	super_lock_write(sb);
 
 	/* Now we go and block page faults... */
 	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
@@ -1743,7 +1773,7 @@ int freeze_super(struct super_block *sb)
 	 */
 	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 	lockdep_sb_freeze_release(sb);
-	up_write(&sb->s_umount);
+	super_unlock_write(sb);
 	return 0;
 }
 EXPORT_SYMBOL(freeze_super);
@@ -1753,7 +1783,7 @@ static int thaw_super_locked(struct super_block *sb)
 	int error;
 
 	if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
-		up_write(&sb->s_umount);
+		super_unlock_write(sb);
 		return -EINVAL;
 	}
 
@@ -1770,7 +1800,7 @@ static int thaw_super_locked(struct super_block *sb)
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
 			lockdep_sb_freeze_release(sb);
-			up_write(&sb->s_umount);
+			super_unlock_write(sb);
 			return error;
 		}
 	}
@@ -1790,7 +1820,7 @@ static int thaw_super_locked(struct super_block *sb)
  */
 int thaw_super(struct super_block *sb)
 {
-	down_write(&sb->s_umount);
+	super_lock_write(sb);
 	return thaw_super_locked(sb);
 }
 EXPORT_SYMBOL(thaw_super);

-- 
2.34.1



* [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 10:47 [PATCH 0/3] super: allow waiting without s_umount held Christian Brauner
  2023-08-17 10:47 ` [PATCH 1/3] super: use super_{lock,unlock}() Christian Brauner
@ 2023-08-17 10:47 ` Christian Brauner
  2023-08-17 12:50   ` Jan Kara
  2023-08-17 10:47 ` [PATCH 3/3] super: wait until we passed kill super Christian Brauner
  2 siblings, 1 reply; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 10:47 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig; +Cc: Al Viro, linux-fsdevel, Christian Brauner

Recent patches experiment with making it possible to allocate a new
superblock before opening the relevant block device. Naturally this has
intricate side-effects that we get to learn about while developing this.

Superblock allocators such as sget{_fc}() return with s_umount of the
new superblock held and lock ordering currently requires that block level
locks such as bdev_lock and open_mutex rank above s_umount.

Before aca740cecbe5 ("fs: open block device after superblock creation")
ordering was guaranteed to be correct as block devices were opened prior
to superblock allocation and thus s_umount wasn't held. But now s_umount
must be dropped before opening block devices to avoid locking
violations.

This has consequences. The main one being that iterators over
@super_blocks and @fs_supers that grab a temporary reference to the
superblock can now also grab s_umount before the caller has managed to
open block devices and called fill_super(). So whereas before such
iterators or concurrent mounts would have simply slept on s_umount until
SB_BORN was set or the superblock was discarded due to initialization
failure, they can now needlessly spin through sget{_fc}().

If the caller is sleeping on bdev_lock or open_mutex one caller waiting
on SB_BORN will always spin somewhere potentially this can go on for
quite a while.

It should be possible to drop s_umount while allowing iterators to wait
on a nascent superblock to either be born or discarded. This patch
implements a wait_var_event() mechanism allowing iterators to sleep
until they are woken when the superblock is born or discarded.

This should also allow us to avoid relooping through @fs_supers and
@super_blocks if a superblock isn't yet born or dying.
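
As a minimal sketch of that handshake (userspace model only: a condition
variable stands in for wait_var_event()/wake_up_var(), the mutex provides
the ordering that the kernel code gets from smp_load_acquire()/
smp_store_release(), the drop/reacquire of s_umount around the wait is
omitted, and the names ending in _model are invented for the sketch):

/*
 * Userspace model of the born/dying handshake. Flag values are
 * illustrative, not the kernel's.
 */
#include <pthread.h>
#include <stdbool.h>

#define SB_BORN		(1u << 0)
#define SB_DYING	(1u << 1)

static pthread_mutex_t flag_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flag_cond = PTHREAD_COND_INITIALIZER;
static unsigned int s_flags;	/* models sb->s_flags */

/* models super_wake(): set the sticky flag, then wake all waiters */
static void super_wake_model(unsigned int flag)
{
	pthread_mutex_lock(&flag_lock);
	s_flags |= flag;
	pthread_cond_broadcast(&flag_cond);
	pthread_mutex_unlock(&flag_lock);
}

/*
 * models super_wait(): true if born, false if dying. Never loops,
 * since SB_BORN and SB_DYING are never unset once set.
 */
static bool super_wait_model(void)
{
	bool dying;

	pthread_mutex_lock(&flag_lock);
	while (!(s_flags & (SB_BORN | SB_DYING)))
		pthread_cond_wait(&flag_cond, &flag_lock);
	dying = s_flags & SB_DYING;	/* SB_DYING takes precedence */
	pthread_mutex_unlock(&flag_lock);
	return !dying;
}

int main(void)
{
	super_wake_model(SB_BORN);	/* vfs_get_tree() succeeded */
	return super_wait_model() ? 0 : 1;
}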

Link: aca740cecbe5 ("fs: open block device after superblock creation")
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c         | 146 +++++++++++++++++++++++++++++++++++++++++++++--------
 include/linux/fs.h |   1 +
 2 files changed, 125 insertions(+), 22 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 878675921bdc..55bf495763d9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -86,6 +86,89 @@ static inline void super_unlock_read(struct super_block *sb)
 	super_unlock(sb, false);
 }
 
+static inline bool wait_born(struct super_block *sb)
+{
+	unsigned int flags;
+
+	/*
+	 * Pairs with smp_store_release() in super_wake() and ensure
+	 * that we see SB_BORN or SB_DYING after we're woken.
+	 */
+	flags = smp_load_acquire(&sb->s_flags);
+	return flags & (SB_BORN | SB_DYING);
+}
+
+/**
+ * super_wait - wait for superblock to become ready
+ * @sb: superblock to wait for
+ * @excl: whether exclusive access is required
+ *
+ * If the superblock has neither passed through vfs_get_tree() nor
+ * generic_shutdown_super() yet, wait for it to happen. Either superblock
+ * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
+ * woken and we'll see SB_DYING.
+ *
+ * The caller must have acquired a temporary reference on @sb->s_count.
+ *
+ * Return: true if SB_BORN was set, false if SB_DYING was set.
+ */
+static bool super_wait(struct super_block *sb, bool excl)
+{
+
+	lockdep_assert_not_held(&sb->s_umount);
+
+relock:
+	super_lock(sb, excl);
+
+	/*
+	 * Has gone through generic_shutdown_super() in the meantime.
+	 * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
+	 * grab a reference to this. Tell them so.
+	 */
+	if (sb->s_flags & SB_DYING)
+		return false;
+
+	/* Has called ->get_tree() successfully. */
+	if (sb->s_flags & SB_BORN)
+		return true;
+
+	super_unlock(sb, excl);
+
+	/* wait until the superblock is ready or dying */
+	wait_var_event(&sb->s_flags, wait_born(sb));
+
+	/*
+	 * Neither SB_BORN nor SB_DYING are ever unset so we never loop.
+	 * Just reacquire @sb->s_umount for the caller.
+	 */
+	goto relock;
+}
+
+/* wait and acquire read-side of @sb->s_umount */
+static inline bool super_wait_read(struct super_block *sb)
+{
+	return super_wait(sb, false);
+}
+
+/* wait and acquire write-side of @sb->s_umount */
+static inline bool super_wait_write(struct super_block *sb)
+{
+	return super_wait(sb, true);
+}
+
+/* wake waiters */
+static void super_wake(struct super_block *sb, unsigned int flag)
+{
+	unsigned int flags = sb->s_flags;
+
+	/*
+	 * Pairs with smp_load_acquire() in super_wait() and ensure that
+	 * we @flag is set before we wake anyone.
+	 */
+	smp_store_release(&sb->s_flags, flags | flag);
+	wake_up_var(&sb->s_flags);
+}
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -415,10 +498,12 @@ EXPORT_SYMBOL(deactivate_super);
  */
 static int grab_super(struct super_block *s) __releases(sb_lock)
 {
+	bool born;
+
 	s->s_count++;
 	spin_unlock(&sb_lock);
-	super_lock_write(s);
-	if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) {
+	born = super_wait_write(s);
+	if (born && atomic_inc_not_zero(&s->s_active)) {
 		put_super(s);
 		return 1;
 	}
@@ -557,6 +642,13 @@ void generic_shutdown_super(struct super_block *sb)
 	/* should be initialized for __put_super_and_need_restart() */
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
+	/*
+	 * Broadcast to everyone that grabbed a temporary reference to this
+	 * superblock before we removed it from @fs_supers that the superblock
+	 * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+	 * discard this superblock and treat it as dead.
+	 */
+	super_wake(sb, SB_DYING);
 	super_unlock_write(sb);
 	if (sb->s_bdi != &noop_backing_dev_info) {
 		if (sb->s_iflags & SB_I_PERSB_BDI)
@@ -631,6 +723,11 @@ struct super_block *sget_fc(struct fs_context *fc,
 	s->s_type = fc->fs_type;
 	s->s_iflags |= fc->s_iflags;
 	strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+	/*
+	 * Make the superblock visible on @super_blocks and @fs_supers.
+	 * It's in a nascent state and users should wait on SB_BORN or
+	 * SB_DYING to be set.
+	 */
 	list_add_tail(&s->s_list, &super_blocks);
 	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
 	spin_unlock(&sb_lock);
@@ -770,13 +867,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
 
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
+		bool born;
+
 		if (hlist_unhashed(&sb->s_instances))
 			continue;
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 
-		super_lock_read(sb);
-		if (sb->s_root && (sb->s_flags & SB_BORN))
+		born = super_wait_read(sb);
+		if (born && sb->s_root)
 			f(sb, arg);
 		super_unlock_read(sb);
 
@@ -806,11 +905,13 @@ void iterate_supers_type(struct file_system_type *type,
 
 	spin_lock(&sb_lock);
 	hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
+		bool born;
+
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 
-		super_lock_read(sb);
-		if (sb->s_root && (sb->s_flags & SB_BORN))
+		born = super_wait_read(sb);
+		if (born && sb->s_root)
 			f(sb, arg);
 		super_unlock_read(sb);
 
@@ -841,15 +942,14 @@ struct super_block *get_active_super(struct block_device *bdev)
 	if (!bdev)
 		return NULL;
 
-restart:
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (hlist_unhashed(&sb->s_instances))
 			continue;
 		if (sb->s_bdev == bdev) {
 			if (!grab_super(sb))
-				goto restart;
-			super_unlock_write(sb);
+				return NULL;
+                        super_unlock_write(sb);
 			return sb;
 		}
 	}
@@ -862,22 +962,23 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 	struct super_block *sb;
 
 	spin_lock(&sb_lock);
-rescan:
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (hlist_unhashed(&sb->s_instances))
 			continue;
 		if (sb->s_dev ==  dev) {
+			bool born;
+
 			sb->s_count++;
 			spin_unlock(&sb_lock);
-			super_lock(sb, excl);
 			/* still alive? */
-			if (sb->s_root && (sb->s_flags & SB_BORN))
+			born = super_wait(sb, excl);
+			if (born && sb->s_root)
 				return sb;
 			super_unlock(sb, excl);
 			/* nope, got unmounted */
 			spin_lock(&sb_lock);
 			__put_super(sb);
-			goto rescan;
+			break;
 		}
 	}
 	spin_unlock(&sb_lock);
@@ -984,9 +1085,9 @@ int reconfigure_super(struct fs_context *fc)
 
 static void do_emergency_remount_callback(struct super_block *sb)
 {
-	super_lock_write(sb);
-	if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
-	    !sb_rdonly(sb)) {
+	bool born = super_wait_write(sb);
+
+	if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
 		struct fs_context *fc;
 
 		fc = fs_context_for_reconfigure(sb->s_root,
@@ -1020,8 +1121,9 @@ void emergency_remount(void)
 
 static void do_thaw_all_callback(struct super_block *sb)
 {
-	super_lock_write(sb);
-	if (sb->s_root && sb->s_flags & SB_BORN) {
+	bool born = super_wait_write(sb);
+
+	if (born && sb->s_root) {
 		emergency_thaw_bdev(sb);
 		thaw_super_locked(sb);
 	} else {
@@ -1212,9 +1314,9 @@ EXPORT_SYMBOL(get_tree_keyed);
  */
 static bool lock_active_super(struct super_block *sb)
 {
-	super_lock_read(sb);
-	if (!sb->s_root ||
-	    (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) {
+	bool born = super_wait_read(sb);
+
+	if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
 		super_unlock_read(sb);
 		return false;
 	}
@@ -1572,7 +1674,7 @@ int vfs_get_tree(struct fs_context *fc)
 	 * flag.
 	 */
 	smp_wmb();
-	sb->s_flags |= SB_BORN;
+	super_wake(sb, SB_BORN);
 
 	error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
 	if (unlikely(error)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14b5777a24a0..173672645156 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1095,6 +1095,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_LAZYTIME     BIT(25)	/* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
+#define SB_DYING        BIT(24)
 #define SB_SUBMOUNT     BIT(26)
 #define SB_FORCE        BIT(27)
 #define SB_NOSEC        BIT(28)

-- 
2.34.1



* [PATCH 3/3] super: wait until we passed kill super
  2023-08-17 10:47 [PATCH 0/3] super: allow waiting without s_umount held Christian Brauner
  2023-08-17 10:47 ` [PATCH 1/3] super: use super_{lock,unlock}() Christian Brauner
  2023-08-17 10:47 ` [PATCH 2/3] super: wait for nascent superblocks Christian Brauner
@ 2023-08-17 10:47 ` Christian Brauner
  2023-08-17 14:37   ` Jan Kara
  2 siblings, 1 reply; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 10:47 UTC (permalink / raw)
  To: Jan Kara, Christoph Hellwig; +Cc: Al Viro, linux-fsdevel, Christian Brauner

Recent rework moved block device closing out of sb->put_super() and into
sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and
blkdev_put() can end up taking s_umount again.

That means we need to move the removal of the superblock from @fs_supers
out of generic_shutdown_super() and into deactivate_locked_super() to
ensure that concurrent mounters don't fail to open block devices that
are still in use because blkdev_put() in sb->kill_sb() hasn't been
called yet.

We can now do this as we can make iterators through @fs_supers and
@super_blocks wait without holding s_umount. Concurrent mounts will wait
until a dying superblock is fully dead, i.e. until sb->kill_sb() has
been called and SB_DEAD has been set. Concurrent iterators can already
discard any SB_DYING superblock.
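
A minimal userspace sketch of the resulting three-stage handshake (same
invented condition-variable model as the note under patch 2; flag values
are illustrative, not the kernel's):

#include <pthread.h>
#include <stdbool.h>

#define SB_BORN		(1u << 0)
#define SB_DYING	(1u << 1)
#define SB_DEAD		(1u << 2)

static pthread_mutex_t flag_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t flag_cond = PTHREAD_COND_INITIALIZER;
static unsigned int s_flags;	/* models sb->s_flags */

/* models super_wake(): set the sticky flag, then wake all waiters */
static void super_wake_model(unsigned int flag)
{
	pthread_mutex_lock(&flag_lock);
	s_flags |= flag;
	pthread_cond_broadcast(&flag_cond);
	pthread_mutex_unlock(&flag_lock);
}

/* models grab_super_wait_dead(): false means "retry the sget search" */
static bool grab_super_wait_dead_model(void)
{
	bool grabbed = true;

	pthread_mutex_lock(&flag_lock);
	while (!(s_flags & (SB_BORN | SB_DYING)))
		pthread_cond_wait(&flag_cond, &flag_lock);
	if (s_flags & SB_DYING) {
		/* dying: out-wait kill_sb() so its bdevs are closed */
		while (!(s_flags & SB_DEAD))
			pthread_cond_wait(&flag_cond, &flag_lock);
		grabbed = false;
	}
	pthread_mutex_unlock(&flag_lock);
	return grabbed;
}

int main(void)
{
	super_wake_model(SB_DYING);	/* generic_shutdown_super() */
	super_wake_model(SB_DEAD);	/* deactivate_locked_super() */
	/* expect false: the mounter retries sget{_fc}() from scratch */
	return grab_super_wait_dead_model() ? 1 : 0;
}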

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/super.c         | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 include/linux/fs.h |  1 +
 2 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 55bf495763d9..710448078c07 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -98,6 +98,18 @@ static inline bool wait_born(struct super_block *sb)
 	return flags & (SB_BORN | SB_DYING);
 }
 
+static inline bool wait_dead(struct super_block *sb)
+{
+	unsigned int flags;
+
+	/*
+	 * Pairs with smp_store_release() in super_wake() and ensure
+	 * that we see SB_DEAD after we're woken.
+	 */
+	flags = smp_load_acquire(&sb->s_flags);
+	return flags & SB_DEAD;
+}
+
 /**
  * super_wait - wait for superblock to become ready
  * @sb: superblock to wait for
@@ -144,6 +156,19 @@ static bool super_wait(struct super_block *sb, bool excl)
 	goto relock;
 }
 
+static bool super_wait_dead(struct super_block *sb)
+{
+	if (super_wait(sb, true))
+		return true;
+
+	lockdep_assert_held(&sb->s_umount);
+	super_unlock_write(sb);
+	/* If superblock is dying, wait for everything to be shutdown. */
+	wait_var_event(&sb->s_flags, wait_dead(sb));
+	super_lock_write(sb);
+	return false;
+}
+
 /* wait and acquire read-side of @sb->s_umount */
 static inline bool super_wait_read(struct super_block *sb)
 {
@@ -169,6 +194,22 @@ static void super_wake(struct super_block *sb, unsigned int flag)
 	wake_up_var(&sb->s_flags);
 }
 
+static int grab_super_wait_dead(struct super_block *s) __releases(sb_lock)
+{
+	bool born;
+
+	s->s_count++;
+	spin_unlock(&sb_lock);
+	born = super_wait_dead(s);
+	if (born && atomic_inc_not_zero(&s->s_active)) {
+		put_super(s);
+		return 1;
+	}
+	up_write(&s->s_umount);
+	put_super(s);
+	return 0;
+}
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -456,6 +497,25 @@ void deactivate_locked_super(struct super_block *s)
 		list_lru_destroy(&s->s_dentry_lru);
 		list_lru_destroy(&s->s_inode_lru);
 
+		/*
+		 * Remove it from @fs_supers so it isn't found by new
+		 * sget{_fc}() walkers anymore. Any concurrent mounter still
+		 * managing to grab a temporary reference is guaranteed to
+		 * already see SB_DYING and will wait until we notify them about
+		 * SB_DEAD.
+		 */
+		spin_lock(&sb_lock);
+		hlist_del_init(&s->s_instances);
+		spin_unlock(&sb_lock);
+
+		/*
+		 * Let concurrent mounts know that this thing is really dead.
+		 * We don't need @sb->s_umount here as every concurrent caller
+		 * will see SB_DYING and either discard the superblock or wait
+		 * for SB_DEAD.
+		 */
+		super_wake(s, SB_DEAD);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -638,15 +698,14 @@ void generic_shutdown_super(struct super_block *sb)
 			spin_unlock(&sb->s_inode_list_lock);
 		}
 	}
-	spin_lock(&sb_lock);
-	/* should be initialized for __put_super_and_need_restart() */
-	hlist_del_init(&sb->s_instances);
-	spin_unlock(&sb_lock);
 	/*
 	 * Broadcast to everyone that grabbed a temporary reference to this
 	 * superblock before we removed it from @fs_supers that the superblock
 	 * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
 	 * discard this superblock and treat it as dead.
+	 *
+	 * We leave the superblock on @fs_supers so it can be found by
+	 * sget{_fc}() until we passed sb->kill_sb().
 	 */
 	super_wake(sb, SB_DYING);
 	super_unlock_write(sb);
@@ -741,7 +800,7 @@ struct super_block *sget_fc(struct fs_context *fc,
 		destroy_unused_super(s);
 		return ERR_PTR(-EBUSY);
 	}
-	if (!grab_super(old))
+	if (!grab_super_wait_dead(old))
 		goto retry;
 	destroy_unused_super(s);
 	return old;
@@ -785,7 +844,7 @@ struct super_block *sget(struct file_system_type *type,
 				destroy_unused_super(s);
 				return ERR_PTR(-EBUSY);
 			}
-			if (!grab_super(old))
+			if (!grab_super_wait_dead(old))
 				goto retry;
 			destroy_unused_super(s);
 			return old;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 173672645156..34ac792c4b19 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1095,6 +1095,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_LAZYTIME     BIT(25)	/* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
+#define SB_DEAD	        BIT(21)
 #define SB_DYING        BIT(24)
 #define SB_SUBMOUNT     BIT(26)
 #define SB_FORCE        BIT(27)

-- 
2.34.1



* Re: [PATCH 1/3] super: use super_{lock,unlock}()
  2023-08-17 10:47 ` [PATCH 1/3] super: use super_{lock,unlock}() Christian Brauner
@ 2023-08-17 11:39   ` Jan Kara
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Kara @ 2023-08-17 11:39 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu 17-08-23 12:47:42, Christian Brauner wrote:
> Replace the open-coded {down,up}_{read,write}() calls with simple
> wrappers. Follow-up patches will benefit from this as well.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>

Looks good. Except we already have a wrapper trylock_super() which is now
inconsistently named. So that should be renamed as well.

								Honza

-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 10:47 ` [PATCH 2/3] super: wait for nascent superblocks Christian Brauner
@ 2023-08-17 12:50   ` Jan Kara
  2023-08-17 13:24     ` Christian Brauner
  2023-08-17 20:53     ` Matthew Wilcox
  0 siblings, 2 replies; 15+ messages in thread
From: Jan Kara @ 2023-08-17 12:50 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu 17-08-23 12:47:43, Christian Brauner wrote:
> Recent patches experiment with making it possible to allocate a new
> superblock before opening the relevant block device. Naturally this has
> intricate side-effects that we get to learn about while developing this.
> 
> Superblock allocators such as sget{_fc}() return with s_umount of the
> new superblock held and lock ordering currently requires that block level
> locks such as bdev_lock and open_mutex rank above s_umount.
> 
> Before aca740cecbe5 ("fs: open block device after superblock creation")
> ordering was guaranteed to be correct as block devices were opened prior
> to superblock allocation and thus s_umount wasn't held. But now s_umount
> must be dropped before opening block devices to avoid locking
> violations.
> 
> This has consequences. The main one being that iterators over
> @super_blocks and @fs_supers that grab a temporary reference to the
> superblock can now also grab s_umount before the caller has managed to
> open block devices and called fill_super(). So whereas before such
> iterators or concurrent mounts would have simply slept on s_umount until
> SB_BORN was set or the superblock was discarded due to initialization
> failure, they can now needlessly spin through sget{_fc}().
> 
> If the caller is sleeping on bdev_lock or open_mutex one caller waiting
> on SB_BORN will always spin somewhere potentially this can go on for
					^^ and potentially?
> quite a while.
> 
> It should be possible to drop s_umount while allowing iterators to wait
> on a nascent superblock to either be born or discarded. This patch
> implements a wait_var_event() mechanism allowing iterators to sleep
> until they are woken when the superblock is born or discarded.
> 
> This should also allow us to avoid relooping through @fs_supers and
       ^^^ superfluous "should"

> @super_blocks if a superblock isn't yet born or dying.
> 
> Link: aca740cecbe5 ("fs: open block device after superblock creation")
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/super.c         | 146 +++++++++++++++++++++++++++++++++++++++++++++--------
>  include/linux/fs.h |   1 +
>  2 files changed, 125 insertions(+), 22 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index 878675921bdc..55bf495763d9 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -86,6 +86,89 @@ static inline void super_unlock_read(struct super_block *sb)
>  	super_unlock(sb, false);
>  }
>  
> +static inline bool wait_born(struct super_block *sb)
> +{
> +	unsigned int flags;
> +
> +	/*
> +	 * Pairs with smp_store_release() in super_wake() and ensure
> +	 * that we see SB_BORN or SB_DYING after we're woken.
> +	 */
> +	flags = smp_load_acquire(&sb->s_flags);
> +	return flags & (SB_BORN | SB_DYING);
> +}
> +
> +/**
> + * super_wait - wait for superblock to become ready
> + * @sb: superblock to wait for
> + * @excl: whether exclusive access is required
> + *
> + * If the superblock has neither passed through vfs_get_tree() nor
> + * generic_shutdown_super() yet, wait for it to happen. Either superblock
> + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
> + * woken and we'll see SB_DYING.
> + *
> + * The caller must have acquired a temporary reference on @sb->s_count.
> + *
> + * Return: true if SB_BORN was set, false if SB_DYING was set.

The comment should mention that this acquires s_umount and returns with it
held. Also the name is a bit too generic for my taste and not expressing
the fact this is in fact a lock operation. Maybe something like
super_lock_wait_born()?

> + */
> +static bool super_wait(struct super_block *sb, bool excl)
> +{
> +
> +	lockdep_assert_not_held(&sb->s_umount);
> +
> +relock:
> +	super_lock(sb, excl);
> +
> +	/*
> +	 * Has gone through generic_shutdown_super() in the meantime.
> +	 * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
> +	 * grab a reference to this. Tell them so.
> +	 */
> +	if (sb->s_flags & SB_DYING)
> +		return false;
> +
> +	/* Has called ->get_tree() successfully. */
> +	if (sb->s_flags & SB_BORN)
> +		return true;
> +
> +	super_unlock(sb, excl);
> +
> +	/* wait until the superblock is ready or dying */
> +	wait_var_event(&sb->s_flags, wait_born(sb));
> +
> +	/*
> +	 * Neither SB_BORN nor SB_DYING are ever unset so we never loop.
> +	 * Just reacquire @sb->s_umount for the caller.
> +	 */
> +	goto relock;
> +}
> +
> +/* wait and acquire read-side of @sb->s_umount */
> +static inline bool super_wait_read(struct super_block *sb)
> +{
> +	return super_wait(sb, false);
> +}

And I'd call this super_lock_read_wait_born().

> +
> +/* wait and acquire write-side of @sb->s_umount */
> +static inline bool super_wait_write(struct super_block *sb)
> +{
> +	return super_wait(sb, true);
> +}
> +
> +/* wake waiters */
> +static void super_wake(struct super_block *sb, unsigned int flag)
> +{
> +	unsigned int flags = sb->s_flags;

	I kind of miss the point of this local variable but whatever...

> +
> +	/*
> +	 * Pairs with smp_load_acquire() in super_wait() and ensure that
							     ^^^ ensures
> +	 * we @flag is set before we wake anyone.
	   ^^ the

> +	 */
> +	smp_store_release(&sb->s_flags, flags | flag);
> +	wake_up_var(&sb->s_flags);
> +}
> +
>  /*
>   * One thing we have to be careful of with a per-sb shrinker is that we don't
>   * drop the last active reference to the superblock from within the shrinker.
> @@ -415,10 +498,12 @@ EXPORT_SYMBOL(deactivate_super);
>   */
>  static int grab_super(struct super_block *s) __releases(sb_lock)
>  {
> +	bool born;
> +
>  	s->s_count++;
>  	spin_unlock(&sb_lock);
> -	super_lock_write(s);
> -	if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) {
> +	born = super_wait_write(s);
> +	if (born && atomic_inc_not_zero(&s->s_active)) {
>  		put_super(s);
>  		return 1;
>  	}
> @@ -557,6 +642,13 @@ void generic_shutdown_super(struct super_block *sb)
>  	/* should be initialized for __put_super_and_need_restart() */
>  	hlist_del_init(&sb->s_instances);
>  	spin_unlock(&sb_lock);
> +	/*
> +	 * Broadcast to everyone that grabbed a temporary reference to this
> +	 * superblock before we removed it from @fs_supers that the superblock
> +	 * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
> +	 * discard this superblock and treat it as dead.
> +	 */
> +	super_wake(sb, SB_DYING);
>  	super_unlock_write(sb);
>  	if (sb->s_bdi != &noop_backing_dev_info) {
>  		if (sb->s_iflags & SB_I_PERSB_BDI)
> @@ -631,6 +723,11 @@ struct super_block *sget_fc(struct fs_context *fc,
>  	s->s_type = fc->fs_type;
>  	s->s_iflags |= fc->s_iflags;
>  	strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
> +	/*
> +	 * Make the superblock visible on @super_blocks and @fs_supers.
> +	 * It's in a nascent state and users should wait on SB_BORN or
> +	 * SB_DYING to be set.
> +	 */

But now sget_fc() (and sget()) will be looping on superblocks with SB_DYING
set? Ah, that's solved by the next patch. OK.

>  	list_add_tail(&s->s_list, &super_blocks);
>  	hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
>  	spin_unlock(&sb_lock);

<snip>

> @@ -841,15 +942,14 @@ struct super_block *get_active_super(struct block_device *bdev)
>  	if (!bdev)
>  		return NULL;
>  
> -restart:
>  	spin_lock(&sb_lock);
>  	list_for_each_entry(sb, &super_blocks, s_list) {
>  		if (hlist_unhashed(&sb->s_instances))
>  			continue;
>  		if (sb->s_bdev == bdev) {
>  			if (!grab_super(sb))
> -				goto restart;
> -			super_unlock_write(sb);
> +				return NULL;
  Let me check whether I understand the rationale of this change: We found
a matching sb and it's SB_DYING. Instead of waiting for it to die and retry
the search (to likely not find anything) we just return NULL right away to
save us some trouble.

> +                        super_unlock_write(sb);
   ^^^ whitespace damage here

>  			return sb;
>  		}
>  	}

<snip>

> @@ -1212,9 +1314,9 @@ EXPORT_SYMBOL(get_tree_keyed);
>   */
>  static bool lock_active_super(struct super_block *sb)

Another inconsistently named locking function after you've introduced your
locking helpers...

>  {
> -	super_lock_read(sb);
> -	if (!sb->s_root ||
> -	    (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) {
> +	bool born = super_wait_read(sb);
> +
> +	if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
>  		super_unlock_read(sb);
>  		return false;
>  	}
> @@ -1572,7 +1674,7 @@ int vfs_get_tree(struct fs_context *fc)
>  	 * flag.
>  	 */
>  	smp_wmb();

Is the barrier still needed here when super_wake() has smp_store_release()?

> -	sb->s_flags |= SB_BORN;
> +	super_wake(sb, SB_BORN);

I'm also kind of wondering whether, when we have SB_BORN and SB_DYING, the
SB_ACTIVE flag isn't redundant. SB_BORN is set practically at the same moment
as SB_ACTIVE. SB_ACTIVE gets cleared somewhat earlier than SB_DYING is set
but I believe SB_DYING can be set earlier (after all, by the time SB_ACTIVE
is cleared we have sb->s_root == NULL, which basically stops most of the
places looking at superblocks). As I'm grepping, we've grown a lot of
SB_ACTIVE handling all over the place so this would be a bit non-trivial
but I believe it will make it easier for filesystem developers to decide
which flag they should be using... Also we could then drop sb->s_root
checks from many places because the locking helpers will return false if
SB_DYING is set.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 12:50   ` Jan Kara
@ 2023-08-17 13:24     ` Christian Brauner
  2023-08-17 13:41       ` Christian Brauner
  2023-08-17 14:16       ` Jan Kara
  2023-08-17 20:53     ` Matthew Wilcox
  1 sibling, 2 replies; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 13:24 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu, Aug 17, 2023 at 02:50:21PM +0200, Jan Kara wrote:
> On Thu 17-08-23 12:47:43, Christian Brauner wrote:

[...]

> > @@ -841,15 +942,14 @@ struct super_block *get_active_super(struct block_device *bdev)
> >  	if (!bdev)
> >  		return NULL;
> >  
> > -restart:
> >  	spin_lock(&sb_lock);
> >  	list_for_each_entry(sb, &super_blocks, s_list) {
> >  		if (hlist_unhashed(&sb->s_instances))
> >  			continue;
> >  		if (sb->s_bdev == bdev) {
> >  			if (!grab_super(sb))
> > -				goto restart;
> > -			super_unlock_write(sb);
> > +				return NULL;
>   Let me check whether I understand the rationale of this change: We found
> a matching sb and it's SB_DYING. Instead of waiting for it to die and retry
> the search (to likely not find anything) we just return NULL right away to
> save us some trouble.

Thanks for that question! I was missing something. Before these changes,
when a superblock was unmounted and it hit deactivate_super() it could do:

P1                                                      P2
deactivate_locked_super()                               grab_super()
-> if (!atomic_add_unless(&s->s_active, -1, 1))
                                                        -> super_lock_write()
                                                           SB_BORN && !atomic_inc_not_zero(&s->s_active)
                                                           // fails, loop until it goes away
   -> super_lock_write()
      // remove sb from fs_supers

That can still happen in the new scheme so my patch needs a fix to wait
for SB_DYING to be broadcast when no active reference can be acquired
anymore because then we know that we're about to shut this down. Either
that or spinning but I think we should just wait as we can now do that
with my proposal.

> 
> > +                        super_unlock_write(sb);
>    ^^^ whitespace damage here
> 
> >  			return sb;
> >  		}
> >  	}
> 
> <snip>
> 
> > @@ -1212,9 +1314,9 @@ EXPORT_SYMBOL(get_tree_keyed);
> >   */
> >  static bool lock_active_super(struct super_block *sb)
> 
> Another inconsistently named locking function after you've introduced your
> locking helpers...

I wanted a first opinion before digging into this. :)

> 
> >  {
> > -	super_lock_read(sb);
> > -	if (!sb->s_root ||
> > -	    (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) {
> > +	bool born = super_wait_read(sb);
> > +
> > +	if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
> >  		super_unlock_read(sb);
> >  		return false;
> >  	}
> > @@ -1572,7 +1674,7 @@ int vfs_get_tree(struct fs_context *fc)
> >  	 * flag.
> >  	 */
> >  	smp_wmb();
> 
> Is the barrier still needed here when super_wake() has smp_store_release()?

I wasn't sure. The barrier tries to ensure that everything before
SB_BORN is seen by super_cache_count(). Whereas the smp_store_release()
really is about the flag. Maybe the smp_wmb() would be sufficient for
that, but since I wasn't sure, the additional smp_store_release() is way
more obvious imho.

> 
> > -	sb->s_flags |= SB_BORN;
> > +	super_wake(sb, SB_BORN);
> 
> I'm also kind of wondering whether when we have SB_BORN and SB_DYING isn't
> the SB_ACTIVE flag redundant. SB_BORN is set practically at the same moment
> as SB_ACTIVE. SB_ACTIVE gets cleared somewhat earlier than SB_DYING is set
> but I believe SB_DYING can be set earlier (after all by the time SB_ACTIVE
> is cleared we have sb->s_root == NULL which basically stops most of the
> places looking at superblocks). As I'm grepping we've grown a lot of
> SB_ACTIVE handling all over the place so this would be a bit non-trivial
> but I believe it will make it easier for filesystem developers to decide
> which flag they should be using... Also we could then drop sb->s_root
> checks from many places because the locking helpers will return false if
> SB_DYING is set.

Certainly something to explore but no promises. Would you be open to
doing this as a follow-up patch? If you have a clearer idea here then I
wouldn't mind you piling this on top of this series even.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 13:24     ` Christian Brauner
@ 2023-08-17 13:41       ` Christian Brauner
  2023-08-17 14:16       ` Jan Kara
  1 sibling, 0 replies; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 13:41 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

> Thanks for that question! I was missing something. Before these changes,
> when a superblock was unmounted and it hit deactivate_super() it could do:
> 
> P1                                                      P2
> deactivate_locked_super()                               grab_super()

s/deactivate_locked_super()/deactivate_super()

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 13:24     ` Christian Brauner
  2023-08-17 13:41       ` Christian Brauner
@ 2023-08-17 14:16       ` Jan Kara
  2023-08-17 14:30         ` Christian Brauner
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Kara @ 2023-08-17 14:16 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu 17-08-23 15:24:54, Christian Brauner wrote:
> On Thu, Aug 17, 2023 at 02:50:21PM +0200, Jan Kara wrote:
> > On Thu 17-08-23 12:47:43, Christian Brauner wrote:
> > > Recent patches experiment with making it possible to allocate a new
> > > superblock before opening the relevant block device. Naturally this has
> > > intricate side-effects that we get to learn about while developing this.
> > > 
> > > Superblock allocators such as sget{_fc}() return with s_umount of the
> > > new superblock held and ock ordering currently requires that block level
> > > locks such as bdev_lock and open_mutex rank above s_umount.
> > > 
> > > Before aca740cecbe5 ("fs: open block device after superblock creation")
> > > ordering was guaranteed to be correct as block devices were opened prior
> > > to superblock allocation and thus s_umount wasn't held. But now s_umount
> > > must be dropped before opening block devices to avoid locking
> > > violations.
> > > 
> > > This has consequences. The main one is that iterators over
> > > @super_blocks and @fs_supers that grab a temporary reference to the
> > > superblock can now also grab s_umount before the caller has managed to
> > > open block devices and call fill_super(). So whereas before such
> > > iterators or concurrent mounts would have simply slept on s_umount until
> > > SB_BORN was set or the superblock was discarded due to initialization
> > > failure, they can now needlessly spin through sget{_fc}().
> > > 
> > > If the caller is sleeping on bdev_lock or open_mutex one caller waiting
> > > on SB_BORN will always spin somewhere potentially this can go on for
> > 					^^ and potentially?
> > > quite a while.
> > > 
> > > It should be possible to drop s_umount while allowing iterators to wait
> > > on a nascent superblock to either be born or discarded. This patch
> > > implements a wait_var_event() mechanism allowing iterators to sleep
> > > until they are woken when the superblock is born or discarded.
> > > 
> > > This should also allows us to avoid relooping through @fs_supers and
> >        ^^^ superfluous "should"
> > 
> > > @super_blocks if a superblock isn't yet born or dying.
> > > 
> > > Link: aca740cecbe5 ("fs: open block device after superblock creation")
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>

<snip>

> > > @@ -841,15 +942,14 @@ struct super_block *get_active_super(struct block_device *bdev)
> > >  	if (!bdev)
> > >  		return NULL;
> > >  
> > > -restart:
> > >  	spin_lock(&sb_lock);
> > >  	list_for_each_entry(sb, &super_blocks, s_list) {
> > >  		if (hlist_unhashed(&sb->s_instances))
> > >  			continue;
> > >  		if (sb->s_bdev == bdev) {
> > >  			if (!grab_super(sb))
> > > -				goto restart;
> > > -			super_unlock_write(sb);
> > > +				return NULL;
> >   Let me check whether I understand the rationale of this change: We found
> > a matching sb and it's SB_DYING. Instead of waiting for it to die and retry
> > the search (to likely not find anything) we just return NULL right away to
> > save us some trouble.
> 
> Thanks for that question! I was missing something. Before these changes,
> when a superblock was unmounted and it hit deactivate_super() it could do:
> 
> P1                                                      P2
> deactivate_locked_super()                               grab_super()
> -> if (!atomic_add_unless(&s->s_active, -1, 1))
>                                                         -> super_lock_write()
>                                                            SB_BORN && !atomic_inc_not_zero(&s->s_active)
>                                                            // fails, loop until it goes away
>    -> super_lock_write()
>       // remove sb from fs_supers

I don't think this can happen as you describe it. deactivate_super() +
deactivate_locked_super() are written so that the last s_active
reference is dropped while sb->s_umount is held for writing. And
grab_super() tries the increment under sb->s_umount as well. So either
grab_super() wins the race and deactivate_locked_super() just drops one
refcount and exits, or deactivate_locked_super() wins the race and
grab_super() can come only after the sb is shutdown. Then the increment
will fail and we'll loop as you describe. Perhaps that's what you meant,
you've just ordered things wrongly...
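
Spelled out as a timeline (my sketch of the ordering described above,
using the helper names from this series and the current s_active
accounting):

P1 deactivate_super()                     P2 grab_super()
-> atomic_add_unless(&s->s_active,        -> s->s_count++
   -1, 1) fails (count == 1)              -> spin_unlock(&sb_lock)
-> super_lock_write()                     -> super_lock_write() // blocks on P1
-> deactivate_locked_super() drops
   the last s_active reference and
   shuts the superblock down
-> super_unlock_write()
                                          -> acquires s_umount, but
                                             atomic_inc_not_zero(&s->s_active)
                                             now fails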

> That can still happen in the new scheme so my patch needs a fix to wait
> for SB_DYING to be broadcast when no active reference can be acquired
> anymore because then we know that we're about to shut this down. Either
> that or spinning but I think we should just wait as we can now do that
> with my proposal.

... But in that case by the time grab_super() is able to get sb->s_umount
semaphore, SB_DYING is already set.

> > >  {
> > > -	super_lock_read(sb);
> > > -	if (!sb->s_root ||
> > > -	    (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) {
> > > +	bool born = super_wait_read(sb);
> > > +
> > > +	if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
> > >  		super_unlock_read(sb);
> > >  		return false;
> > >  	}
> > > @@ -1572,7 +1674,7 @@ int vfs_get_tree(struct fs_context *fc)
> > >  	 * flag.
> > >  	 */
> > >  	smp_wmb();
> > 
> > Is the barrier still needed here when super_wake() has smp_store_release()?
> 
> I wasn't sure. The barrier tries to ensure that everything before
> SB_BORN is seen by super_cache_count(). Whereas the smp_store_release()
> really is about the flag. Maybe the smp_wmb() would be sufficient for
> that, but since I wasn't sure, the additional smp_store_release() is way
> more obvious imho.

I was looking into memory-barriers.txt and it has:

(6) RELEASE operations.

     This also acts as a one-way permeable barrier.  It guarantees that all
     memory operations before the RELEASE operation will appear to happen
     before the RELEASE operation with respect to the other components of the
     system.

Which sounds like smp_store_release() of SB_BORN should be enough to make
super_cache_count() see all the stores before it if it sees SB_BORN set...
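
Reduced to a minimal sketch (illustrative only, not the literal
fs/super.c hunks), the pairing in question is:

	/* publisher, vfs_get_tree(): every store initializing the
	 * superblock happens before the release store of SB_BORN */
	smp_store_release(&sb->s_flags, sb->s_flags | SB_BORN);
	wake_up_var(&sb->s_flags);

	/* consumer, wait_born(): if the acquire load observes SB_BORN,
	 * all of the initialization stores above are visible as well */
	if (smp_load_acquire(&sb->s_flags) & SB_BORN)
		/* safe to look at state published before SB_BORN */;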

> > > -	sb->s_flags |= SB_BORN;
> > > +	super_wake(sb, SB_BORN);
> > 
> > I'm also kind of wondering whether when we have SB_BORN and SB_DYING isn't
> > the SB_ACTIVE flag redundant. SB_BORN is set practically at the same moment
> > as SB_ACTIVE. SB_ACTIVE gets cleared somewhat earlier than SB_DYING is set
> > but I believe SB_DYING can be set earlier (after all by the time SB_ACTIVE
> > is cleared we have sb->s_root == NULL which basically stops most of the
> > places looking at superblocks). As I'm grepping we've grown a lot of
> > SB_ACTIVE handling all over the place so this would be a bit non-trivial
> > but I believe it will make it easier for filesystem developers to decide
> > which flag they should be using... Also we could then drop sb->s_root
> > checks from many places because the locking helpers will return false if
> > SB_DYING is set.
> 
> Certainly something to explore but no promises. Would you be open to
> doing this as a follow-up patch? If you have a clearer idea here then I
> wouldn't mind you piling this on top of this series even.

Sure, the cleanup of SB_ACTIVE probably deserves a separate patchset
because it's going to involve a lot of individual filesystem changes.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 14:16       ` Jan Kara
@ 2023-08-17 14:30         ` Christian Brauner
  0 siblings, 0 replies; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 14:30 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu, Aug 17, 2023 at 04:16:49PM +0200, Jan Kara wrote:
> On Thu 17-08-23 15:24:54, Christian Brauner wrote:
> > On Thu, Aug 17, 2023 at 02:50:21PM +0200, Jan Kara wrote:
> > > On Thu 17-08-23 12:47:43, Christian Brauner wrote:
> > > > Recent patches experiment with making it possible to allocate a new
> > > > superblock before opening the relevant block device. Naturally this has
> > > > intricate side-effects that we get to learn about while developing this.
> > > > 
> > > > Superblock allocators such as sget{_fc}() return with s_umount of the
> > > > new superblock held and lock ordering currently requires that block level
> > > > locks such as bdev_lock and open_mutex rank above s_umount.
> > > > 
> > > > Before aca740cecbe5 ("fs: open block device after superblock creation")
> > > > ordering was guaranteed to be correct as block devices were opened prior
> > > > to superblock allocation and thus s_umount wasn't held. But now s_umount
> > > > must be dropped before opening block devices to avoid locking
> > > > violations.
> > > > 
> > > > This has consequences. The main one is that iterators over
> > > > @super_blocks and @fs_supers that grab a temporary reference to the
> > > > superblock can now also grab s_umount before the caller has managed to
> > > > open block devices and call fill_super(). So whereas before such
> > > > iterators or concurrent mounts would have simply slept on s_umount until
> > > > SB_BORN was set or the superblock was discarded due to initialization
> > > > failure, they can now needlessly spin through sget{_fc}().
> > > > 
> > > > If the caller is sleeping on bdev_lock or open_mutex one caller waiting
> > > > on SB_BORN will always spin somewhere potentially this can go on for
> > > 					^^ and potentially?
> > > > quite a while.
> > > > 
> > > > It should be possible to drop s_umount while allowing iterators to wait
> > > > on a nascent superblock to either be born or discarded. This patch
> > > > implements a wait_var_event() mechanism allowing iterators to sleep
> > > > until they are woken when the superblock is born or discarded.
> > > > 
> > > > This should also allows us to avoid relooping through @fs_supers and
> > >        ^^^ superfluous "should"
> > > 
> > > > @super_blocks if a superblock isn't yet born or dying.
> > > > 
> > > > Link: aca740cecbe5 ("fs: open block device after superblock creation")
> > > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> 
> <snip>
> 
> > > > @@ -841,15 +942,14 @@ struct super_block *get_active_super(struct block_device *bdev)
> > > >  	if (!bdev)
> > > >  		return NULL;
> > > >  
> > > > -restart:
> > > >  	spin_lock(&sb_lock);
> > > >  	list_for_each_entry(sb, &super_blocks, s_list) {
> > > >  		if (hlist_unhashed(&sb->s_instances))
> > > >  			continue;
> > > >  		if (sb->s_bdev == bdev) {
> > > >  			if (!grab_super(sb))
> > > > -				goto restart;
> > > > -			super_unlock_write(sb);
> > > > +				return NULL;
> > >   Let me check whether I understand the rationale of this change: We found
> > > a matching sb and it's SB_DYING. Instead of waiting for it to die and retry
> > > the search (to likely not find anything) we just return NULL right away to
> > > save us some trouble.
> > 
> > Thanks for that question! I was missing something. Before these changes,
> > when a superblock was unmounted and it hit deactivate_super() it could do:
> > 
> > P1                                                      P2
> > deactivate_locked_super()                               grab_super()
> > -> if (!atomic_add_unless(&s->s_active, -1, 1))
> >                                                         -> super_lock_write()
> >                                                            SB_BORN && !atomic_inc_not_zero(&s->s_active)
> >                                                            // fails, loop until it goes away
> >    -> super_lock_write()
> >       // remove sb from fs_supers
> 
> I don't think this can happen as you describe it. deactivate_super() +
> deactivate_locked_super() are written so that the last s_active
> reference is dropped while sb->s_umount is held for writing. And
> grab_super() tries the increment under sb->s_umount as well. So either
> grab_super() wins the race and deactivate_locked_super() just drops one
> refcount and exits, or deactivate_locked_super() wins the race and
> grab_super() can come only after the sb is shutdown. Then the increment
> will fail and we'll loop as you describe. Perhaps that's what you meant,
> you've just ordered things wrongly...

Yes, sorry that's what I meant.

> 
> > That can still happen in the new scheme so my patch needs a fix to wait
> > for SB_DYING to be broadcast when no active reference can be acquired
> > anymore because then we know that we're about to shut this down. Either
> > that or spinning but I think we should just wait as we can now do that
> > with my proposal.
> 
> ... But in that case by the time grab_super() is able to get sb->s_umount
> semaphore, SB_DYING is already set.

Yeah, that's what I was about to reply right now after having thought
about it. :) Thanks!

> 
> > > >  {
> > > > -	super_lock_read(sb);
> > > > -	if (!sb->s_root ||
> > > > -	    (sb->s_flags & (SB_ACTIVE | SB_BORN)) != (SB_ACTIVE | SB_BORN)) {
> > > > +	bool born = super_wait_read(sb);
> > > > +
> > > > +	if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
> > > >  		super_unlock_read(sb);
> > > >  		return false;
> > > >  	}
> > > > @@ -1572,7 +1674,7 @@ int vfs_get_tree(struct fs_context *fc)
> > > >  	 * flag.
> > > >  	 */
> > > >  	smp_wmb();
> > > 
> > > Is the barrier still needed here when super_wake() has smp_store_release()?
> > 
> > I wasn't sure. The barrier tries to ensure that everything before
> > SB_BORN is seen by super_cache_count(). Whereas the smp_store_release()
> > really is about the flag. Maybe the smp_wmb() would be sufficient for
> > that, but since I wasn't sure, the additional smp_store_release() is way
> > more obvious imho.
> 
> I was looking into memory-barriers.txt and it has:
> 
> (6) RELEASE operations.
> 
>      This also acts as a one-way permeable barrier.  It guarantees that all
>      memory operations before the RELEASE operation will appear to happen
>      before the RELEASE operation with respect to the other components of the
>      system.
> 
> Which sounds like smp_store_release() of SB_BORN should be enough to make
> super_cache_count() see all the stores before it if it sees SB_BORN set...

Ok, thanks for checking!

> 
> > > > -	sb->s_flags |= SB_BORN;
> > > > +	super_wake(sb, SB_BORN);
> > > 
> > > I'm also kind of wondering whether when we have SB_BORN and SB_DYING isn't
> > > the SB_ACTIVE flag redundant. SB_BORN is set practically at the same moment
> > > as SB_ACTIVE. SB_ACTIVE gets cleared somewhat earlier than SB_DYING is set
> > > but I believe SB_DYING can be set earlier (after all by the time SB_ACTIVE
> > > is cleared we have sb->s_root == NULL which basically stops most of the
> > > places looking at superblocks). As I'm grepping we've grown a lot of
> > > SB_ACTIVE handling all over the place so this would be a bit non-trivial
> > > but I believe it will make it easier for filesystem developers to decide
> > > which flag they should be using... Also we could then drop sb->s_root
> > > checks from many places because the locking helpers will return false if
> > > SB_DYING is set.
> > 
> > Certainly something to explore but no promises. Would you be open to
> > doing this as a follow-up patch? If you have a clearer idea here then I
> > wouldn't mind you piling this on top of this series even.
> 
> Sure, the cleanup of SB_ACTIVE probably deserves a separate patchset
> because it's going to involve a lot of individual filesystem changes.

Yeah, agreed.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] super: wait until we passed kill super
  2023-08-17 10:47 ` [PATCH 3/3] super: wait until we passed kill super Christian Brauner
@ 2023-08-17 14:37   ` Jan Kara
  2023-08-17 14:54     ` Christian Brauner
  0 siblings, 1 reply; 15+ messages in thread
From: Jan Kara @ 2023-08-17 14:37 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu 17-08-23 12:47:44, Christian Brauner wrote:
> Recent rework moved block device closing out of sb->put_super() and into
> sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and
> blkdev_put() can end up taking s_umount again.
> 
> That means we need to move the removal of the superblock from @fs_supers
> out of generic_shutdown_super() and into deactivate_locked_super() to
> ensure that concurrent mounters don't fail to open block devices that
> are still in use because blkdev_put() in sb->kill_sb() hasn't been
> called yet.
> 
> We can now do this as we can make iterators through @fs_supers and
> @super_blocks wait without holding s_umount. Concurrent mounts will wait
> until a dying superblock is fully dead, i.e. until sb->kill_sb() has been
> called and SB_DEAD has been set. Concurrent iterators can already discard
> any SB_DYING superblock.
> 
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
>  fs/super.c         | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-----
>  include/linux/fs.h |  1 +
>  2 files changed, 66 insertions(+), 6 deletions(-)

<snip>

> @@ -456,6 +497,25 @@ void deactivate_locked_super(struct super_block *s)
>  		list_lru_destroy(&s->s_dentry_lru);
>  		list_lru_destroy(&s->s_inode_lru);
>  
> +		/*
> +		 * Remove it from @fs_supers so it isn't found by new
> +		 * sget{_fc}() walkers anymore. Any concurrent mounter still
> +		 * managing to grab a temporary reference is guaranteed to
> +		 * already see SB_DYING and will wait until we notify them about
> +		 * SB_DEAD.
> +		 */
> +		spin_lock(&sb_lock);
> +		hlist_del_init(&s->s_instances);
> +		spin_unlock(&sb_lock);
> +
> +		/*
> +		 * Let concurrent mounts know that this thing is really dead.
> +		 * We don't need @sb->s_umount here as every concurrent caller
> +		 * will see SB_DYING and either discard the superblock or wait
> +		 * for SB_DEAD.
> +		 */
> +		super_wake(s, SB_DEAD);
> +
>  		put_filesystem(fs);
>  		put_super(s);
>  	} else {
> @@ -638,15 +698,14 @@ void generic_shutdown_super(struct super_block *sb)
>  			spin_unlock(&sb->s_inode_list_lock);
>  		}
>  	}
> -	spin_lock(&sb_lock);
> -	/* should be initialized for __put_super_and_need_restart() */
> -	hlist_del_init(&sb->s_instances);
> -	spin_unlock(&sb_lock);

OK, but we have several checks of hlist_unhashed(&sb->s_instances) in the
code whose meaning is now subtly changed. We have:
  trylock_super() - needs SB_DYING check instead of s_instances check
  __iterate_supers() - probably we should add SB_DYING check to not block
    emergency operations on s_umount unnecessarily and drop s_instances
    check (see the sketch below)
  iterate_supers() - we can drop s_instances check
  get_super() - we can drop s_instances check
  get_active_super() - we can drop s_instances check
  user_get_super() - we can drop s_instances check
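
For __iterate_supers() the tweak could look like this (a sketch based
on the current helper, untested; the deferred __put_super() dance is
kept unchanged):

	static void __iterate_supers(void (*f)(struct super_block *))
	{
		struct super_block *sb, *p = NULL;

		spin_lock(&sb_lock);
		list_for_each_entry(sb, &super_blocks, s_list) {
			/* skip superblocks past generic_shutdown_super() */
			if (sb->s_flags & SB_DYING)
				continue;
			sb->s_count++;
			spin_unlock(&sb_lock);

			f(sb);

			spin_lock(&sb_lock);
			if (p)
				__put_super(p);
			p = sb;
		}
		if (p)
			__put_super(p);
		spin_unlock(&sb_lock);
	}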

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] super: wait until we passed kill super
  2023-08-17 14:37   ` Jan Kara
@ 2023-08-17 14:54     ` Christian Brauner
  2023-08-17 16:27       ` Jan Kara
  0 siblings, 1 reply; 15+ messages in thread
From: Christian Brauner @ 2023-08-17 14:54 UTC (permalink / raw)
  To: Jan Kara; +Cc: Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu, Aug 17, 2023 at 04:37:36PM +0200, Jan Kara wrote:
> On Thu 17-08-23 12:47:44, Christian Brauner wrote:
> > Recent rework moved block device closing out of sb->put_super() and into
> > sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and
> > blkdev_put() can end up taking s_umount again.
> > 
> > That means we need to move the removal of the superblock from @fs_supers
> > out of generic_shutdown_super() and into deactivate_locked_super() to
> > ensure that concurrent mounters don't fail to open block devices that
> > are still in use because blkdev_put() in sb->kill_sb() hasn't been
> > called yet.
> > 
> > We can now do this as we can make iterators through @fs_supers and
> > @super_blocks wait without holding s_umount. Concurrent mounts will wait
> > until a dying superblock is fully dead, i.e. until sb->kill_sb() has been
> > called and SB_DEAD has been set. Concurrent iterators can already discard
> > any SB_DYING superblock.
> > 
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> >  fs/super.c         | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> >  include/linux/fs.h |  1 +
> >  2 files changed, 66 insertions(+), 6 deletions(-)
> 
> <snip>
> 
> > @@ -456,6 +497,25 @@ void deactivate_locked_super(struct super_block *s)
> >  		list_lru_destroy(&s->s_dentry_lru);
> >  		list_lru_destroy(&s->s_inode_lru);
> >  
> > +		/*
> > +		 * Remove it from @fs_supers so it isn't found by new
> > +		 * sget{_fc}() walkers anymore. Any concurrent mounter still
> > +		 * managing to grab a temporary reference is guaranteed to
> > +		 * already see SB_DYING and will wait until we notify them about
> > +		 * SB_DEAD.
> > +		 */
> > +		spin_lock(&sb_lock);
> > +		hlist_del_init(&s->s_instances);
> > +		spin_unlock(&sb_lock);
> > +
> > +		/*
> > +		 * Let concurrent mounts know that this thing is really dead.
> > +		 * We don't need @sb->s_umount here as every concurrent caller
> > +		 * will see SB_DYING and either discard the superblock or wait
> > +		 * for SB_DEAD.
> > +		 */
> > +		super_wake(s, SB_DEAD);
> > +
> >  		put_filesystem(fs);
> >  		put_super(s);
> >  	} else {
> > @@ -638,15 +698,14 @@ void generic_shutdown_super(struct super_block *sb)
> >  			spin_unlock(&sb->s_inode_list_lock);
> >  		}
> >  	}
> > -	spin_lock(&sb_lock);
> > -	/* should be initialized for __put_super_and_need_restart() */
> > -	hlist_del_init(&sb->s_instances);
> > -	spin_unlock(&sb_lock);
> 
> OK, but we have several checks of hlist_unhashed(&sb->s_instances) in the
> code whose meaning is now subtly changed. We have:

If by changed meaning you mean they can be dropped, then yes.
That's what I understand you as saying given the following list.

>   trylock_super() - needs SB_DYING check instead of s_instances check
>   __iterate_supers() - probably we should add SB_DYING check to not block
>     emergency operations on s_umount unnecessarily and drop s_instances
>     check
>   iterate_supers() - we can drop s_instances check
>   get_super() - we can drop s_instances check
>   get_active_super() - we can drop s_instances check
>   user_get_super() - we can drop s_instances check

But does this otherwise look reasonable?

(Btw, just because I noticed it, do you prefer suse.cz or suse.com?)

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/3] super: wait until we passed kill super
  2023-08-17 14:54     ` Christian Brauner
@ 2023-08-17 16:27       ` Jan Kara
  0 siblings, 0 replies; 15+ messages in thread
From: Jan Kara @ 2023-08-17 16:27 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu 17-08-23 16:54:32, Christian Brauner wrote:
> On Thu, Aug 17, 2023 at 04:37:36PM +0200, Jan Kara wrote:
> > On Thu 17-08-23 12:47:44, Christian Brauner wrote:
> > > Recent rework moved block device closing out of sb->put_super() and into
> > > sb->kill_sb() to avoid deadlocks as s_umount is held in put_super() and
> > > blkdev_put() can end up taking s_umount again.
> > > 
> > > That means we need to move the removal of the superblock from @fs_supers
> > > out of generic_shutdown_super() and into deactivate_locked_super() to
> > > ensure that concurrent mounters don't fail to open block devices that
> > > are still in use because blkdev_put() in sb->kill_sb() hasn't been
> > > called yet.
> > > 
> > > We can now do this as we can make iterators through @fs_supers and
> > > @super_blocks wait without holding s_umount. Concurrent mounts will wait
> > > until a dying superblock is fully dead, i.e. until sb->kill_sb() has been
> > > called and SB_DEAD has been set. Concurrent iterators can already discard
> > > any SB_DYING superblock.
> > > 
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > >  fs/super.c         | 71 +++++++++++++++++++++++++++++++++++++++++++++++++-----
> > >  include/linux/fs.h |  1 +
> > >  2 files changed, 66 insertions(+), 6 deletions(-)
> > 
> > <snip>
> > 
> > > @@ -456,6 +497,25 @@ void deactivate_locked_super(struct super_block *s)
> > >  		list_lru_destroy(&s->s_dentry_lru);
> > >  		list_lru_destroy(&s->s_inode_lru);
> > >  
> > > +		/*
> > > +		 * Remove it from @fs_supers so it isn't found by new
> > > +		 * sget{_fc}() walkers anymore. Any concurrent mounter still
> > > +		 * managing to grab a temporary reference is guaranteed to
> > > +		 * already see SB_DYING and will wait until we notify them about
> > > +		 * SB_DEAD.
> > > +		 */
> > > +		spin_lock(&sb_lock);
> > > +		hlist_del_init(&s->s_instances);
> > > +		spin_unlock(&sb_lock);
> > > +
> > > +		/*
> > > +		 * Let concurrent mounts know that this thing is really dead.
> > > +		 * We don't need @sb->s_umount here as every concurrent caller
> > > +		 * will see SB_DYING and either discard the superblock or wait
> > > +		 * for SB_DEAD.
> > > +		 */
> > > +		super_wake(s, SB_DEAD);
> > > +
> > >  		put_filesystem(fs);
> > >  		put_super(s);
> > >  	} else {
> > > @@ -638,15 +698,14 @@ void generic_shutdown_super(struct super_block *sb)
> > >  			spin_unlock(&sb->s_inode_list_lock);
> > >  		}
> > >  	}
> > > -	spin_lock(&sb_lock);
> > > -	/* should be initialized for __put_super_and_need_restart() */
> > > -	hlist_del_init(&sb->s_instances);
> > > -	spin_unlock(&sb_lock);
> > 
> > OK, but we have several checks of hlist_unhashed(&sb->s_instances) in the
> > code whose meaning is now subtly changed. We have:
> 
> If by changed meaning you mean they can be dropped, then yes.
> That's what I understand you as saying given the following list.

Yes, they can all be dropped, but we probably need an SB_DYING check in
trylock_super() and preferably in __iterate_supers() as well.
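
For trylock_super() that would be roughly (a sketch, untested; the
exact set of checks is what's being discussed here):

	static bool trylock_super(struct super_block *sb)
	{
		if (down_read_trylock(&sb->s_umount)) {
			if (!(sb->s_flags & SB_DYING) && sb->s_root &&
			    (sb->s_flags & SB_BORN))
				return true;
			super_unlock_read(sb);
		}
		return false;
	}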

> >   trylock_super() - needs SB_DYING check instead of s_instances check
> >   __iterate_supers() - probably we should add SB_DYING check to not block
> >     emergency operations on s_umount unnecessarily and drop s_instances
> >     check
> >   iterate_supers() - we can drop s_instances check
> >   get_super() - we can drop s_instances check
> >   get_active_super() - we can drop s_instances check
> >   user_get_super() - we can drop s_instances check
> 
> But does this otherwise look reasonable?

Yes, otherwise the patch looks good to me.
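
For completeness, the SB_DEAD wait this patch relies on can be
sketched like the SB_BORN side (my assumption of the shape; the
actual patch may structure it differently, and the helper name here
is made up):

	static inline bool wait_dead(struct super_block *sb)
	{
		/* pairs with smp_store_release() in super_wake() */
		return smp_load_acquire(&sb->s_flags) & SB_DEAD;
	}

	/* called from sget{_fc}() with sb_lock held after spotting a
	 * dying superblock; sleeps until kill_sb() has broadcast
	 * SB_DEAD, the caller then retries the @fs_supers walk */
	static void wait_for_dead_sb(struct super_block *sb)
	{
		sb->s_count++;
		spin_unlock(&sb_lock);
		wait_var_event(&sb->s_flags, wait_dead(sb));
		spin_lock(&sb_lock);
		__put_super(sb);
		spin_unlock(&sb_lock);
	}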

> (Btw, just because I noticed it, do you prefer suse.cz or suse.com?)

I prefer suse.cz because suse.com comes through MS Exchange before getting
into our Linux mailing system and that occasionally causes trouble.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 12:50   ` Jan Kara
  2023-08-17 13:24     ` Christian Brauner
@ 2023-08-17 20:53     ` Matthew Wilcox
  2023-08-18 10:56       ` Christian Brauner
  1 sibling, 1 reply; 15+ messages in thread
From: Matthew Wilcox @ 2023-08-17 20:53 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christian Brauner, Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu, Aug 17, 2023 at 02:50:21PM +0200, Jan Kara wrote:
> > +/**
> > + * super_wait - wait for superblock to become ready
> > + * @sb: superblock to wait for
> > + * @excl: whether exclusive access is required
> > + *
> > + * If the superblock has passed through neither vfs_get_tree() nor
> > + * generic_shutdown_super() yet, wait for it to happen. Either superblock
> > + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
> > + * woken and we'll see SB_DYING.
> > + *
> > + * The caller must have acquired a temporary reference on @sb->s_count.
> > + *
> > + * Return: true if SB_BORN was set, false if SB_DYING was set.
> 
> The comment should mention that this acquires s_umount and returns with it
> held. Also the name is a bit too generic for my taste and not expressing
> the fact this is in fact a lock operation. Maybe something like
> super_lock_wait_born()?

Isn't this actually the normal function we want people to call?  So maybe
this function should be called super_lock() and the functions in the
last patch get called __super_lock() for raw access to the lock.

I'm also a little wary that this isn't _killable.  Are we guaranteed
that a superblock will transition to BORN or DYING within a limited
time?

This isn't part of the VFS I know well.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/3] super: wait for nascent superblocks
  2023-08-17 20:53     ` Matthew Wilcox
@ 2023-08-18 10:56       ` Christian Brauner
  0 siblings, 0 replies; 15+ messages in thread
From: Christian Brauner @ 2023-08-18 10:56 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Jan Kara, Jan Kara, Christoph Hellwig, Al Viro, linux-fsdevel

On Thu, Aug 17, 2023 at 09:53:43PM +0100, Matthew Wilcox wrote:
> On Thu, Aug 17, 2023 at 02:50:21PM +0200, Jan Kara wrote:
> > > +/**
> > > + * super_wait - wait for superblock to become ready
> > > + * @sb: superblock to wait for
> > > + * @excl: whether exclusive access is required
> > > + *
> > > + * If the superblock has passed through neither vfs_get_tree() nor
> > > + * generic_shutdown_super() yet, wait for it to happen. Either superblock
> > > + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
> > > + * woken and we'll see SB_DYING.
> > > + *
> > > + * The caller must have acquired a temporary reference on @sb->s_count.
> > > + *
> > > + * Return: true if SB_BORN was set, false if SB_DYING was set.
> > 
> > The comment should mention that this acquires s_umount and returns with it
> > held. Also the name is a bit too generic for my taste and not expressing
> > the fact this is in fact a lock operation. Maybe something like
> > super_lock_wait_born()?
> 
> Isn't this actually the normal function we want people to call?  So maybe
> this function should be called super_lock() and the functions in the
> last patch get called __super_lock() for raw access to the lock.

I think that's a good point. I've renamed them accordingly.
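
Concretely, something like this (a sketch of the rename only,
behavior unchanged):

	/* raw helpers for callers that must not wait */
	static inline void __super_lock(struct super_block *sb, bool excl)
	{
		if (excl)
			down_write(&sb->s_umount);
		else
			down_read(&sb->s_umount);
	}

	/* the normal entry point: wait until SB_BORN or SB_DYING is set,
	 * then return with s_umount held */
	static bool super_lock(struct super_block *sb, bool excl);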

> 
> I'm also a little wary that this isn't _killable.  Are we guaranteed
> that a superblock will transition to BORN or DYING within a limited
> time?

Yes, it should. If this were an issue it's something we should have
seen already. A random example is s_umount being held over a
filesystem's fill_super() method all while concurrent iterators are
sleeping on s_umount unconditionally. It is of course always possible for us
to make this killable in the future should it become an issue.
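
Should that ever change, a killable variant would be a small delta on
top of the helpers here (a sketch using the patch 1/2 names and
wait_var_event_killable(); error handling is illustrative):

	static int super_lock_killable(struct super_block *sb, bool excl,
				       bool *born)
	{
		int err;

		lockdep_assert_not_held(&sb->s_umount);
	relock:
		super_lock(sb, excl);
		if (sb->s_flags & SB_DYING) {
			*born = false;
			return 0;
		}
		if (sb->s_flags & SB_BORN) {
			*born = true;
			return 0;
		}
		super_unlock(sb, excl);

		/* returns -ERESTARTSYS if a fatal signal is pending */
		err = wait_var_event_killable(&sb->s_flags, wait_born(sb));
		if (err)
			return err;
		goto relock;
	}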

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread

Thread overview: 15+ messages
2023-08-17 10:47 [PATCH 0/3] super: allow waiting without s_umount held Christian Brauner
2023-08-17 10:47 ` [PATCH 1/3] super: use super_{lock,unlock}() Christian Brauner
2023-08-17 11:39   ` Jan Kara
2023-08-17 10:47 ` [PATCH 2/3] super: wait for nascent superblocks Christian Brauner
2023-08-17 12:50   ` Jan Kara
2023-08-17 13:24     ` Christian Brauner
2023-08-17 13:41       ` Christian Brauner
2023-08-17 14:16       ` Jan Kara
2023-08-17 14:30         ` Christian Brauner
2023-08-17 20:53     ` Matthew Wilcox
2023-08-18 10:56       ` Christian Brauner
2023-08-17 10:47 ` [PATCH 3/3] super: wait until we passed kill super Christian Brauner
2023-08-17 14:37   ` Jan Kara
2023-08-17 14:54     ` Christian Brauner
2023-08-17 16:27       ` Jan Kara
