linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
@ 2021-11-18 23:00 Minchan Kim
  2021-11-18 23:03 ` Tejun Heo
       [not found] ` <CGME20211126115446eucas1p2e377cf7718c6da6bfe40a016bc0191a8@eucas1p2.samsung.com>
  0 siblings, 2 replies; 8+ messages in thread
From: Minchan Kim @ 2021-11-18 23:00 UTC (permalink / raw)
  To: Greg Kroah-Hartman, Tejun Heo; +Cc: LKML, Minchan Kim

The kernfs implementation uses a single coarse-grained lock (kernfs_rwsem),
so every kernfs-based filesystem (e.g., sysfs, cgroup) competes for the
same lock. As a result, some contexts can end up waiting on the global
lock for a long time even though they are totally independent of each
other.

A typical example: process A enters direct reclaim while holding the
lock after accessing a file in sysfs, process B is waiting for the lock
in exclusive mode, and process C then has to wait for the lock until
process B, once it gets the lock from process A, has finished its job.

This patch switches the global kernfs_rwsem to a per-fs lock by
moving the rwsem into struct kernfs_root.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
from RFC - https://lore.kernel.org/lkml/20211116194317.1430399-1-minchan@kernel.org/
 * move kernfs_rwsem to kernfs_root - tejun@

 fs/kernfs/dir.c        | 110 ++++++++++++++++++++++++-----------------
 fs/kernfs/file.c       |   6 ++-
 fs/kernfs/inode.c      |  22 ++++++---
 fs/kernfs/mount.c      |  15 +++---
 fs/kernfs/symlink.c    |   5 +-
 include/linux/kernfs.h |   2 +
 6 files changed, 97 insertions(+), 63 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8e0a1378a4b1..13cae0ccce74 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,6 @@
 
 #include "kernfs-internal.h"
 
-DECLARE_RWSEM(kernfs_rwsem);
 static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
 static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
 static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
@@ -26,7 +25,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
 
 static bool kernfs_active(struct kernfs_node *kn)
 {
-	lockdep_assert_held(&kernfs_rwsem);
+	lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
 	return atomic_read(&kn->active) >= 0;
 }
 
@@ -457,14 +456,15 @@ void kernfs_put_active(struct kernfs_node *kn)
  * return after draining is complete.
  */
 static void kernfs_drain(struct kernfs_node *kn)
-	__releases(&kernfs_rwsem) __acquires(&kernfs_rwsem)
+	__releases(&kernfs_root(kn)->kernfs_rwsem)
+	__acquires(&kernfs_root(kn)->kernfs_rwsem)
 {
 	struct kernfs_root *root = kernfs_root(kn);
 
-	lockdep_assert_held_write(&kernfs_rwsem);
+	lockdep_assert_held_write(&root->kernfs_rwsem);
 	WARN_ON_ONCE(kernfs_active(kn));
 
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 
 	if (kernfs_lockdep(kn)) {
 		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
@@ -483,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn)
 
 	kernfs_drain_open_files(kn);
 
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 }
 
 /**
@@ -718,11 +718,12 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 int kernfs_add_one(struct kernfs_node *kn)
 {
 	struct kernfs_node *parent = kn->parent;
+	struct kernfs_root *root = kernfs_root(parent);
 	struct kernfs_iattrs *ps_iattr;
 	bool has_ns;
 	int ret;
 
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 
 	ret = -EINVAL;
 	has_ns = kernfs_ns_enabled(parent);
@@ -753,7 +754,7 @@ int kernfs_add_one(struct kernfs_node *kn)
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
 
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 
 	/*
 	 * Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -767,7 +768,7 @@ int kernfs_add_one(struct kernfs_node *kn)
 	return 0;
 
 out_unlock:
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 	return ret;
 }
 
@@ -788,7 +789,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
 	bool has_ns = kernfs_ns_enabled(parent);
 	unsigned int hash;
 
-	lockdep_assert_held(&kernfs_rwsem);
+	lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);
 
 	if (has_ns != (bool)ns) {
 		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
@@ -820,7 +821,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
 	size_t len;
 	char *p, *name;
 
-	lockdep_assert_held_read(&kernfs_rwsem);
+	lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
 
 	/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
 	spin_lock_irq(&kernfs_rename_lock);
@@ -859,11 +860,12 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
 					   const char *name, const void *ns)
 {
 	struct kernfs_node *kn;
+	struct kernfs_root *root = kernfs_root(parent);
 
-	down_read(&kernfs_rwsem);
+	down_read(&root->kernfs_rwsem);
 	kn = kernfs_find_ns(parent, name, ns);
 	kernfs_get(kn);
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 
 	return kn;
 }
@@ -883,11 +885,12 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
 					   const char *path, const void *ns)
 {
 	struct kernfs_node *kn;
+	struct kernfs_root *root = kernfs_root(parent);
 
-	down_read(&kernfs_rwsem);
+	down_read(&root->kernfs_rwsem);
 	kn = kernfs_walk_ns(parent, path, ns);
 	kernfs_get(kn);
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 
 	return kn;
 }
@@ -912,6 +915,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 		return ERR_PTR(-ENOMEM);
 
 	idr_init(&root->ino_idr);
+	init_rwsem(&root->kernfs_rwsem);
 	INIT_LIST_HEAD(&root->supers);
 
 	/*
@@ -1035,6 +1039,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
 static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct kernfs_node *kn;
+	struct kernfs_root *root;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -1046,18 +1051,19 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
 		/* If the kernfs parent node has changed discard and
 		 * proceed to ->lookup.
 		 */
-		down_read(&kernfs_rwsem);
 		spin_lock(&dentry->d_lock);
 		parent = kernfs_dentry_node(dentry->d_parent);
 		if (parent) {
+			spin_unlock(&dentry->d_lock);
+			root = kernfs_root(parent);
+			down_read(&root->kernfs_rwsem);
 			if (kernfs_dir_changed(parent, dentry)) {
-				spin_unlock(&dentry->d_lock);
-				up_read(&kernfs_rwsem);
+				up_read(&root->kernfs_rwsem);
 				return 0;
 			}
-		}
-		spin_unlock(&dentry->d_lock);
-		up_read(&kernfs_rwsem);
+			up_read(&root->kernfs_rwsem);
+		} else
+			spin_unlock(&dentry->d_lock);
 
 		/* The kernfs parent node hasn't changed, leave the
 		 * dentry negative and return success.
@@ -1066,7 +1072,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
 	}
 
 	kn = kernfs_dentry_node(dentry);
-	down_read(&kernfs_rwsem);
+	root = kernfs_root(kn);
+	down_read(&root->kernfs_rwsem);
 
 	/* The kernfs node has been deactivated */
 	if (!kernfs_active(kn))
@@ -1085,10 +1092,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
 	    kernfs_info(dentry->d_sb)->ns != kn->ns)
 		goto out_bad;
 
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 	return 1;
 out_bad:
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 	return 0;
 }
 
@@ -1102,10 +1109,12 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
 {
 	struct kernfs_node *parent = dir->i_private;
 	struct kernfs_node *kn;
+	struct kernfs_root *root;
 	struct inode *inode = NULL;
 	const void *ns = NULL;
 
-	down_read(&kernfs_rwsem);
+	root = kernfs_root(parent);
+	down_read(&root->kernfs_rwsem);
 	if (kernfs_ns_enabled(parent))
 		ns = kernfs_info(dir->i_sb)->ns;
 
@@ -1116,7 +1125,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
 		 * create a negative.
 		 */
 		if (!kernfs_active(kn)) {
-			up_read(&kernfs_rwsem);
+			up_read(&root->kernfs_rwsem);
 			return NULL;
 		}
 		inode = kernfs_get_inode(dir->i_sb, kn);
@@ -1131,7 +1140,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
 	 */
 	if (!IS_ERR(inode))
 		kernfs_set_rev(parent, dentry);
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 
 	/* instantiate and hash (possibly negative) dentry */
 	return d_splice_alias(inode, dentry);
@@ -1254,7 +1263,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
 {
 	struct rb_node *rbn;
 
-	lockdep_assert_held_write(&kernfs_rwsem);
+	lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);
 
 	/* if first iteration, visit leftmost descendant which may be root */
 	if (!pos)
@@ -1289,8 +1298,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
 void kernfs_activate(struct kernfs_node *kn)
 {
 	struct kernfs_node *pos;
+	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 
 	pos = NULL;
 	while ((pos = kernfs_next_descendant_post(pos, kn))) {
@@ -1304,14 +1314,14 @@ void kernfs_activate(struct kernfs_node *kn)
 		pos->flags |= KERNFS_ACTIVATED;
 	}
 
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 }
 
 static void __kernfs_remove(struct kernfs_node *kn)
 {
 	struct kernfs_node *pos;
 
-	lockdep_assert_held_write(&kernfs_rwsem);
+	lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
 
 	/*
 	 * Short-circuit if non-root @kn has already finished removal.
@@ -1381,9 +1391,11 @@ static void __kernfs_remove(struct kernfs_node *kn)
  */
 void kernfs_remove(struct kernfs_node *kn)
 {
-	down_write(&kernfs_rwsem);
+	struct kernfs_root *root = kernfs_root(kn);
+
+	down_write(&root->kernfs_rwsem);
 	__kernfs_remove(kn);
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 }
 
 /**
@@ -1469,8 +1481,9 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
 bool kernfs_remove_self(struct kernfs_node *kn)
 {
 	bool ret;
+	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 	kernfs_break_active_protection(kn);
 
 	/*
@@ -1498,9 +1511,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
 			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
 				break;
 
-			up_write(&kernfs_rwsem);
+			up_write(&root->kernfs_rwsem);
 			schedule();
-			down_write(&kernfs_rwsem);
+			down_write(&root->kernfs_rwsem);
 		}
 		finish_wait(waitq, &wait);
 		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1513,7 +1526,7 @@ bool kernfs_remove_self(struct kernfs_node *kn)
 	 */
 	kernfs_unbreak_active_protection(kn);
 
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 	return ret;
 }
 
@@ -1530,6 +1543,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
 			     const void *ns)
 {
 	struct kernfs_node *kn;
+	struct kernfs_root *root;
 
 	if (!parent) {
 		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
@@ -1537,13 +1551,14 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
 		return -ENOENT;
 	}
 
-	down_write(&kernfs_rwsem);
+	root = kernfs_root(parent);
+	down_write(&root->kernfs_rwsem);
 
 	kn = kernfs_find_ns(parent, name, ns);
 	if (kn)
 		__kernfs_remove(kn);
 
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 
 	if (kn)
 		return 0;
@@ -1562,6 +1577,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		     const char *new_name, const void *new_ns)
 {
 	struct kernfs_node *old_parent;
+	struct kernfs_root *root;
 	const char *old_name = NULL;
 	int error;
 
@@ -1569,7 +1585,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 	if (!kn->parent)
 		return -EINVAL;
 
-	down_write(&kernfs_rwsem);
+	root = kernfs_root(kn);
+	down_write(&root->kernfs_rwsem);
 
 	error = -ENOENT;
 	if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
@@ -1623,7 +1640,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 
 	error = 0;
  out:
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 	return error;
 }
 
@@ -1694,11 +1711,14 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
 	struct dentry *dentry = file->f_path.dentry;
 	struct kernfs_node *parent = kernfs_dentry_node(dentry);
 	struct kernfs_node *pos = file->private_data;
+	struct kernfs_root *root;
 	const void *ns = NULL;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
-	down_read(&kernfs_rwsem);
+
+	root = kernfs_root(parent);
+	down_read(&root->kernfs_rwsem);
 
 	if (kernfs_ns_enabled(parent))
 		ns = kernfs_info(dentry->d_sb)->ns;
@@ -1715,12 +1735,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
 		file->private_data = pos;
 		kernfs_get(pos);
 
-		up_read(&kernfs_rwsem);
+		up_read(&root->kernfs_rwsem);
 		if (!dir_emit(ctx, name, len, ino, type))
 			return 0;
-		down_read(&kernfs_rwsem);
+		down_read(&root->kernfs_rwsem);
 	}
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 	file->private_data = NULL;
 	ctx->pos = INT_MAX;
 	return 0;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 60e2a86c535e..9414a7a60a9f 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -847,6 +847,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
 {
 	struct kernfs_node *kn;
 	struct kernfs_super_info *info;
+	struct kernfs_root *root;
 repeat:
 	/* pop one off the notify_list */
 	spin_lock_irq(&kernfs_notify_lock);
@@ -859,8 +860,9 @@ static void kernfs_notify_workfn(struct work_struct *work)
 	kn->attr.notify_next = NULL;
 	spin_unlock_irq(&kernfs_notify_lock);
 
+	root = kernfs_root(kn);
 	/* kick fsnotify */
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 
 	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
 		struct kernfs_node *parent;
@@ -898,7 +900,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
 		iput(inode);
 	}
 
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 	kernfs_put(kn);
 	goto repeat;
 }
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index c0eae1725435..3d783d80f5da 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,10 +99,11 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
 	int ret;
+	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 	ret = __kernfs_setattr(kn, iattr);
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 	return ret;
 }
 
@@ -111,12 +112,14 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 {
 	struct inode *inode = d_inode(dentry);
 	struct kernfs_node *kn = inode->i_private;
+	struct kernfs_root *root;
 	int error;
 
 	if (!kn)
 		return -EINVAL;
 
-	down_write(&kernfs_rwsem);
+	root = kernfs_root(kn);
+	down_write(&root->kernfs_rwsem);
 	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		goto out;
@@ -129,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	setattr_copy(&init_user_ns, inode, iattr);
 
 out:
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 	return error;
 }
 
@@ -184,13 +187,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct kernfs_node *kn = inode->i_private;
+	struct kernfs_root *root = kernfs_root(kn);
 
-	down_read(&kernfs_rwsem);
+	down_read(&root->kernfs_rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	generic_fillattr(&init_user_ns, inode, stat);
 	spin_unlock(&inode->i_lock);
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 
 	return 0;
 }
@@ -274,19 +278,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 			  struct inode *inode, int mask)
 {
 	struct kernfs_node *kn;
+	struct kernfs_root *root;
 	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
 	kn = inode->i_private;
+	root = kernfs_root(kn);
 
-	down_read(&kernfs_rwsem);
+	down_read(&root->kernfs_rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	ret = generic_permission(&init_user_ns, inode, mask);
 	spin_unlock(&inode->i_lock);
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 
 	return ret;
 }
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f2f909d09f52..cfa79715fc1a 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -236,6 +236,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
 static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
+	struct kernfs_root *kf_root = kfc->root;
 	struct inode *inode;
 	struct dentry *root;
 
@@ -255,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
 	sb->s_shrink.seeks = 0;
 
 	/* get root inode, initialize and unlock it */
-	down_read(&kernfs_rwsem);
+	down_read(&kf_root->kernfs_rwsem);
 	inode = kernfs_get_inode(sb, info->root->kn);
-	up_read(&kernfs_rwsem);
+	up_read(&kf_root->kernfs_rwsem);
 	if (!inode) {
 		pr_debug("kernfs: could not get root inode\n");
 		return -ENOMEM;
@@ -334,6 +335,7 @@ int kernfs_get_tree(struct fs_context *fc)
 
 	if (!sb->s_root) {
 		struct kernfs_super_info *info = kernfs_info(sb);
+		struct kernfs_root *root = kfc->root;
 
 		kfc->new_sb_created = true;
 
@@ -344,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc)
 		}
 		sb->s_flags |= SB_ACTIVE;
 
-		down_write(&kernfs_rwsem);
+		down_write(&root->kernfs_rwsem);
 		list_add(&info->node, &info->root->supers);
-		up_write(&kernfs_rwsem);
+		up_write(&root->kernfs_rwsem);
 	}
 
 	fc->root = dget(sb->s_root);
@@ -371,10 +373,11 @@ void kernfs_free_fs_context(struct fs_context *fc)
 void kernfs_kill_sb(struct super_block *sb)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
+	struct kernfs_root *root = info->root;
 
-	down_write(&kernfs_rwsem);
+	down_write(&root->kernfs_rwsem);
 	list_del(&info->node);
-	up_write(&kernfs_rwsem);
+	up_write(&root->kernfs_rwsem);
 
 	/*
 	 * Remove the superblock from fs_supers/s_instances
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 19a6c71c6ff5..0ab13824822f 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -113,11 +113,12 @@ static int kernfs_getlink(struct inode *inode, char *path)
 	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_node *parent = kn->parent;
 	struct kernfs_node *target = kn->symlink.target_kn;
+	struct kernfs_root *root = kernfs_root(parent);
 	int error;
 
-	down_read(&kernfs_rwsem);
+	down_read(&root->kernfs_rwsem);
 	error = kernfs_get_target_path(parent, target, path);
-	up_read(&kernfs_rwsem);
+	up_read(&root->kernfs_rwsem);
 
 	return error;
 }
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 3ccce6f24548..9f650986a81b 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -16,6 +16,7 @@
 #include <linux/atomic.h>
 #include <linux/uidgid.h>
 #include <linux/wait.h>
+#include <linux/rwsem.h>
 
 struct file;
 struct dentry;
@@ -197,6 +198,7 @@ struct kernfs_root {
 	struct list_head	supers;
 
 	wait_queue_head_t	deactivate_waitq;
+	struct rw_semaphore	kernfs_rwsem;
 };
 
 struct kernfs_open_file {
-- 
2.34.0.rc2.393.gf8c9666880-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
  2021-11-18 23:00 [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock Minchan Kim
@ 2021-11-18 23:03 ` Tejun Heo
  2021-11-22 17:39   ` Minchan Kim
       [not found] ` <CGME20211126115446eucas1p2e377cf7718c6da6bfe40a016bc0191a8@eucas1p2.samsung.com>
  1 sibling, 1 reply; 8+ messages in thread
From: Tejun Heo @ 2021-11-18 23:03 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Greg Kroah-Hartman, LKML

On Thu, Nov 18, 2021 at 03:00:08PM -0800, Minchan Kim wrote:
> The kernfs implementation has big lock granularity(kernfs_rwsem) so
> every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
> lock. It makes trouble for some cases to wait the global lock
> for a long time even though they are totally independent contexts
> each other.
> 
> A general example is process A goes under direct reclaim with holding
> the lock when it accessed the file in sysfs and process B is waiting
> the lock with exclusive mode and then process C is waiting the lock
> until process B could finish the job after it gets the lock from
> process A.
> 
> This patch switches the global kernfs_rwsem to per-fs lock, which
> put the rwsem into kernfs_root.
> 
> Suggested-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

Acked-by: Tejun Heo <tj@kernel.org>

Greg, I think this is the right thing to do even if there is no concrete
performance argument (not saying there isn't). It's just weird to entangle
these completely unrelated users in a single rwsem.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
  2021-11-18 23:03 ` Tejun Heo
@ 2021-11-22 17:39   ` Minchan Kim
  2021-11-23  6:44     ` Greg Kroah-Hartman
  0 siblings, 1 reply; 8+ messages in thread
From: Minchan Kim @ 2021-11-22 17:39 UTC (permalink / raw)
  To: Tejun Heo, Greg Kroah-Hartman; +Cc: LKML

On Thu, Nov 18, 2021 at 01:03:32PM -1000, Tejun Heo wrote:
> On Thu, Nov 18, 2021 at 03:00:08PM -0800, Minchan Kim wrote:
> > The kernfs implementation has big lock granularity(kernfs_rwsem) so
> > every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
> > lock. It makes trouble for some cases to wait the global lock
> > for a long time even though they are totally independent contexts
> > each other.
> > 
> > A general example is process A goes under direct reclaim with holding
> > the lock when it accessed the file in sysfs and process B is waiting
> > the lock with exclusive mode and then process C is waiting the lock
> > until process B could finish the job after it gets the lock from
> > process A.
> > 
> > This patch switches the global kernfs_rwsem to per-fs lock, which
> > put the rwsem into kernfs_root.
> > 
> > Suggested-by: Tejun Heo <tj@kernel.org>
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> 
> Acked-by: Tejun Heo <tj@kernel.org>
> 
> Greg, I think this is the right thing to do even if there is no concrete
> performance argument (not saying there isn't). It's just weird to entangle
> these completely unrelated users in a single rwsem.
> 
> Thanks.

Greg, do you mind picking up this patch?

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
  2021-11-22 17:39   ` Minchan Kim
@ 2021-11-23  6:44     ` Greg Kroah-Hartman
  0 siblings, 0 replies; 8+ messages in thread
From: Greg Kroah-Hartman @ 2021-11-23  6:44 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Tejun Heo, LKML

On Mon, Nov 22, 2021 at 09:39:28AM -0800, Minchan Kim wrote:
> On Thu, Nov 18, 2021 at 01:03:32PM -1000, Tejun Heo wrote:
> > On Thu, Nov 18, 2021 at 03:00:08PM -0800, Minchan Kim wrote:
> > > The kernfs implementation has big lock granularity(kernfs_rwsem) so
> > > every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
> > > lock. It makes trouble for some cases to wait the global lock
> > > for a long time even though they are totally independent contexts
> > > each other.
> > > 
> > > A general example is process A goes under direct reclaim with holding
> > > the lock when it accessed the file in sysfs and process B is waiting
> > > the lock with exclusive mode and then process C is waiting the lock
> > > until process B could finish the job after it gets the lock from
> > > process A.
> > > 
> > > This patch switches the global kernfs_rwsem to per-fs lock, which
> > > put the rwsem into kernfs_root.
> > > 
> > > Suggested-by: Tejun Heo <tj@kernel.org>
> > > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > 
> > Acked-by: Tejun Heo <tj@kernel.org>
> > 
> > Greg, I think this is the right thing to do even if there is no concrete
> > performance argument (not saying there isn't). It's just weird to entangle
> > these completely unrelated users in a single rwsem.
> > 
> > Thanks.
> 
> Greg, Do you mind picking this patch?

$ mdfrm -c ~/mail/todo/
1872 messages in /home/gregkh/mail/todo/

Give me a chance to catch up...

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
       [not found] ` <CGME20211126115446eucas1p2e377cf7718c6da6bfe40a016bc0191a8@eucas1p2.samsung.com>
@ 2021-11-26 11:54   ` Marek Szyprowski
  2021-11-29 14:23     ` Nguyen Dinh Phi
  2021-11-29 16:58     ` Minchan Kim
  0 siblings, 2 replies; 8+ messages in thread
From: Marek Szyprowski @ 2021-11-26 11:54 UTC (permalink / raw)
  To: Minchan Kim, Greg Kroah-Hartman, Tejun Heo; +Cc: LKML

Hi,

On 19.11.2021 00:00, Minchan Kim wrote:
> The kernfs implementation has big lock granularity(kernfs_rwsem) so
> every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
> lock. It makes trouble for some cases to wait the global lock
> for a long time even though they are totally independent contexts
> each other.
>
> A general example is process A goes under direct reclaim with holding
> the lock when it accessed the file in sysfs and process B is waiting
> the lock with exclusive mode and then process C is waiting the lock
> until process B could finish the job after it gets the lock from
> process A.
>
> This patch switches the global kernfs_rwsem to per-fs lock, which
> put the rwsem into kernfs_root.
>
> Suggested-by: Tejun Heo <tj@kernel.org>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

This patch landed recently in linux-next (20211126) as commit 
393c3714081a ("kernfs: switch global kernfs_rwsem lock to per-fs lock"). 
In my tests I've found that it causes the following warning during the 
system reboot:

  =========================
  WARNING: held lock freed!
  5.16.0-rc2+ #10984 Not tainted
  -------------------------
  kworker/1:0/18 is freeing memory ffff00004034e200-ffff00004034e3ff, with a lock still held there!
  ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at: __kernfs_remove+0x310/0x37c
  3 locks held by kworker/1:0/18:
   #0: ffff000040107938 ((wq_completion)cgroup_destroy){+.+.}-{0:0}, at: process_one_work+0x1f0/0x6f0
   #1: ffff80000b55bdc0 ((work_completion)(&(&css->destroy_rwork)->work)){+.+.}-{0:0}, at: process_one_work+0x1f0/0x6f0
   #2: ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at: __kernfs_remove+0x310/0x37c

  stack backtrace:
  CPU: 1 PID: 18 Comm: kworker/1:0 Not tainted 5.16.0-rc2+ #10984
  Hardware name: Raspberry Pi 4 Model B (DT)
  Workqueue: cgroup_destroy css_free_rwork_fn
  Call trace:
   dump_backtrace+0x0/0x1ac
   show_stack+0x18/0x24
   dump_stack_lvl+0x8c/0xb8
   dump_stack+0x18/0x34
   debug_check_no_locks_freed+0x124/0x140
   kfree+0xf0/0x3a4
   kernfs_put+0x1f8/0x224
   __kernfs_remove+0x1b8/0x37c
   kernfs_destroy_root+0x38/0x50
   css_free_rwork_fn+0x288/0x3d4
   process_one_work+0x288/0x6f0
   worker_thread+0x74/0x470
   kthread+0x188/0x194
   ret_from_fork+0x10/0x20

Let me know if you need more information or help in reproducing this issue.

> ---
> from RFC - https://lore.kernel.org/lkml/20211116194317.1430399-1-minchan@kernel.org/
>   * move kernfs_rwsem to kernfs_root - tejun@
>
>   fs/kernfs/dir.c        | 110 ++++++++++++++++++++++++-----------------
>   fs/kernfs/file.c       |   6 ++-
>   fs/kernfs/inode.c      |  22 ++++++---
>   fs/kernfs/mount.c      |  15 +++---
>   fs/kernfs/symlink.c    |   5 +-
>   include/linux/kernfs.h |   2 +
>   6 files changed, 97 insertions(+), 63 deletions(-)
>
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 8e0a1378a4b1..13cae0ccce74 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -17,7 +17,6 @@
>   
>   #include "kernfs-internal.h"
>   
> -DECLARE_RWSEM(kernfs_rwsem);
>   static DEFINE_SPINLOCK(kernfs_rename_lock);	/* kn->parent and ->name */
>   static char kernfs_pr_cont_buf[PATH_MAX];	/* protected by rename_lock */
>   static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
> @@ -26,7 +25,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock);	/* root->ino_idr */
>   
>   static bool kernfs_active(struct kernfs_node *kn)
>   {
> -	lockdep_assert_held(&kernfs_rwsem);
> +	lockdep_assert_held(&kernfs_root(kn)->kernfs_rwsem);
>   	return atomic_read(&kn->active) >= 0;
>   }
>   
> @@ -457,14 +456,15 @@ void kernfs_put_active(struct kernfs_node *kn)
>    * return after draining is complete.
>    */
>   static void kernfs_drain(struct kernfs_node *kn)
> -	__releases(&kernfs_rwsem) __acquires(&kernfs_rwsem)
> +	__releases(&kernfs_root(kn)->kernfs_rwsem)
> +	__acquires(&kernfs_root(kn)->kernfs_rwsem)
>   {
>   	struct kernfs_root *root = kernfs_root(kn);
>   
> -	lockdep_assert_held_write(&kernfs_rwsem);
> +	lockdep_assert_held_write(&root->kernfs_rwsem);
>   	WARN_ON_ONCE(kernfs_active(kn));
>   
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   
>   	if (kernfs_lockdep(kn)) {
>   		rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
> @@ -483,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn)
>   
>   	kernfs_drain_open_files(kn);
>   
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   }
>   
>   /**
> @@ -718,11 +718,12 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
>   int kernfs_add_one(struct kernfs_node *kn)
>   {
>   	struct kernfs_node *parent = kn->parent;
> +	struct kernfs_root *root = kernfs_root(parent);
>   	struct kernfs_iattrs *ps_iattr;
>   	bool has_ns;
>   	int ret;
>   
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   
>   	ret = -EINVAL;
>   	has_ns = kernfs_ns_enabled(parent);
> @@ -753,7 +754,7 @@ int kernfs_add_one(struct kernfs_node *kn)
>   		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
>   	}
>   
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   
>   	/*
>   	 * Activate the new node unless CREATE_DEACTIVATED is requested.
> @@ -767,7 +768,7 @@ int kernfs_add_one(struct kernfs_node *kn)
>   	return 0;
>   
>   out_unlock:
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   	return ret;
>   }
>   
> @@ -788,7 +789,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
>   	bool has_ns = kernfs_ns_enabled(parent);
>   	unsigned int hash;
>   
> -	lockdep_assert_held(&kernfs_rwsem);
> +	lockdep_assert_held(&kernfs_root(parent)->kernfs_rwsem);
>   
>   	if (has_ns != (bool)ns) {
>   		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
> @@ -820,7 +821,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
>   	size_t len;
>   	char *p, *name;
>   
> -	lockdep_assert_held_read(&kernfs_rwsem);
> +	lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
>   
>   	/* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
>   	spin_lock_irq(&kernfs_rename_lock);
> @@ -859,11 +860,12 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
>   					   const char *name, const void *ns)
>   {
>   	struct kernfs_node *kn;
> +	struct kernfs_root *root = kernfs_root(parent);
>   
> -	down_read(&kernfs_rwsem);
> +	down_read(&root->kernfs_rwsem);
>   	kn = kernfs_find_ns(parent, name, ns);
>   	kernfs_get(kn);
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   
>   	return kn;
>   }
> @@ -883,11 +885,12 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
>   					   const char *path, const void *ns)
>   {
>   	struct kernfs_node *kn;
> +	struct kernfs_root *root = kernfs_root(parent);
>   
> -	down_read(&kernfs_rwsem);
> +	down_read(&root->kernfs_rwsem);
>   	kn = kernfs_walk_ns(parent, path, ns);
>   	kernfs_get(kn);
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   
>   	return kn;
>   }
> @@ -912,6 +915,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
>   		return ERR_PTR(-ENOMEM);
>   
>   	idr_init(&root->ino_idr);
> +	init_rwsem(&root->kernfs_rwsem);
>   	INIT_LIST_HEAD(&root->supers);
>   
>   	/*
> @@ -1035,6 +1039,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
>   static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
>   {
>   	struct kernfs_node *kn;
> +	struct kernfs_root *root;
>   
>   	if (flags & LOOKUP_RCU)
>   		return -ECHILD;
> @@ -1046,18 +1051,19 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
>   		/* If the kernfs parent node has changed discard and
>   		 * proceed to ->lookup.
>   		 */
> -		down_read(&kernfs_rwsem);
>   		spin_lock(&dentry->d_lock);
>   		parent = kernfs_dentry_node(dentry->d_parent);
>   		if (parent) {
> +			spin_unlock(&dentry->d_lock);
> +			root = kernfs_root(parent);
> +			down_read(&root->kernfs_rwsem);
>   			if (kernfs_dir_changed(parent, dentry)) {
> -				spin_unlock(&dentry->d_lock);
> -				up_read(&kernfs_rwsem);
> +				up_read(&root->kernfs_rwsem);
>   				return 0;
>   			}
> -		}
> -		spin_unlock(&dentry->d_lock);
> -		up_read(&kernfs_rwsem);
> +			up_read(&root->kernfs_rwsem);
> +		} else
> +			spin_unlock(&dentry->d_lock);
>   
>   		/* The kernfs parent node hasn't changed, leave the
>   		 * dentry negative and return success.
> @@ -1066,7 +1072,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
>   	}
>   
>   	kn = kernfs_dentry_node(dentry);
> -	down_read(&kernfs_rwsem);
> +	root = kernfs_root(kn);
> +	down_read(&root->kernfs_rwsem);
>   
>   	/* The kernfs node has been deactivated */
>   	if (!kernfs_active(kn))
> @@ -1085,10 +1092,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
>   	    kernfs_info(dentry->d_sb)->ns != kn->ns)
>   		goto out_bad;
>   
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   	return 1;
>   out_bad:
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   	return 0;
>   }
>   
> @@ -1102,10 +1109,12 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
>   {
>   	struct kernfs_node *parent = dir->i_private;
>   	struct kernfs_node *kn;
> +	struct kernfs_root *root;
>   	struct inode *inode = NULL;
>   	const void *ns = NULL;
>   
> -	down_read(&kernfs_rwsem);
> +	root = kernfs_root(parent);
> +	down_read(&root->kernfs_rwsem);
>   	if (kernfs_ns_enabled(parent))
>   		ns = kernfs_info(dir->i_sb)->ns;
>   
> @@ -1116,7 +1125,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
>   		 * create a negative.
>   		 */
>   		if (!kernfs_active(kn)) {
> -			up_read(&kernfs_rwsem);
> +			up_read(&root->kernfs_rwsem);
>   			return NULL;
>   		}
>   		inode = kernfs_get_inode(dir->i_sb, kn);
> @@ -1131,7 +1140,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
>   	 */
>   	if (!IS_ERR(inode))
>   		kernfs_set_rev(parent, dentry);
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   
>   	/* instantiate and hash (possibly negative) dentry */
>   	return d_splice_alias(inode, dentry);
> @@ -1254,7 +1263,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
>   {
>   	struct rb_node *rbn;
>   
> -	lockdep_assert_held_write(&kernfs_rwsem);
> +	lockdep_assert_held_write(&kernfs_root(root)->kernfs_rwsem);
>   
>   	/* if first iteration, visit leftmost descendant which may be root */
>   	if (!pos)
> @@ -1289,8 +1298,9 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
>   void kernfs_activate(struct kernfs_node *kn)
>   {
>   	struct kernfs_node *pos;
> +	struct kernfs_root *root = kernfs_root(kn);
>   
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   
>   	pos = NULL;
>   	while ((pos = kernfs_next_descendant_post(pos, kn))) {
> @@ -1304,14 +1314,14 @@ void kernfs_activate(struct kernfs_node *kn)
>   		pos->flags |= KERNFS_ACTIVATED;
>   	}
>   
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   }
>   
>   static void __kernfs_remove(struct kernfs_node *kn)
>   {
>   	struct kernfs_node *pos;
>   
> -	lockdep_assert_held_write(&kernfs_rwsem);
> +	lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
>   
>   	/*
>   	 * Short-circuit if non-root @kn has already finished removal.
> @@ -1381,9 +1391,11 @@ static void __kernfs_remove(struct kernfs_node *kn)
>    */
>   void kernfs_remove(struct kernfs_node *kn)
>   {
> -	down_write(&kernfs_rwsem);
> +	struct kernfs_root *root = kernfs_root(kn);
> +
> +	down_write(&root->kernfs_rwsem);
>   	__kernfs_remove(kn);
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   }
>   
>   /**
> @@ -1469,8 +1481,9 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
>   bool kernfs_remove_self(struct kernfs_node *kn)
>   {
>   	bool ret;
> +	struct kernfs_root *root = kernfs_root(kn);
>   
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   	kernfs_break_active_protection(kn);
>   
>   	/*
> @@ -1498,9 +1511,9 @@ bool kernfs_remove_self(struct kernfs_node *kn)
>   			    atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
>   				break;
>   
> -			up_write(&kernfs_rwsem);
> +			up_write(&root->kernfs_rwsem);
>   			schedule();
> -			down_write(&kernfs_rwsem);
> +			down_write(&root->kernfs_rwsem);
>   		}
>   		finish_wait(waitq, &wait);
>   		WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
> @@ -1513,7 +1526,7 @@ bool kernfs_remove_self(struct kernfs_node *kn)
>   	 */
>   	kernfs_unbreak_active_protection(kn);
>   
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   	return ret;
>   }
>   
> @@ -1530,6 +1543,7 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
>   			     const void *ns)
>   {
>   	struct kernfs_node *kn;
> +	struct kernfs_root *root;
>   
>   	if (!parent) {
>   		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
> @@ -1537,13 +1551,14 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
>   		return -ENOENT;
>   	}
>   
> -	down_write(&kernfs_rwsem);
> +	root = kernfs_root(parent);
> +	down_write(&root->kernfs_rwsem);
>   
>   	kn = kernfs_find_ns(parent, name, ns);
>   	if (kn)
>   		__kernfs_remove(kn);
>   
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   
>   	if (kn)
>   		return 0;
> @@ -1562,6 +1577,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
>   		     const char *new_name, const void *new_ns)
>   {
>   	struct kernfs_node *old_parent;
> +	struct kernfs_root *root;
>   	const char *old_name = NULL;
>   	int error;
>   
> @@ -1569,7 +1585,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
>   	if (!kn->parent)
>   		return -EINVAL;
>   
> -	down_write(&kernfs_rwsem);
> +	root = kernfs_root(kn);
> +	down_write(&root->kernfs_rwsem);
>   
>   	error = -ENOENT;
>   	if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
> @@ -1623,7 +1640,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
>   
>   	error = 0;
>    out:
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   	return error;
>   }
>   
> @@ -1694,11 +1711,14 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
>   	struct dentry *dentry = file->f_path.dentry;
>   	struct kernfs_node *parent = kernfs_dentry_node(dentry);
>   	struct kernfs_node *pos = file->private_data;
> +	struct kernfs_root *root;
>   	const void *ns = NULL;
>   
>   	if (!dir_emit_dots(file, ctx))
>   		return 0;
> -	down_read(&kernfs_rwsem);
> +
> +	root = kernfs_root(parent);
> +	down_read(&root->kernfs_rwsem);
>   
>   	if (kernfs_ns_enabled(parent))
>   		ns = kernfs_info(dentry->d_sb)->ns;
> @@ -1715,12 +1735,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
>   		file->private_data = pos;
>   		kernfs_get(pos);
>   
> -		up_read(&kernfs_rwsem);
> +		up_read(&root->kernfs_rwsem);
>   		if (!dir_emit(ctx, name, len, ino, type))
>   			return 0;
> -		down_read(&kernfs_rwsem);
> +		down_read(&root->kernfs_rwsem);
>   	}
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   	file->private_data = NULL;
>   	ctx->pos = INT_MAX;
>   	return 0;
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index 60e2a86c535e..9414a7a60a9f 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -847,6 +847,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
>   {
>   	struct kernfs_node *kn;
>   	struct kernfs_super_info *info;
> +	struct kernfs_root *root;
>   repeat:
>   	/* pop one off the notify_list */
>   	spin_lock_irq(&kernfs_notify_lock);
> @@ -859,8 +860,9 @@ static void kernfs_notify_workfn(struct work_struct *work)
>   	kn->attr.notify_next = NULL;
>   	spin_unlock_irq(&kernfs_notify_lock);
>   
> +	root = kernfs_root(kn);
>   	/* kick fsnotify */
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   
>   	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
>   		struct kernfs_node *parent;
> @@ -898,7 +900,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
>   		iput(inode);
>   	}
>   
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   	kernfs_put(kn);
>   	goto repeat;
>   }
> diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
> index c0eae1725435..3d783d80f5da 100644
> --- a/fs/kernfs/inode.c
> +++ b/fs/kernfs/inode.c
> @@ -99,10 +99,11 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
>   int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
>   {
>   	int ret;
> +	struct kernfs_root *root = kernfs_root(kn);
>   
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   	ret = __kernfs_setattr(kn, iattr);
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   	return ret;
>   }
>   
> @@ -111,12 +112,14 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
>   {
>   	struct inode *inode = d_inode(dentry);
>   	struct kernfs_node *kn = inode->i_private;
> +	struct kernfs_root *root;
>   	int error;
>   
>   	if (!kn)
>   		return -EINVAL;
>   
> -	down_write(&kernfs_rwsem);
> +	root = kernfs_root(kn);
> +	down_write(&root->kernfs_rwsem);
>   	error = setattr_prepare(&init_user_ns, dentry, iattr);
>   	if (error)
>   		goto out;
> @@ -129,7 +132,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
>   	setattr_copy(&init_user_ns, inode, iattr);
>   
>   out:
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   	return error;
>   }
>   
> @@ -184,13 +187,14 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
>   {
>   	struct inode *inode = d_inode(path->dentry);
>   	struct kernfs_node *kn = inode->i_private;
> +	struct kernfs_root *root = kernfs_root(kn);
>   
> -	down_read(&kernfs_rwsem);
> +	down_read(&root->kernfs_rwsem);
>   	spin_lock(&inode->i_lock);
>   	kernfs_refresh_inode(kn, inode);
>   	generic_fillattr(&init_user_ns, inode, stat);
>   	spin_unlock(&inode->i_lock);
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   
>   	return 0;
>   }
> @@ -274,19 +278,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
>   			  struct inode *inode, int mask)
>   {
>   	struct kernfs_node *kn;
> +	struct kernfs_root *root;
>   	int ret;
>   
>   	if (mask & MAY_NOT_BLOCK)
>   		return -ECHILD;
>   
>   	kn = inode->i_private;
> +	root = kernfs_root(kn);
>   
> -	down_read(&kernfs_rwsem);
> +	down_read(&root->kernfs_rwsem);
>   	spin_lock(&inode->i_lock);
>   	kernfs_refresh_inode(kn, inode);
>   	ret = generic_permission(&init_user_ns, inode, mask);
>   	spin_unlock(&inode->i_lock);
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   
>   	return ret;
>   }
> diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
> index f2f909d09f52..cfa79715fc1a 100644
> --- a/fs/kernfs/mount.c
> +++ b/fs/kernfs/mount.c
> @@ -236,6 +236,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
>   static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc)
>   {
>   	struct kernfs_super_info *info = kernfs_info(sb);
> +	struct kernfs_root *kf_root = kfc->root;
>   	struct inode *inode;
>   	struct dentry *root;
>   
> @@ -255,9 +256,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
>   	sb->s_shrink.seeks = 0;
>   
>   	/* get root inode, initialize and unlock it */
> -	down_read(&kernfs_rwsem);
> +	down_read(&kf_root->kernfs_rwsem);
>   	inode = kernfs_get_inode(sb, info->root->kn);
> -	up_read(&kernfs_rwsem);
> +	up_read(&kf_root->kernfs_rwsem);
>   	if (!inode) {
>   		pr_debug("kernfs: could not get root inode\n");
>   		return -ENOMEM;
> @@ -334,6 +335,7 @@ int kernfs_get_tree(struct fs_context *fc)
>   
>   	if (!sb->s_root) {
>   		struct kernfs_super_info *info = kernfs_info(sb);
> +		struct kernfs_root *root = kfc->root;
>   
>   		kfc->new_sb_created = true;
>   
> @@ -344,9 +346,9 @@ int kernfs_get_tree(struct fs_context *fc)
>   		}
>   		sb->s_flags |= SB_ACTIVE;
>   
> -		down_write(&kernfs_rwsem);
> +		down_write(&root->kernfs_rwsem);
>   		list_add(&info->node, &info->root->supers);
> -		up_write(&kernfs_rwsem);
> +		up_write(&root->kernfs_rwsem);
>   	}
>   
>   	fc->root = dget(sb->s_root);
> @@ -371,10 +373,11 @@ void kernfs_free_fs_context(struct fs_context *fc)
>   void kernfs_kill_sb(struct super_block *sb)
>   {
>   	struct kernfs_super_info *info = kernfs_info(sb);
> +	struct kernfs_root *root = info->root;
>   
> -	down_write(&kernfs_rwsem);
> +	down_write(&root->kernfs_rwsem);
>   	list_del(&info->node);
> -	up_write(&kernfs_rwsem);
> +	up_write(&root->kernfs_rwsem);
>   
>   	/*
>   	 * Remove the superblock from fs_supers/s_instances
> diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
> index 19a6c71c6ff5..0ab13824822f 100644
> --- a/fs/kernfs/symlink.c
> +++ b/fs/kernfs/symlink.c
> @@ -113,11 +113,12 @@ static int kernfs_getlink(struct inode *inode, char *path)
>   	struct kernfs_node *kn = inode->i_private;
>   	struct kernfs_node *parent = kn->parent;
>   	struct kernfs_node *target = kn->symlink.target_kn;
> +	struct kernfs_root *root = kernfs_root(parent);
>   	int error;
>   
> -	down_read(&kernfs_rwsem);
> +	down_read(&root->kernfs_rwsem);
>   	error = kernfs_get_target_path(parent, target, path);
> -	up_read(&kernfs_rwsem);
> +	up_read(&root->kernfs_rwsem);
>   
>   	return error;
>   }
> diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
> index 3ccce6f24548..9f650986a81b 100644
> --- a/include/linux/kernfs.h
> +++ b/include/linux/kernfs.h
> @@ -16,6 +16,7 @@
>   #include <linux/atomic.h>
>   #include <linux/uidgid.h>
>   #include <linux/wait.h>
> +#include <linux/rwsem.h>
>   
>   struct file;
>   struct dentry;
> @@ -197,6 +198,7 @@ struct kernfs_root {
>   	struct list_head	supers;
>   
>   	wait_queue_head_t	deactivate_waitq;
> +	struct rw_semaphore	kernfs_rwsem;
>   };
>   
>   struct kernfs_open_file {

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
  2021-11-26 11:54   ` Marek Szyprowski
@ 2021-11-29 14:23     ` Nguyen Dinh Phi
  2021-11-29 16:58     ` Minchan Kim
  1 sibling, 0 replies; 8+ messages in thread
From: Nguyen Dinh Phi @ 2021-11-29 14:23 UTC (permalink / raw)
  To: m.szyprowski; +Cc: gregkh, linux-kernel, minchan, tj

>This patch landed recently in linux-next (20211126) as commit 
>393c3714081a ("kernfs: switch global kernfs_rwsem lock to per-fs lock"). 
>In my tests I've found that it causes the following warning during the 
>system reboot:
>
>  =========================
>  WARNING: held lock freed!
>  5.16.0-rc2+ #10984 Not tainted
>  -------------------------
>  kworker/1:0/18 is freeing memory ffff00004034e200-ffff00004034e3ff, 
>with a lock still held there!
>  ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at: 
>__kernfs_remove+0x310/0x37c
>  3 locks held by kworker/1:0/18:
>   #0: ffff000040107938 ((wq_completion)cgroup_destroy){+.+.}-{0:0}, at: 
>process_one_work+0x1f0/0x6f0
>   #1: ffff80000b55bdc0 
>((work_completion)(&(&css->destroy_rwork)->work)){+.+.}-{0:0}, at: 
>process_one_work+0x1f0/0x6f0
>   #2: ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at: 
>__kernfs_remove+0x310/0x37c
>
>  stack backtrace:
>  CPU: 1 PID: 18 Comm: kworker/1:0 Not tainted 5.16.0-rc2+ #10984
>  Hardware name: Raspberry Pi 4 Model B (DT)
>  Workqueue: cgroup_destroy css_free_rwork_fn
>  Call trace:
>   dump_backtrace+0x0/0x1ac
>   show_stack+0x18/0x24
>   dump_stack_lvl+0x8c/0xb8
>   dump_stack+0x18/0x34
>   debug_check_no_locks_freed+0x124/0x140
>   kfree+0xf0/0x3a4
>   kernfs_put+0x1f8/0x224
>   __kernfs_remove+0x1b8/0x37c
>   kernfs_destroy_root+0x38/0x50
>   css_free_rwork_fn+0x288/0x3d4
>   process_one_work+0x288/0x6f0
>   worker_thread+0x74/0x470
>   kthread+0x188/0x194
>   ret_from_fork+0x10/0x20
>
>Let me know if you need more information or help in reproducing this issue.

This patch has a problem: kernfs_remove() calls __kernfs_remove(kn), which
will free the @root that @kn belongs to. That means root->kernfs_rwsem is
freed too, while it is still held by the earlier call to
down_write(&root->kernfs_rwsem) around line number 100.

BR

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
  2021-11-26 11:54   ` Marek Szyprowski
  2021-11-29 14:23     ` Nguyen Dinh Phi
@ 2021-11-29 16:58     ` Minchan Kim
  2021-11-30  8:00       ` Marek Szyprowski
  1 sibling, 1 reply; 8+ messages in thread
From: Minchan Kim @ 2021-11-29 16:58 UTC (permalink / raw)
  To: Marek Szyprowski; +Cc: Greg Kroah-Hartman, Tejun Heo, LKML

On Fri, Nov 26, 2021 at 12:54:45PM +0100, Marek Szyprowski wrote:
> Hi,
> 
> On 19.11.2021 00:00, Minchan Kim wrote:
> > The kernfs implementation has big lock granularity(kernfs_rwsem) so
> > every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
> > lock. It makes trouble for some cases to wait the global lock
> > for a long time even though they are totally independent contexts
> > each other.
> >
> > A general example is process A goes under direct reclaim with holding
> > the lock when it accessed the file in sysfs and process B is waiting
> > the lock with exclusive mode and then process C is waiting the lock
> > until process B could finish the job after it gets the lock from
> > process A.
> >
> > This patch switches the global kernfs_rwsem to per-fs lock, which
> > put the rwsem into kernfs_root.
> >
> > Suggested-by: Tejun Heo <tj@kernel.org>
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> 
> This patch landed recently in linux-next (20211126) as commit 
> 393c3714081a ("kernfs: switch global kernfs_rwsem lock to per-fs lock"). 
> In my tests I've found that it causes the following warning during the 
> system reboot:
> 
>   =========================
>   WARNING: held lock freed!
>   5.16.0-rc2+ #10984 Not tainted
>   -------------------------
>   kworker/1:0/18 is freeing memory ffff00004034e200-ffff00004034e3ff, 
> with a lock still held there!
>   ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at: 
> __kernfs_remove+0x310/0x37c
>   3 locks held by kworker/1:0/18:
>    #0: ffff000040107938 ((wq_completion)cgroup_destroy){+.+.}-{0:0}, at: 
> process_one_work+0x1f0/0x6f0
>    #1: ffff80000b55bdc0 
> ((work_completion)(&(&css->destroy_rwork)->work)){+.+.}-{0:0}, at: 
> process_one_work+0x1f0/0x6f0
>    #2: ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at: 
> __kernfs_remove+0x310/0x37c
> 
>   stack backtrace:
>   CPU: 1 PID: 18 Comm: kworker/1:0 Not tainted 5.16.0-rc2+ #10984
>   Hardware name: Raspberry Pi 4 Model B (DT)
>   Workqueue: cgroup_destroy css_free_rwork_fn
>   Call trace:
>    dump_backtrace+0x0/0x1ac
>    show_stack+0x18/0x24
>    dump_stack_lvl+0x8c/0xb8
>    dump_stack+0x18/0x34
>    debug_check_no_locks_freed+0x124/0x140
>    kfree+0xf0/0x3a4
>    kernfs_put+0x1f8/0x224
>    __kernfs_remove+0x1b8/0x37c
>    kernfs_destroy_root+0x38/0x50
>    css_free_rwork_fn+0x288/0x3d4
>    process_one_work+0x288/0x6f0
>    worker_thread+0x74/0x470
>    kthread+0x188/0x194
>    ret_from_fork+0x10/0x20
> 
> Let me know if you need more information or help in reproducing this issue.

Hi Marek,

Thanks for the report. Could you try this one? 

From 68741aa598ffd4a407d6d5f6b58bc7e7281e6f81 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Mon, 29 Nov 2021 08:40:15 -0800
Subject: [PATCH] kernfs: prevent early freeing of root node

Marek reported the warning below.

  =========================
  WARNING: held lock freed!
  5.16.0-rc2+ #10984 Not tainted
  -------------------------
  kworker/1:0/18 is freeing memory ffff00004034e200-ffff00004034e3ff,
with a lock still held there!
  ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at:
__kernfs_remove+0x310/0x37c
  3 locks held by kworker/1:0/18:
   #0: ffff000040107938 ((wq_completion)cgroup_destroy){+.+.}-{0:0}, at:
process_one_work+0x1f0/0x6f0
   #1: ffff80000b55bdc0
((work_completion)(&(&css->destroy_rwork)->work)){+.+.}-{0:0}, at:
process_one_work+0x1f0/0x6f0
   #2: ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at:
__kernfs_remove+0x310/0x37c

  stack backtrace:
  CPU: 1 PID: 18 Comm: kworker/1:0 Not tainted 5.16.0-rc2+ #10984
  Hardware name: Raspberry Pi 4 Model B (DT)
  Workqueue: cgroup_destroy css_free_rwork_fn
  Call trace:
   dump_backtrace+0x0/0x1ac
   show_stack+0x18/0x24
   dump_stack_lvl+0x8c/0xb8
   dump_stack+0x18/0x34
   debug_check_no_locks_freed+0x124/0x140
   kfree+0xf0/0x3a4
   kernfs_put+0x1f8/0x224
   __kernfs_remove+0x1b8/0x37c
   kernfs_destroy_root+0x38/0x50
   css_free_rwork_fn+0x288/0x3d4
   process_one_work+0x288/0x6f0
   worker_thread+0x74/0x470
   kthread+0x188/0x194
   ret_from_fork+0x10/0x20

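Since the kernfs_rwsem lock now lives in kernfs_root, the root must not
be freed while the lock is still held during teardown of the root node.
Thus, take a reference on the root node across kernfs_remove() so that
the root is freed only after the lock has been released.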

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 fs/kernfs/dir.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 13cae0ccce74..e6d9772ddb4c 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -961,7 +961,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
  */
 void kernfs_destroy_root(struct kernfs_root *root)
 {
-	kernfs_remove(root->kn);	/* will also free @root */
+	/*
+	 *  kernfs_remove holds kernfs_rwsem from the root so the root
+	 *  shouldn't be freed during the operation.
+	 */
+	kernfs_get(root->kn);
+	kernfs_remove(root->kn);
+	kernfs_put(root->kn); /* will also free @root */
 }
 
 /**
-- 
2.34.0.rc2.393.gf8c9666880-goog


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock
  2021-11-29 16:58     ` Minchan Kim
@ 2021-11-30  8:00       ` Marek Szyprowski
  0 siblings, 0 replies; 8+ messages in thread
From: Marek Szyprowski @ 2021-11-30  8:00 UTC (permalink / raw)
  To: Minchan Kim; +Cc: Greg Kroah-Hartman, Tejun Heo, LKML


On 29.11.2021 17:58, Minchan Kim wrote:
> On Fri, Nov 26, 2021 at 12:54:45PM +0100, Marek Szyprowski wrote:
>> Hi,
>>
>> On 19.11.2021 00:00, Minchan Kim wrote:
>>> The kernfs implementation has big lock granularity(kernfs_rwsem) so
>>> every kernfs-based(e.g., sysfs, cgroup) fs are able to compete the
>>> lock. It makes trouble for some cases to wait the global lock
>>> for a long time even though they are totally independent contexts
>>> each other.
>>>
>>> A general example is process A goes under direct reclaim with holding
>>> the lock when it accessed the file in sysfs and process B is waiting
>>> the lock with exclusive mode and then process C is waiting the lock
>>> until process B could finish the job after it gets the lock from
>>> process A.
>>>
>>> This patch switches the global kernfs_rwsem to per-fs lock, which
>>> put the rwsem into kernfs_root.
>>>
>>> Suggested-by: Tejun Heo <tj@kernel.org>
>>> Signed-off-by: Minchan Kim <minchan@kernel.org>
>> This patch landed recently in linux-next (20211126) as commit
>> 393c3714081a ("kernfs: switch global kernfs_rwsem lock to per-fs lock").
>> In my tests I've found that it causes the following warning during the
>> system reboot:
>>
>>    =========================
>>    WARNING: held lock freed!
>>    5.16.0-rc2+ #10984 Not tainted
>>    -------------------------
>>    kworker/1:0/18 is freeing memory ffff00004034e200-ffff00004034e3ff,
>> with a lock still held there!
>>    ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at:
>> __kernfs_remove+0x310/0x37c
>>    3 locks held by kworker/1:0/18:
>>     #0: ffff000040107938 ((wq_completion)cgroup_destroy){+.+.}-{0:0}, at:
>> process_one_work+0x1f0/0x6f0
>>     #1: ffff80000b55bdc0
>> ((work_completion)(&(&css->destroy_rwork)->work)){+.+.}-{0:0}, at:
>> process_one_work+0x1f0/0x6f0
>>     #2: ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at:
>> __kernfs_remove+0x310/0x37c
>>
>>    stack backtrace:
>>    CPU: 1 PID: 18 Comm: kworker/1:0 Not tainted 5.16.0-rc2+ #10984
>>    Hardware name: Raspberry Pi 4 Model B (DT)
>>    Workqueue: cgroup_destroy css_free_rwork_fn
>>    Call trace:
>>     dump_backtrace+0x0/0x1ac
>>     show_stack+0x18/0x24
>>     dump_stack_lvl+0x8c/0xb8
>>     dump_stack+0x18/0x34
>>     debug_check_no_locks_freed+0x124/0x140
>>     kfree+0xf0/0x3a4
>>     kernfs_put+0x1f8/0x224
>>     __kernfs_remove+0x1b8/0x37c
>>     kernfs_destroy_root+0x38/0x50
>>     css_free_rwork_fn+0x288/0x3d4
>>     process_one_work+0x288/0x6f0
>>     worker_thread+0x74/0x470
>>     kthread+0x188/0x194
>>     ret_from_fork+0x10/0x20
>>
>> Let me know if you need more information or help in reproducing this issue.
> Hi Marek,
>
> Thanks for the report. Could you try this one?
>
> >From 68741aa598ffd4a407d6d5f6b58bc7e7281e6f81 Mon Sep 17 00:00:00 2001
> From: Minchan Kim <minchan@kernel.org>
> Date: Mon, 29 Nov 2021 08:40:15 -0800
> Subject: [PATCH] kernfs: prevent early freeing of root node
>
> Marek reported the warning below.
>
>    =========================
>    WARNING: held lock freed!
>    5.16.0-rc2+ #10984 Not tainted
>    -------------------------
>    kworker/1:0/18 is freeing memory ffff00004034e200-ffff00004034e3ff,
> with a lock still held there!
>    ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at:
> __kernfs_remove+0x310/0x37c
>    3 locks held by kworker/1:0/18:
>     #0: ffff000040107938 ((wq_completion)cgroup_destroy){+.+.}-{0:0}, at:
> process_one_work+0x1f0/0x6f0
>     #1: ffff80000b55bdc0
> ((work_completion)(&(&css->destroy_rwork)->work)){+.+.}-{0:0}, at:
> process_one_work+0x1f0/0x6f0
>     #2: ffff00004034e348 (&root->kernfs_rwsem){++++}-{3:3}, at:
> __kernfs_remove+0x310/0x37c
>
>    stack backtrace:
>    CPU: 1 PID: 18 Comm: kworker/1:0 Not tainted 5.16.0-rc2+ #10984
>    Hardware name: Raspberry Pi 4 Model B (DT)
>    Workqueue: cgroup_destroy css_free_rwork_fn
>    Call trace:
>     dump_backtrace+0x0/0x1ac
>     show_stack+0x18/0x24
>     dump_stack_lvl+0x8c/0xb8
>     dump_stack+0x18/0x34
>     debug_check_no_locks_freed+0x124/0x140
>     kfree+0xf0/0x3a4
>     kernfs_put+0x1f8/0x224
>     __kernfs_remove+0x1b8/0x37c
>     kernfs_destroy_root+0x38/0x50
>     css_free_rwork_fn+0x288/0x3d4
>     process_one_work+0x288/0x6f0
>     worker_thread+0x74/0x470
>     kthread+0x188/0x194
>     ret_from_fork+0x10/0x20
>
> Since kernfs moves the kernfs_rwsem lock into root, it couldn't hold
> the lock when the root node is tearing down. Thus, get the refcount
> of root node.
>
> Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
> Signed-off-by: Minchan Kim <minchan@kernel.org>

Right, this fixes the issue.

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>

> ---
>   fs/kernfs/dir.c | 8 +++++++-
>   1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> index 13cae0ccce74..e6d9772ddb4c 100644
> --- a/fs/kernfs/dir.c
> +++ b/fs/kernfs/dir.c
> @@ -961,7 +961,13 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
>    */
>   void kernfs_destroy_root(struct kernfs_root *root)
>   {
> -	kernfs_remove(root->kn);	/* will also free @root */
> +	/*
> +	 *  kernfs_remove holds kernfs_rwsem from the root so the root
> +	 *  shouldn't be freed during the operation.
> +	 */
> +	kernfs_get(root->kn);
> +	kernfs_remove(root->kn);
> +	kernfs_put(root->kn); /* will also free @root */
>   }
>   
>   /**

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2021-11-30  8:00 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-18 23:00 [PATCH] kernfs: switch global kernfs_rwsem lock to per-fs lock Minchan Kim
2021-11-18 23:03 ` Tejun Heo
2021-11-22 17:39   ` Minchan Kim
2021-11-23  6:44     ` Greg Kroah-Hartman
     [not found] ` <CGME20211126115446eucas1p2e377cf7718c6da6bfe40a016bc0191a8@eucas1p2.samsung.com>
2021-11-26 11:54   ` Marek Szyprowski
2021-11-29 14:23     ` Nguyen Dinh Phi
2021-11-29 16:58     ` Minchan Kim
2021-11-30  8:00       ` Marek Szyprowski

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).