All of lore.kernel.org
 help / color / mirror / Atom feed
From: Imran Khan <imran.f.khan@oracle.com>
To: tj@kernel.org, gregkh@linuxfoundation.org
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH v3 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem.
Date: Thu, 13 Jan 2022 21:42:59 +1100	[thread overview]
Message-ID: <20220113104259.1584491-3-imran.f.khan@oracle.com> (raw)
In-Reply-To: <20220113104259.1584491-1-imran.f.khan@oracle.com>

Right now a global per file system based rwsem (kernfs_rwsem)
synchronizes multiple kernfs operations. On a large system with
few hundred CPUs and few hundred applications simultaenously trying
to access sysfs, this results in multiple sys_open(s) contending on
kernfs_rwsem via kernfs_iop_permission and kernfs_dop_revalidate.

-   21.42%    21.34%  showgids   [kernel.kallsyms]     [k] up_read
     21.34% __libc_start_main
        __GI___libc_open
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        sys_open
        do_sys_open
        do_filp_open
      - path_openat
         - 20.05% link_path_walk
            - 9.76% walk_component
                 lookup_fast
               - d_revalidate.part.24
                  - 9.75% kernfs_dop_revalidate
                       up_read
            - 9.46% inode_permission
               - __inode_permission
                  - 9.46% kernfs_iop_permission
                       up_read
            - 0.83% kernfs_iop_get_link
                 up_read
         - 0.80% lookup_fast
              d_revalidate.part.24
              kernfs_dop_revalidate
              up_read

-   21.31%    21.21%  showgids   [kernel.kallsyms]    [k] down_read
     21.21% __libc_start_main
        __GI___libc_open
        entry_SYSCALL_64_after_hwframe
        do_syscall_64
        sys_open
        do_sys_open
        do_filp_open
      - path_openat
         - 19.78% link_path_walk
            - 10.62% inode_permission
               - __inode_permission
                  - 10.62% kernfs_iop_permission
                       down_read
            - 8.45% walk_component
                 lookup_fast
               - d_revalidate.part.24
                  - 8.45% kernfs_dop_revalidate
                       down_read
            - 0.71% kernfs_iop_get_link
                 down_read
         - 0.72% lookup_fast
            - d_revalidate.part.24
               - 0.72% kernfs_dop_revalidate
                    down_read
         - 0.71% may_open
              inode_permission
              __inode_permission
              kernfs_iop_permission
              down_read

Since permission is specific to a kernfs_node we can use a hashed
lock to access/modify permission. Also use kernfs reference counting
to ensure we are accessing/modifying permissions for an existing
kernfs_node object.

Using this change brings down the above mentioned down_read/up_read
numbers to ~8%, thus indicating that contention around kernfs_rwsem
has reduced to about 1/3rd of earlier value.

Signed-off-by: Imran Khan <imran.f.khan@oracle.com>
---
 fs/kernfs/dir.c             |  8 ++++++++
 fs/kernfs/inode.c           | 35 +++++++++++++++++++++++++----------
 fs/kernfs/kernfs-internal.h | 18 ++++++++++++++++--
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index e6d9772ddb4c..37daaffda718 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -720,6 +720,7 @@ int kernfs_add_one(struct kernfs_node *kn)
 	struct kernfs_node *parent = kn->parent;
 	struct kernfs_root *root = kernfs_root(parent);
 	struct kernfs_iattrs *ps_iattr;
+	struct rw_semaphore *rwsem = NULL;
 	bool has_ns;
 	int ret;
 
@@ -748,11 +749,14 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	/* Update timestamps on the parent */
+	rwsem = iattr_rwsem_ptr(parent);
+	down_write(rwsem);
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
 		ktime_get_real_ts64(&ps_iattr->ia_ctime);
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
+	up_write(rwsem);
 
 	up_write(&root->kernfs_rwsem);
 
@@ -1326,6 +1330,7 @@ void kernfs_activate(struct kernfs_node *kn)
 static void __kernfs_remove(struct kernfs_node *kn)
 {
 	struct kernfs_node *pos;
+	struct rw_semaphore *rwsem;
 
 	lockdep_assert_held_write(&kernfs_root(kn)->kernfs_rwsem);
 
@@ -1378,8 +1383,11 @@ static void __kernfs_remove(struct kernfs_node *kn)
 
 			/* update timestamps on the parent */
 			if (ps_iattr) {
+				rwsem = iattr_rwsem_ptr(pos->parent);
+				down_write(rwsem);
 				ktime_get_real_ts64(&ps_iattr->ia_ctime);
 				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
+				up_write(rwsem);
 			}
 
 			kernfs_put(pos);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..7a55ad440bf4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -99,11 +99,15 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
 	int ret;
-	struct kernfs_root *root = kernfs_root(kn);
+	struct rw_semaphore *rwsem = NULL;
 
-	down_write(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_write(rwsem);
 	ret = __kernfs_setattr(kn, iattr);
-	up_write(&root->kernfs_rwsem);
+	up_write(rwsem);
+	kernfs_put(kn);
+
 	return ret;
 }
 
@@ -112,6 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 {
 	struct inode *inode = d_inode(dentry);
 	struct kernfs_node *kn = inode->i_private;
+	struct rw_semaphore *rwsem = NULL;
 	struct kernfs_root *root;
 	int error;
 
@@ -119,7 +124,9 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		return -EINVAL;
 
 	root = kernfs_root(kn);
-	down_write(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_write(rwsem);
 	error = setattr_prepare(&init_user_ns, dentry, iattr);
 	if (error)
 		goto out;
@@ -132,7 +139,8 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 	setattr_copy(&init_user_ns, inode, iattr);
 
 out:
-	up_write(&root->kernfs_rwsem);
+	up_write(rwsem);
+	kernfs_put(kn);
 	return error;
 }
 
@@ -187,14 +195,17 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
 {
 	struct inode *inode = d_inode(path->dentry);
 	struct kernfs_node *kn = inode->i_private;
-	struct kernfs_root *root = kernfs_root(kn);
+	struct rw_semaphore *rwsem = NULL;
 
-	down_read(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_read(rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	generic_fillattr(&init_user_ns, inode, stat);
 	spin_unlock(&inode->i_lock);
-	up_read(&root->kernfs_rwsem);
+	up_read(rwsem);
+	kernfs_put(kn);
 
 	return 0;
 }
@@ -279,6 +290,7 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 {
 	struct kernfs_node *kn;
 	struct kernfs_root *root;
+	struct rw_semaphore *rwsem = NULL;
 	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
@@ -287,12 +299,15 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
 	kn = inode->i_private;
 	root = kernfs_root(kn);
 
-	down_read(&root->kernfs_rwsem);
+	kernfs_get(kn);
+	rwsem = iattr_rwsem_ptr(kn);
+	down_read(rwsem);
 	spin_lock(&inode->i_lock);
 	kernfs_refresh_inode(kn, inode);
 	ret = generic_permission(&init_user_ns, inode, mask);
 	spin_unlock(&inode->i_lock);
-	up_read(&root->kernfs_rwsem);
+	up_read(rwsem);
+	kernfs_put(kn);
 
 	return ret;
 }
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 4bdcf7a71845..f1977d72369a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -39,17 +39,23 @@ struct kernfs_open_file_mutex {
 	struct mutex lock;
 } ____cacheline_aligned_in_smp;
 
+struct kernfs_iattr_rwsem {
+	struct rw_semaphore rwsem;
+} ____cacheline_aligned_in_smp;
+
 /*
  * There's one kernfs_open_file for each open file and one kernfs_open_node
  * for each kernfs_node with one or more open files.
  *
  * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
- * protected by open_node_locks[i].
+ * protected by open_node_locks[i].lock.
  *
  * filp->private_data points to seq_file whose ->private points to
  * kernfs_open_file.  kernfs_open_files are chained at
- * kernfs_open_node->files, which is protected by open_file_mutex[i].
+ * kernfs_open_node->files, which is protected by open_file_mutex[i].lock.
  *
+ * kernfs_node->iattr points to kernfs_node's attributes  and is
+ * protected by iattr_rwsem[i].rwsem
  * To reduce possible contention in sysfs access, arising due to single
  * locks, use an array of locks and use kernfs_node object address as
  * hash keys to get the index of these locks.
@@ -58,6 +64,7 @@ struct kernfs_open_file_mutex {
 struct kernfs_global_locks {
 	struct kernfs_open_node_lock open_node_locks[NR_KERNFS_LOCKS];
 	struct kernfs_open_file_mutex open_file_mutex[NR_KERNFS_LOCKS];
+	struct kernfs_iattr_rwsem iattr_rwsem[NR_KERNFS_LOCKS];
 };
 
 static struct kernfs_global_locks kernfs_global_locks;
@@ -76,6 +83,13 @@ static inline struct mutex *open_file_mutex_ptr(struct kernfs_node *kn)
 	return &kernfs_global_locks.open_file_mutex[index].lock;
 }
 
+static inline struct rw_semaphore *iattr_rwsem_ptr(struct kernfs_node *kn)
+{
+	int index = hash_ptr(kn, NR_KERNFS_LOCK_BITS);
+
+	return &kernfs_global_locks.iattr_rwsem[index].rwsem;
+}
+
 struct kernfs_iattrs {
 	kuid_t			ia_uid;
 	kgid_t			ia_gid;
-- 
2.30.2


  parent reply	other threads:[~2022-01-13 10:43 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-13 10:42 [PATCH v3 0/2] kernfs: use hashed mutex and spinlock in place of global ones Imran Khan
2022-01-13 10:42 ` [PATCH v3 1/2] " Imran Khan
2022-01-13 15:08   ` kernel test robot
2022-01-13 16:37   ` Tejun Heo
2022-01-17 18:54   ` kernel test robot
2022-01-13 10:42 ` Imran Khan [this message]
2022-01-13 10:58   ` [PATCH v3 2/2] kernfs: Reduce contention around global per-fs kernfs_rwsem Greg KH
2022-01-13 16:42   ` Tejun Heo
2022-01-14 17:08     ` Imran Khan
2022-01-14 17:44       ` Tejun Heo
2022-02-02 15:10     ` Imran Khan
2022-01-25  2:55   ` [kernfs] 8652224976: WARNING:at_kernel/locking/rwsem.c:#up_write kernel test robot
2022-01-25  2:55     ` kernel test robot
2022-01-13 10:57 ` [PATCH v3 0/2] kernfs: use hashed mutex and spinlock in place of global ones Greg KH

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220113104259.1584491-3-imran.f.khan@oracle.com \
    --to=imran.f.khan@oracle.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.