All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stephen Brennan <stephen.s.brennan@oracle.com>
To: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Stephen Brennan <stephen.s.brennan@oracle.com>,
	James Morris <jmorris@namei.org>,
	"Serge E. Hallyn" <serge@hallyn.com>,
	linux-security-module@vger.kernel.org,
	Paul Moore <paul@paul-moore.com>,
	Stephen Smalley <stephen.smalley.work@gmail.com>,
	Eric Paris <eparis@parisplace.org>,
	selinux@vger.kernel.org, Casey Schaufler <casey@schaufler-ca.com>,
	Eric Biederman <ebiederm@xmission.com>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Matthew Wilcox <willy@infradead.org>
Subject: [PATCH] proc: Allow pid_revalidate() during LOOKUP_RCU
Date: Mon, 30 Nov 2020 12:06:19 -0800	[thread overview]
Message-ID: <20201130200619.84819-1-stephen.s.brennan@oracle.com> (raw)

The pid_revalidate() function requires dropping from RCU into REF lookup
mode. When many threads are resolving paths within /proc in parallel,
this can result in heavy spinlock contention as each thread tries to
grab a reference to the /proc dentry (and drop it shortly thereafter).

Allow the pid_revalidate() function to execute under LOOKUP_RCU. When
updates must be made to the inode due to the owning task performing
setuid(), drop out of RCU and into REF mode.

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
---

I'd like to use this patch as an RFC on this approach for reducing spinlock
contention during many parallel path lookups in the /proc filesystem. The
contention can be triggered by, for example, running ~100 parallel instances of
"TZ=/etc/localtime ps -fe >/dev/null" on a 100CPU machine. The %sys utilization
in such a case reaches around 90%, and profiles show two code paths with high
utilization:

    walk_component
      lookup_fast
        unlazy_child
          legitimize_root
            __legitimize_path
              lockref_get_not_dead

    terminate_walk
      dput
        dput

By applying this patch, %sys utilization falls to around 60% under the same
workload.

One item I'd like to highlight about this patch is that the
security_task_to_inode() hook is called less frequently as a result. I don't
know whether this is a major concern, which is why I've included security
reviewers as well.

 fs/proc/base.c      | 50 ++++++++++++++++++++++++++++++++-------------
 fs/proc/internal.h  |  5 +++++
 include/linux/pid.h |  2 ++
 kernel/pid.c        | 12 +++++++++++
 4 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebea9501afb8..038056f94ed0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1813,12 +1813,29 @@ int pid_getattr(const struct path *path, struct kstat *stat,
 /*
  * Set <pid>/... inode ownership (can change due to setuid(), etc.)
  */
-void pid_update_inode(struct task_struct *task, struct inode *inode)
+static int do_pid_update_inode(struct task_struct *task, struct inode *inode,
+							   unsigned int flags)
 {
-	task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+	kuid_t uid;
+	kgid_t gid;
+
+	task_dump_owner(task, inode->i_mode, &uid, &gid);
+	if (uid_eq(uid, inode->i_uid) && gid_eq(gid, inode->i_gid) &&
+			!(inode->i_mode & (S_ISUID | S_ISGID)))
+		return 1;
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
 
+	inode->i_uid = uid;
+	inode->i_gid = gid;
 	inode->i_mode &= ~(S_ISUID | S_ISGID);
 	security_task_to_inode(task, inode);
+	return 1;
+}
+
+void pid_update_inode(struct task_struct *task, struct inode *inode)
+{
+	do_pid_update_inode(task, inode, 0);
 }
 
 /*
@@ -1830,19 +1847,24 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = d_inode(dentry);
-	task = get_proc_task(inode);
-
-	if (task) {
-		pid_update_inode(task, inode);
-		put_task_struct(task);
-		return 1;
+	int rv = 0;
+
+	if (flags & LOOKUP_RCU) {
+		inode = d_inode_rcu(dentry);
+		task = get_proc_task_rcu(inode);
+		if (task) {
+			rv = do_pid_update_inode(task, inode, flags);
+			put_task_struct_rcu_user(task);
+		}
+	} else {
+		inode = d_inode(dentry);
+		task = get_proc_task(inode);
+		if (task) {
+			rv = do_pid_update_inode(task, inode, flags);
+			put_task_struct(task);
+		}
 	}
-	return 0;
+	return rv;
 }
 
 static inline bool proc_inode_is_dead(struct inode *inode)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..aa6df65ad3eb 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -121,6 +121,11 @@ static inline struct task_struct *get_proc_task(const struct inode *inode)
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
 
+static inline struct task_struct *get_proc_task_rcu(const struct inode *inode)
+{
+	return get_pid_task_rcu_user(proc_pid(inode), PIDTYPE_PID);
+}
+
 void task_dump_owner(struct task_struct *task, umode_t mode,
 		     kuid_t *ruid, kgid_t *rgid);
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 9645b1194c98..0b2c54f85e6d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -86,6 +86,8 @@ static inline struct pid *get_pid(struct pid *pid)
 extern void put_pid(struct pid *pid);
 extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
 extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
+extern struct task_struct *get_pid_task_rcu_user(struct pid *pid,
+						 enum pid_type type);
 
 extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 0a9f2e437217..05acbd15cfa6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -390,6 +390,18 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
 }
 EXPORT_SYMBOL_GPL(get_pid_task);
 
+struct task_struct *get_pid_task_rcu_user(struct pid *pid, enum pid_type type)
+{
+	struct task_struct *task;
+
+	task = pid_task(pid, type);
+	if (task && refcount_inc_not_zero(&task->rcu_users))
+		return task;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(get_pid_task_rcu_user);
+
 struct pid *find_get_pid(pid_t nr)
 {
 	struct pid *pid;
-- 
2.25.1


             reply	other threads:[~2020-11-30 20:08 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-30 20:06 Stephen Brennan [this message]
2020-11-30 20:46 ` [PATCH] proc: Allow pid_revalidate() during LOOKUP_RCU Eric W. Biederman
2020-12-01 23:49   ` Stephen Brennan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201130200619.84819-1-stephen.s.brennan@oracle.com \
    --to=stephen.s.brennan@oracle.com \
    --cc=adobriyan@gmail.com \
    --cc=casey@schaufler-ca.com \
    --cc=ebiederm@xmission.com \
    --cc=eparis@parisplace.org \
    --cc=jmorris@namei.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=paul@paul-moore.com \
    --cc=selinux@vger.kernel.org \
    --cc=serge@hallyn.com \
    --cc=stephen.smalley.work@gmail.com \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.