Kernel-hardening archive on lore.kernel.org
 help / color / Atom feed
From: ebiederm@xmission.com (Eric W. Biederman)
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>,
	 LKML <linux-kernel@vger.kernel.org>,
	 Kernel Hardening <kernel-hardening@lists.openwall.com>,
	 Linux API <linux-api@vger.kernel.org>,
	 Linux FS Devel <linux-fsdevel@vger.kernel.org>,
	 Linux Security Module <linux-security-module@vger.kernel.org>,
	 Akinobu Mita <akinobu.mita@gmail.com>,
	 Alexey Dobriyan <adobriyan@gmail.com>,
	 Andrew Morton <akpm@linux-foundation.org>,
	 Andy Lutomirski <luto@kernel.org>,
	 Daniel Micay <danielmicay@gmail.com>,
	 Djalal Harouni <tixxdz@gmail.com>,
	 "Dmitry V . Levin" <ldv@altlinux.org>,
	 Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	 Ingo Molnar <mingo@kernel.org>,
	 "J . Bruce Fields" <bfields@fieldses.org>,
	 Jeff Layton <jlayton@poochiereds.net>,
	 Jonathan Corbet <corbet@lwn.net>,
	 Kees Cook <keescook@chromium.org>,
	 Oleg Nesterov <oleg@redhat.com>,
	 Solar Designer <solar@openwall.com>
Subject: [PATCH 6/7] proc: Use a list of inodes to flush from proc
Date: Thu, 20 Feb 2020 14:52:12 -0600
Message-ID: <87wo8h7zib.fsf_-_@x220.int.ebiederm.org> (raw)
In-Reply-To: <871rqpaswu.fsf_-_@x220.int.ebiederm.org> (Eric W. Biederman's message of "Thu, 20 Feb 2020 14:46:25 -0600")


Rework the flushing of proc to use a list of directory inodes that
need to be flushed.

The list is kept on struct pid not on struct task_struct, as there is
a fixed connection between proc inodes and pids but at least for the
case of de_thread the pid of a task_struct changes.

This removes the dependency on proc_mnt which allows for different
mounts of proc having different mount options even in the same pid
namespace and this allows for the removal of proc_mnt which will
trivially the first mount of proc to honor it's mount options.

This flushing remains an optimization.  The functions
pid_delete_dentry and pid_revalidate ensure that ordinary dcache
management will not attempt to use dentries past the point their
respective task has died.  When unused the shrinker will
eventually be able to remove these dentries.

There is a case in de_thread where proc_flush_pid can be
called early for a given pid.  Which winds up being
safe (if suboptimal) as this is just an optiimization.

Only pid directories are put on the list as the other
per pid files are children of those directories and
d_invalidate on the directory will get them as well.

So that the pid can be used during flushing it's reference count is
taken in release_task and dropped in proc_flush_pid.  Further the call
of proc_flush_pid is moved after the tasklist_lock is released in
release_task so that it is certain that the pid has already been
unhashed when flushing it taking place.  This removes a small race
where a dentry could recreated.

As struct pid is supposed to be small and I need a per pid lock
I reuse the only lock that currently exists in struct pid the
the wait_pidfd.lock.

The net result is that this adds all of this functionality
with just a little extra list management overhead and
a single extra pointer in struct pid.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/base.c          | 111 +++++++++++++---------------------------
 fs/proc/inode.c         |   2 +-
 fs/proc/internal.h      |   1 +
 include/linux/pid.h     |   1 +
 include/linux/proc_fs.h |   4 +-
 kernel/exit.c           |   4 +-
 6 files changed, 44 insertions(+), 79 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7c64272b0fa..e7efe9d6f3d6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1834,11 +1834,25 @@ void task_dump_owner(struct task_struct *task, umode_t mode,
 	*rgid = gid;
 }
 
+void proc_pid_evict_inode(struct proc_inode *ei)
+{
+	struct pid *pid = ei->pid;
+
+	if (S_ISDIR(ei->vfs_inode.i_mode)) {
+		spin_lock(&pid->wait_pidfd.lock);
+		hlist_del_init_rcu(&ei->sibling_inodes);
+		spin_unlock(&pid->wait_pidfd.lock);
+	}
+
+	put_pid(pid);
+}
+
 struct inode *proc_pid_make_inode(struct super_block * sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
+	struct pid *pid;
 
 	/* We need a new inode */
 
@@ -1856,10 +1870,18 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	/*
 	 * grab the reference to task.
 	 */
-	ei->pid = get_task_pid(task, PIDTYPE_PID);
-	if (!ei->pid)
+	pid = get_task_pid(task, PIDTYPE_PID);
+	if (!pid)
 		goto out_unlock;
 
+	/* Let the pid remember us for quick removal */
+	ei->pid = pid;
+	if (S_ISDIR(mode)) {
+		spin_lock(&pid->wait_pidfd.lock);
+		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+		spin_unlock(&pid->wait_pidfd.lock);
+	}
+
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
 
@@ -3230,90 +3252,29 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.permission	= proc_pid_permission,
 };
 
-static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
-{
-	struct dentry *dentry, *leader, *dir;
-	char buf[10 + 1];
-	struct qstr name;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", pid);
-	/* no ->d_hash() rejects on procfs */
-	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
-	if (dentry) {
-		d_invalidate(dentry);
-		dput(dentry);
-	}
-
-	if (pid == tgid)
-		return;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", tgid);
-	leader = d_hash_and_lookup(mnt->mnt_root, &name);
-	if (!leader)
-		goto out;
-
-	name.name = "task";
-	name.len = strlen(name.name);
-	dir = d_hash_and_lookup(leader, &name);
-	if (!dir)
-		goto out_put_leader;
-
-	name.name = buf;
-	name.len = snprintf(buf, sizeof(buf), "%u", pid);
-	dentry = d_hash_and_lookup(dir, &name);
-	if (dentry) {
-		d_invalidate(dentry);
-		dput(dentry);
-	}
-
-	dput(dir);
-out_put_leader:
-	dput(leader);
-out:
-	return;
-}
-
 /**
- * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
- * @task: task that should be flushed.
+ * proc_flush_pid -  Remove dcache entries for @pid from the /proc dcache.
+ * @pid: pid that should be flushed.
  *
- * When flushing dentries from proc, one needs to flush them from global
- * proc (proc_mnt) and from all the namespaces' procs this task was seen
- * in. This call is supposed to do all of this job.
- *
- * Looks in the dcache for
- * /proc/@pid
- * /proc/@tgid/task/@pid
- * if either directory is present flushes it and all of it'ts children
- * from the dcache.
+ * This function walks a list of inodes (that belong to any proc
+ * filesystem) that are attached to the pid and flushes them from
+ * the dentry cache.
  *
  * It is safe and reasonable to cache /proc entries for a task until
  * that task exits.  After that they just clog up the dcache with
  * useless entries, possibly causing useful dcache entries to be
- * flushed instead.  This routine is proved to flush those useless
- * dcache entries at process exit time.
+ * flushed instead.  This routine is provided to flush those useless
+ * dcache entries when a process is reaped.
  *
  * NOTE: This routine is just an optimization so it does not guarantee
- *       that no dcache entries will exist at process exit time it
- *       just makes it very unlikely that any will persist.
+ *       that no dcache entries will exist after a process is reaped
+ *       it just makes it very unlikely that any will persist.
  */
 
-void proc_flush_task(struct task_struct *task)
+void proc_flush_pid(struct pid *pid)
 {
-	int i;
-	struct pid *pid, *tgid;
-	struct upid *upid;
-
-	pid = task_pid(task);
-	tgid = task_tgid(task);
-
-	for (i = 0; i <= pid->level; i++) {
-		upid = &pid->numbers[i];
-		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
-					tgid->numbers[i].nr);
-	}
+	proc_invalidate_siblings_dcache(&pid->inodes, &pid->wait_pidfd.lock);
+	put_pid(pid);
 }
 
 static struct dentry *proc_pid_instantiate(struct dentry * dentry,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3c9082cd257b..979e867be3e5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
 
 	/* Stop tracking associated processes */
 	if (ei->pid) {
-		put_pid(ei->pid);
+		proc_pid_evict_inode(ei);
 		ei->pid = NULL;
 	}
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index fd470172675f..9e294f0290e5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -158,6 +158,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *,
 extern const struct dentry_operations pid_dentry_operations;
 extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int);
 extern int proc_setattr(struct dentry *, struct iattr *);
+extern void proc_pid_evict_inode(struct proc_inode *);
 extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t);
 extern void pid_update_inode(struct task_struct *, struct inode *);
 extern int pid_delete_dentry(const struct dentry *);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 998ae7d24450..01a0d4e28506 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -62,6 +62,7 @@ struct pid
 	unsigned int level;
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
+	struct hlist_head inodes;
 	/* wait queue for pidfd notifications */
 	wait_queue_head_t wait_pidfd;
 	struct rcu_head rcu;
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 3dfa92633af3..40a7982b7285 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -32,7 +32,7 @@ struct proc_ops {
 typedef int (*proc_write_t)(struct file *, char *, size_t);
 
 extern void proc_root_init(void);
-extern void proc_flush_task(struct task_struct *);
+extern void proc_flush_pid(struct pid *);
 
 extern struct proc_dir_entry *proc_symlink(const char *,
 		struct proc_dir_entry *, const char *);
@@ -105,7 +105,7 @@ static inline void proc_root_init(void)
 {
 }
 
-static inline void proc_flush_task(struct task_struct *task)
+static inline void proc_flush_pid(struct pid *pid)
 {
 }
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 2833ffb0c211..502b4995b688 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -191,6 +191,7 @@ void put_task_struct_rcu_user(struct task_struct *task)
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
+	struct pid *thread_pid;
 	int zap_leader;
 repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
@@ -199,11 +200,11 @@ void release_task(struct task_struct *p)
 	atomic_dec(&__task_cred(p)->user->processes);
 	rcu_read_unlock();
 
-	proc_flush_task(p);
 	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
+	thread_pid = get_pid(p->thread_pid);
 	__exit_signal(p);
 
 	/*
@@ -226,6 +227,7 @@ void release_task(struct task_struct *p)
 	}
 
 	write_unlock_irq(&tasklist_lock);
+	proc_flush_pid(thread_pid);
 	release_thread(p);
 	put_task_struct_rcu_user(p);
 
-- 
2.20.1


  parent reply index

Thread overview: 85+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-10 15:05 [PATCH v8 00/11] proc: modernize proc to support multiple private instances Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 01/11] proc: Rename struct proc_fs_info to proc_fs_opts Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 02/11] proc: add proc_fs_info struct to store proc information Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 03/11] proc: move /proc/{self|thread-self} dentries to proc_fs_info Alexey Gladkov
2020-02-10 18:23   ` Andy Lutomirski
2020-02-12 15:00     ` Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 04/11] proc: move hide_pid, pid_gid from pid_namespace " Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 05/11] proc: add helpers to set and get proc hidepid and gid mount options Alexey Gladkov
2020-02-10 18:30   ` Andy Lutomirski
2020-02-12 14:57     ` Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 06/11] proc: support mounting procfs instances inside same pid namespace Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 07/11] proc: flush task dcache entries from all procfs instances Alexey Gladkov
2020-02-10 17:46   ` Linus Torvalds
2020-02-10 19:23     ` Al Viro
2020-02-11  1:36   ` Eric W. Biederman
2020-02-11  4:01     ` Eric W. Biederman
2020-02-12 14:49     ` Alexey Gladkov
2020-02-12 14:59       ` Eric W. Biederman
2020-02-12 17:08         ` Alexey Gladkov
2020-02-12 18:45         ` Linus Torvalds
2020-02-12 19:16           ` Eric W. Biederman
2020-02-12 19:49             ` Linus Torvalds
2020-02-12 20:03               ` Al Viro
2020-02-12 20:35                 ` Linus Torvalds
2020-02-12 20:38                   ` Al Viro
2020-02-12 20:41                     ` Al Viro
2020-02-12 21:02                       ` Linus Torvalds
2020-02-12 21:46                         ` Eric W. Biederman
2020-02-13  0:48                           ` Linus Torvalds
2020-02-13  4:37                             ` Eric W. Biederman
2020-02-13  5:55                               ` Al Viro
2020-02-13 21:30                                 ` Linus Torvalds
2020-02-13 22:23                                   ` Al Viro
2020-02-13 22:47                                     ` Linus Torvalds
2020-02-14 14:15                                       ` Eric W. Biederman
2020-02-14  3:48                                 ` Eric W. Biederman
2020-02-20 20:46                               ` [PATCH 0/7] proc: Dentry flushing without proc_mnt Eric W. Biederman
2020-02-20 20:47                                 ` [PATCH 1/7] proc: Rename in proc_inode rename sysctl_inodes sibling_inodes Eric W. Biederman
2020-02-20 20:48                                 ` [PATCH 2/7] proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache Eric W. Biederman
2020-02-20 20:49                                 ` [PATCH 3/7] proc: Mov rcu_read_(lock|unlock) in proc_prune_siblings_dcache Eric W. Biederman
2020-02-20 22:33                                   ` Linus Torvalds
2020-02-20 20:49                                 ` [PATCH 4/7] proc: Use d_invalidate " Eric W. Biederman
2020-02-20 22:43                                   ` Linus Torvalds
2020-02-20 22:54                                   ` Al Viro
2020-02-20 23:00                                     ` Linus Torvalds
2020-02-20 23:03                                     ` Al Viro
2020-02-20 23:39                                       ` Eric W. Biederman
2020-02-20 20:51                                 ` [PATCH 5/7] proc: Clear the pieces of proc_inode that proc_evict_inode cares about Eric W. Biederman
2020-02-20 20:52                                 ` Eric W. Biederman [this message]
2020-02-20 20:52                                 ` [PATCH 7/7] proc: Ensure we see the exit of each process tid exactly once Eric W. Biederman
2020-02-21 16:50                                   ` Oleg Nesterov
2020-02-22 15:46                                     ` Eric W. Biederman
2020-02-20 23:02                                 ` [PATCH 0/7] proc: Dentry flushing without proc_mnt Linus Torvalds
2020-02-20 23:07                                   ` Al Viro
2020-02-20 23:37                                     ` Eric W. Biederman
2020-02-24 16:25                                 ` [PATCH v2 0/6] " Eric W. Biederman
2020-02-24 16:26                                   ` [PATCH v2 1/6] proc: Rename in proc_inode rename sysctl_inodes sibling_inodes Eric W. Biederman
2020-02-24 16:27                                   ` [PATCH v2 2/6] proc: Generalize proc_sys_prune_dcache into proc_prune_siblings_dcache Eric W. Biederman
2020-02-24 16:27                                   ` [PATCH v2 3/6] proc: In proc_prune_siblings_dcache cache an aquired super block Eric W. Biederman
2020-02-24 16:28                                   ` [PATCH v2 4/6] proc: Use d_invalidate in proc_prune_siblings_dcache Eric W. Biederman
2020-02-24 16:28                                   ` [PATCH v2 5/6] proc: Clear the pieces of proc_inode that proc_evict_inode cares about Eric W. Biederman
2020-02-24 16:29                                   ` [PATCH v2 6/6] proc: Use a list of inodes to flush from proc Eric W. Biederman
2020-02-28 20:17                                   ` [PATCH 0/3] proc: Actually honor the mount options Eric W. Biederman
2020-02-28 20:18                                     ` [PATCH 1/3] uml: Don't consult current to find the proc_mnt in mconsole_proc Eric W. Biederman
2020-02-28 20:18                                     ` [PATCH 2/3] uml: Create a private mount of proc for mconsole Eric W. Biederman
2020-02-28 20:30                                       ` Christian Brauner
2020-02-28 21:28                                         ` Eric W. Biederman
2020-02-28 21:59                                           ` Christian Brauner
2020-02-28 20:19                                     ` [PATCH 3/3] proc: Remove the now unnecessary internal mount of proc Eric W. Biederman
2020-02-28 20:39                                       ` Christian Brauner
2020-02-28 21:40                                         ` Eric W. Biederman
2020-02-28 22:34                                     ` [PATCH 4/3] pid: Improve the comment about waiting in zap_pid_ns_processes Eric W. Biederman
2020-02-29  2:59                                       ` Christian Brauner
2020-02-14  3:49                     ` [PATCH v8 07/11] proc: flush task dcache entries from all procfs instances Eric W. Biederman
2020-02-12 19:47           ` Al Viro
2020-02-11 22:45   ` Al Viro
2020-02-12 14:26     ` Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 08/11] proc: instantiate only pids that we can ptrace on 'hidepid=4' mount option Alexey Gladkov
2020-02-10 16:29   ` Jordan Glover
2020-02-12 14:34     ` Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 09/11] proc: add option to mount only a pids subset Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 10/11] docs: proc: add documentation for "hidepid=4" and "subset=pidfs" options and new mount behavior Alexey Gladkov
2020-02-10 18:29   ` Andy Lutomirski
2020-02-12 16:03     ` Alexey Gladkov
2020-02-10 15:05 ` [PATCH v8 11/11] proc: Move hidepid values to uapi as they are user interface to mount Alexey Gladkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87wo8h7zib.fsf_-_@x220.int.ebiederm.org \
    --to=ebiederm@xmission.com \
    --cc=adobriyan@gmail.com \
    --cc=akinobu.mita@gmail.com \
    --cc=akpm@linux-foundation.org \
    --cc=bfields@fieldses.org \
    --cc=corbet@lwn.net \
    --cc=danielmicay@gmail.com \
    --cc=gregkh@linuxfoundation.org \
    --cc=jlayton@poochiereds.net \
    --cc=keescook@chromium.org \
    --cc=kernel-hardening@lists.openwall.com \
    --cc=ldv@altlinux.org \
    --cc=linux-api@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=mingo@kernel.org \
    --cc=oleg@redhat.com \
    --cc=solar@openwall.com \
    --cc=tixxdz@gmail.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Kernel-hardening archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/kernel-hardening/0 kernel-hardening/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 kernel-hardening kernel-hardening/ https://lore.kernel.org/kernel-hardening \
		kernel-hardening@lists.openwall.com
	public-inbox-index kernel-hardening

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/com.openwall.lists.kernel-hardening


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git