All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 0/2] Introduce /proc/pid/map_files v6
@ 2011-08-31  7:58 Cyrill Gorcunov
  2011-08-31  7:58 ` [patch 1/2] fs, proc: Make proc_get_link to use dentry instead of inode Cyrill Gorcunov
  2011-08-31  7:58 ` [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6 Cyrill Gorcunov
  0 siblings, 2 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31  7:58 UTC (permalink / raw)
  To: containers, linux-kernel, linux-fsdevel
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Tejun Heo, Vasiliy Kulikov, Kirill A. Shutemov,
	Alexey Dobriyan, Al Viro, Andrew Morton, Pavel Emelyanov

Hi,

this series (I hope) do address all concerns on /proc/pid/map_files
feature. Since it needs to change proc_get_link arguments type --
such change was factored out into a separate patch.

Please review, comments/complains are *highly* appreciated!

Thanks,
  Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [patch 1/2] fs, proc: Make proc_get_link to use dentry instead of inode
  2011-08-31  7:58 [patch 0/2] Introduce /proc/pid/map_files v6 Cyrill Gorcunov
@ 2011-08-31  7:58 ` Cyrill Gorcunov
  2011-08-31  7:58 ` [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6 Cyrill Gorcunov
  1 sibling, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31  7:58 UTC (permalink / raw)
  To: containers, linux-kernel, linux-fsdevel
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Tejun Heo, Vasiliy Kulikov, Kirill A. Shutemov,
	Alexey Dobriyan, Al Viro, Andrew Morton, Pavel Emelyanov,
	Cyrill Gorcunov

[-- Attachment #1: fs-proc-switch-to-dentry --]
[-- Type: text/plain, Size: 3573 bytes --]

This patch prepares the ground for the next "map_files"
patch which needs a name of a link file to analyse.

So instead of squashing this change into one big
patch the separate one is done.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c          |   20 ++++++++++----------
 include/linux/proc_fs.h |    2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,9 +165,9 @@ static int get_task_root(struct task_str
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -1580,13 +1580,13 @@ static const struct file_operations proc
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct file *exe_file;
 
-	task = get_proc_task(inode);
+	task = get_proc_task(dentry->d_inode);
 	if (!task)
 		return -ENOENT;
 	mm = get_task_mm(task);
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
 	if (error)
 		goto out;
 
@@ -1947,9 +1947,9 @@ static int proc_fd_info(struct inode *in
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(inode, path, NULL);
+	return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-	int (*proc_get_link)(struct inode *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31  7:58 [patch 0/2] Introduce /proc/pid/map_files v6 Cyrill Gorcunov
  2011-08-31  7:58 ` [patch 1/2] fs, proc: Make proc_get_link to use dentry instead of inode Cyrill Gorcunov
@ 2011-08-31  7:58 ` Cyrill Gorcunov
  2011-08-31  9:06   ` Vasiliy Kulikov
  2011-09-02  1:54   ` Nicholas Miell
  1 sibling, 2 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31  7:58 UTC (permalink / raw)
  To: containers, linux-kernel, linux-fsdevel
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Tejun Heo, Vasiliy Kulikov, Kirill A. Shutemov,
	Alexey Dobriyan, Al Viro, Andrew Morton, Pavel Emelyanov,
	Cyrill Gorcunov

[-- Attachment #1: cr-proc-map-files-13 --]
[-- Type: text/plain, Size: 10865 bytes --]

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member
   a bit on x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c |  321 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 321 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2170,6 +2170,326 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+	return vma;
+}
+
+static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
+{
+	int ret = -EINVAL;
+	char *endp;
+
+	if (unlikely(!name))
+		goto err;
+
+	*start = simple_strtoul(name, &endp, 16);
+	if (*endp != '-')
+		goto err;
+	*end = simple_strtoul(endp + 1, &endp, 16);
+	if (*endp != 0)
+		goto err;
+
+	ret = 0;
+
+err:
+	return ret;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vm_area_struct *vma = NULL;
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		vma = find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (vma) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = map_name_to_addr(dentry->d_name.name,
+			      &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ));
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_setattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		struct map_files_info *info = NULL;
+		unsigned long nr_files, used, i;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		/*
+		 * We need two passes here:
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 * othrewise we get lockdep complains since filldir
+		 * might sleep.
+		 */
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
+			if (!info)
+				ret = -ENOMEM;
+			for (vma = mm->mmap, vmai = 2; vma && info; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				vmai++;
+				if (vmai <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info[used].file	= vma->vm_file;
+				info[used].len	= snprintf(info[used].name,
+							   sizeof(info[used].name),
+							   "%lx-%lx",
+							   vma->vm_start,
+							   vma->vm_end);
+				used++;
+			}
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name, info[i].len,
+					      proc_map_files_instantiate,
+					      task, info[i].file);
+			if (ret)
+				break;
+			filp->f_pos++;
+		}
+
+		for (i = 0; i < used; i++)
+			put_filp(info[i].file);
+
+		kfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +3105,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31  7:58 ` [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6 Cyrill Gorcunov
@ 2011-08-31  9:06   ` Vasiliy Kulikov
  2011-08-31 10:12     ` Cyrill Gorcunov
  2011-08-31 11:26     ` Cyrill Gorcunov
  2011-09-02  1:54   ` Nicholas Miell
  1 sibling, 2 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-08-31  9:06 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Tejun Heo, Kirill A. Shutemov, Alexey Dobriyan, Al Viro,
	Andrew Morton, Pavel Emelyanov

Hi,

On Wed, Aug 31, 2011 at 11:58 +0400, Cyrill Gorcunov wrote:
> From: Pavel Emelyanov <xemul@parallels.com>
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> the target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.

I'm late noting it before the RFCv6 :)  Besides checking
ptrace_may_access() on ->lookup and ->readdir you also should define
->stat, otherwise you can bypass ptrace checks if there is a
corresponding dentry in the cache.  The same issue existed in fd* handlers:

http://www.openwall.com/lists/kernel-hardening/2011/08/29/1

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31  9:06   ` Vasiliy Kulikov
@ 2011-08-31 10:12     ` Cyrill Gorcunov
  2011-08-31 11:26     ` Cyrill Gorcunov
  1 sibling, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31 10:12 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Tejun Heo, Kirill A. Shutemov, Alexey Dobriyan, Al Viro,
	Andrew Morton, Pavel Emelyanov

On Wed, Aug 31, 2011 at 01:06:12PM +0400, Vasiliy Kulikov wrote:
> Hi,
> 
> On Wed, Aug 31, 2011 at 11:58 +0400, Cyrill Gorcunov wrote:
> > From: Pavel Emelyanov <xemul@parallels.com>
> > 
> > This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> > one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> > the target is the file. Opening a symlink results in a file that point exactly
> > to the same inode as them vma's one.
> 
> I'm late noting it before the RFCv6 :)  Besides checking
> ptrace_may_access() on ->lookup and ->readdir you also should define
> ->stat, otherwise you can bypass ptrace checks if there is a
> corresponding dentry in the cache.  The same issue existed in fd* handlers:
> 
> http://www.openwall.com/lists/kernel-hardening/2011/08/29/1
> 

Yeah, good point, thanks Vasiliy.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31  9:06   ` Vasiliy Kulikov
  2011-08-31 10:12     ` Cyrill Gorcunov
@ 2011-08-31 11:26     ` Cyrill Gorcunov
  2011-08-31 14:04       ` Kirill A. Shutemov
  1 sibling, 1 reply; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31 11:26 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Tejun Heo, Kirill A. Shutemov, Alexey Dobriyan, Al Viro,
	Andrew Morton, Pavel Emelyanov

On Wed, Aug 31, 2011 at 01:06:12PM +0400, Vasiliy Kulikov wrote:
> Hi,
> 
> On Wed, Aug 31, 2011 at 11:58 +0400, Cyrill Gorcunov wrote:
> > From: Pavel Emelyanov <xemul@parallels.com>
> > 
> > This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> > one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> > the target is the file. Opening a symlink results in a file that point exactly
> > to the same inode as them vma's one.
> 
> I'm late noting it before the RFCv6 :)  Besides checking
> ptrace_may_access() on ->lookup and ->readdir you also should define
> ->stat, otherwise you can bypass ptrace checks if there is a
> corresponding dentry in the cache.  The same issue existed in fd* handlers:
> 
> http://www.openwall.com/lists/kernel-hardening/2011/08/29/1
> 

OK, here is an updated one. Thanks for feedback. Hope this time
all nits are addressed. Still reviews/complains are *very* appreciated.

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v7

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member
   a bit on x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c |  368 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 368 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2170,6 +2170,373 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+	return vma;
+}
+
+static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
+{
+	int ret = -EINVAL;
+	char *endp;
+
+	if (unlikely(!name))
+		goto err;
+
+	*start = simple_strtoul(name, &endp, 16);
+	if (*endp != '-')
+		goto err;
+	*end = simple_strtoul(endp + 1, &endp, 16);
+	if (*endp != 0)
+		goto err;
+
+	ret = 0;
+
+err:
+	return ret;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vm_area_struct *vma = NULL;
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		vma = find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (vma) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = map_name_to_addr(dentry->d_name.name,
+			      &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ));
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+/*
+ * NOTE: The getattr/setattr for both /proc/$pid/map_files and
+ * /proc/$pid/fd seems to have share the code, so need to be
+ * unified and code duplication eliminated!
+ */
+
+int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		struct map_files_info *info = NULL;
+		unsigned long nr_files, used, i;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		/*
+		 * We need two passes here:
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 * othrewise we get lockdep complains since filldir
+		 * might sleep.
+		 */
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
+			if (!info)
+				ret = -ENOMEM;
+			for (vma = mm->mmap, vmai = 2; vma && info; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				vmai++;
+				if (vmai <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info[used].file	= vma->vm_file;
+				info[used].len	= snprintf(info[used].name,
+							   sizeof(info[used].name),
+							   "%lx-%lx",
+							   vma->vm_start,
+							   vma->vm_end);
+				used++;
+			}
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name, info[i].len,
+					      proc_map_files_instantiate,
+					      task, info[i].file);
+			if (ret)
+				break;
+			filp->f_pos++;
+		}
+
+		for (i = 0; i < used; i++)
+			put_filp(info[i].file);
+
+		kfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +3152,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 11:26     ` Cyrill Gorcunov
@ 2011-08-31 14:04       ` Kirill A. Shutemov
  2011-08-31 14:09         ` Cyrill Gorcunov
  2011-08-31 14:26         ` Cyrill Gorcunov
  0 siblings, 2 replies; 82+ messages in thread
From: Kirill A. Shutemov @ 2011-08-31 14:04 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Tejun Heo, Alexey Dobriyan, Al Viro,
	Andrew Morton, Pavel Emelyanov

On Wed, Aug 31, 2011 at 03:26:42PM +0400, Cyrill Gorcunov wrote:
> On Wed, Aug 31, 2011 at 01:06:12PM +0400, Vasiliy Kulikov wrote:
> > Hi,
> > 
> > On Wed, Aug 31, 2011 at 11:58 +0400, Cyrill Gorcunov wrote:
> > > From: Pavel Emelyanov <xemul@parallels.com>
> > > 
> > > This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> > > one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> > > the target is the file. Opening a symlink results in a file that point exactly
> > > to the same inode as them vma's one.
> > 
> > I'm late noting it before the RFCv6 :)  Besides checking
> > ptrace_may_access() on ->lookup and ->readdir you also should define
> > ->stat, otherwise you can bypass ptrace checks if there is a
> > corresponding dentry in the cache.  The same issue existed in fd* handlers:
> > 
> > http://www.openwall.com/lists/kernel-hardening/2011/08/29/1
> > 
> 
> OK, here is an updated one. Thanks for feedback. Hope this time
> all nits are addressed. Still reviews/complains are *very* appreciated.

Please run checkpatch. It points several warnings and one dangerous error.

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 14:04       ` Kirill A. Shutemov
@ 2011-08-31 14:09         ` Cyrill Gorcunov
  2011-08-31 14:26         ` Cyrill Gorcunov
  1 sibling, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31 14:09 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Tejun Heo, Alexey Dobriyan, Al Viro,
	Andrew Morton, Pavel Emelyanov

On Wed, Aug 31, 2011 at 05:04:16PM +0300, Kirill A. Shutemov wrote:
...
> > 
> > OK, here is an updated one. Thanks for feedback. Hope this time
> > all nits are addressed. Still reviews/complains are *very* appreciated.
> 
> Please run checkpatch. It points several warnings and one dangerous error.
> 

Doh! Thanks Kirill, it's a parasite semicolon :(

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 14:04       ` Kirill A. Shutemov
  2011-08-31 14:09         ` Cyrill Gorcunov
@ 2011-08-31 14:26         ` Cyrill Gorcunov
  2011-08-31 22:10           ` Andrew Morton
  2011-08-31 22:50           ` Andrew Morton
  1 sibling, 2 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-08-31 14:26 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Tejun Heo, Alexey Dobriyan, Al Viro,
	Andrew Morton, Pavel Emelyanov

On Wed, Aug 31, 2011 at 05:04:16PM +0300, Kirill A. Shutemov wrote:
...
> > 
> > OK, here is an updated one. Thanks for feedback. Hope this time
> > all nits are addressed. Still reviews/complains are *very* appreciated.
> 
> Please run checkpatch. It points several warnings and one dangerous error.
> 

This one passes checkpatch without error (thanks again Kirill for spotting
the parasite semicolon there!).

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v8

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member
   a bit on x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c |  368 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 368 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2170,6 +2170,373 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+	return vma;
+}
+
+static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
+{
+	int ret = -EINVAL;
+	char *endp;
+
+	if (unlikely(!name))
+		goto err;
+
+	*start = simple_strtoul(name, &endp, 16);
+	if (*endp != '-')
+		goto err;
+	*end = simple_strtoul(endp + 1, &endp, 16);
+	if (*endp != 0)
+		goto err;
+
+	ret = 0;
+
+err:
+	return ret;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vm_area_struct *vma = NULL;
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		vma = find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (vma) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = map_name_to_addr(dentry->d_name.name,
+			      &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+/*
+ * NOTE: The getattr/setattr for both /proc/$pid/map_files and
+ * /proc/$pid/fd seems to have share the code, so need to be
+ * unified and code duplication eliminated!
+ */
+
+int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		struct map_files_info *info = NULL;
+		unsigned long nr_files, used, i;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		/*
+		 * We need two passes here:
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 * othrewise we get lockdep complains since filldir
+		 * might sleep.
+		 */
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
+			if (!info)
+				ret = -ENOMEM;
+			for (vma = mm->mmap, vmai = 2; vma && info; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				vmai++;
+				if (vmai <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info[used].file	= vma->vm_file;
+				info[used].len	= snprintf(info[used].name,
+							   sizeof(info[used].name),
+							   "%lx-%lx",
+							   vma->vm_start,
+							   vma->vm_end);
+				used++;
+			}
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name, info[i].len,
+					      proc_map_files_instantiate,
+					      task, info[i].file);
+			if (ret)
+				break;
+			filp->f_pos++;
+		}
+
+		for (i = 0; i < used; i++)
+			put_filp(info[i].file);
+
+		kfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +3152,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 14:26         ` Cyrill Gorcunov
@ 2011-08-31 22:10           ` Andrew Morton
  2011-09-01  3:07               ` Kyle Moffett
                               ` (3 more replies)
  2011-08-31 22:50           ` Andrew Morton
  1 sibling, 4 replies; 82+ messages in thread
From: Andrew Morton @ 2011-08-31 22:10 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Kirill A. Shutemov, Vasiliy Kulikov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Wed, 31 Aug 2011 18:26:22 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Wed, Aug 31, 2011 at 05:04:16PM +0300, Kirill A. Shutemov wrote:
> ...
> > > 
> > > OK, here is an updated one. Thanks for feedback. Hope this time
> > > all nits are addressed. Still reviews/complains are *very* appreciated.
> > 
> > Please run checkpatch. It points several warnings and one dangerous error.
> > 
> 
> This one passes checkpatch without error (thanks again Kirill for spotting
> the parasite semicolon there!).
> 
> 	Cyrill
> ---
> fs, proc: Introduce the /proc/<pid>/map_files/ directory v8
> 
> From: Pavel Emelyanov <xemul@parallels.com>
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> the target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
> 
> This helps checkpointing process in three ways:
> 
> 1. When dumping a task mappings we do know exact file that is mapped by particular
>    region. We do this by opening /proc/pid/map_files/address symlink the way we do
>    with file descriptors.
> 
> 2. This also helps in determining which anonymous shared mappings are shared with
>    each other by comparing the inodes of them.
> 
> 3. When restoring a set of process in case two of them has a mapping shared, we map
>    the memory by the 1st one and then open its /proc/pid/map_files/address file and
>    map it by the 2nd task.

I'm reluctant to merge something like this unless/until it has real
use-cases.

What is the status of your c/r effort?

What additional kernel patches are required to bring that effort to a
usable state and where are those patches?

IOW, before starting to merge things I'd like to get an understanding
of what *all* the patches look like and of what level of c/r
functionality they provide.


This particular patch introduces a distressing amount of duplication of
/proc/pid/maps.  The changelog should provide a really good
justification for doing this: why is /proc/pid/maps (and smaps!)
unsuitable and why cannot maps/smaps be fixed to be suitable?

>
> ...
>
> --- linux-2.6.git.orig/fs/proc/base.c
> +++ linux-2.6.git/fs/proc/base.c
> @@ -2170,6 +2170,373 @@ static const struct file_operations proc
>  	.llseek		= default_llseek,
>  };
>  
> +static struct vm_area_struct *
> +find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
> +{
> +	struct vm_area_struct *vma = find_vma(mm, vm_start);
> +	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
> +		vma = NULL;
> +	return vma;
> +}

This function would benefit from a code comment.

Given that it's pretty generic (indeed there might be open-coded code
which already does this elsewhere), perhaps it should be in mm/mmap.c
as a kernel-wide utility function.  That will add a little overhead to
CONFIG_PROC_FS=n builds, which doesn't seem terribly important.

> +static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
> +{
> +	int ret = -EINVAL;
> +	char *endp;
> +
> +	if (unlikely(!name))
> +		goto err;
> +
> +	*start = simple_strtoul(name, &endp, 16);
> +	if (*endp != '-')
> +		goto err;
> +	*end = simple_strtoul(endp + 1, &endp, 16);
> +	if (*endp != 0)
> +		goto err;
> +
> +	ret = 0;
> +
> +err:
> +	return ret;
> +}

Again, a little bit of interface documentation would be nice.  Explain
what the parsed input format is, at least.

simple_strtoul() is obsolete - use kstrto*().  A checkpatch rule for
this is queued.

> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct vm_area_struct *vma = NULL;
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	const struct cred *cred;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
> +		down_read(&mm->mmap_sem);
> +		vma = find_exact_vma(mm, vm_start, vm_end);

OK, this is nasty.  We have a local variable which points to a vma but
then we release the locks and refcounts which protect that vma.  So we
have a pointer which we cannot dereference.  That's dangerous.

> +		up_read(&mm->mmap_sem);
> +	}
> +
> +	mmput(mm);
> +
> +	if (vma) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;
> +	}

And we don't actually dereference it - at present.  We use it as a bool.

Would it not be nicer, safer and clearer to turn this pointer-as-a-bool
into a bool?

	bool matching_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);

> +out:
> +	d_drop(dentry);
> +	return 0;
> +}
> +
>
> ...
>
> +static struct dentry *
> +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
> +			   struct task_struct *task, const void *ptr)
> +{
> +	const struct file *file = ptr;
> +	struct proc_inode *ei;
> +	struct inode *inode;
> +
> +	if (!file)
> +		return ERR_PTR(-ENOENT);
> +
> +	inode = proc_pid_make_inode(dir->i_sb, task);
> +	if (!inode)
> +		return ERR_PTR(-ENOENT);
> +
> +	ei			= PROC_I(inode);
> +	ei->op.proc_get_link	= proc_map_files_get_link;
> +
> +	inode->i_op	= &proc_pid_link_inode_operations;
> +	inode->i_size	= 64;
> +	inode->i_mode	= S_IFLNK;

The fancy indenting is not a thing we usually do in the kernel.

> +	if (file->f_mode & FMODE_READ)
> +		inode->i_mode |= S_IRUSR | S_IXUSR;
> +	if (file->f_mode & FMODE_WRITE)
> +		inode->i_mode |= S_IWUSR | S_IXUSR;
> +
> +	d_set_d_op(dentry, &tid_map_files_dentry_operations);
> +	d_add(dentry, inode);
> +
> +	return NULL;
> +}
> +
>
> ...
>
> +/*
> + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> + * /proc/$pid/fd seems to have share the code, so need to be
> + * unified and code duplication eliminated!

Why not do this now?

> + */
> +
> +int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)

static

> +{
> +	struct inode *inode = dentry->d_inode;
> +	struct task_struct *task;
> +	int ret = -EACCES;
> +
> +	task = get_proc_task(inode);
> +	if (!task)
> +		return -ESRCH;
> +
> +	if (!lock_trace(task)) {
> +		ret = proc_setattr(dentry, attr);
> +		unlock_trace(task);
> +	}
> +
> +	put_task_struct(task);
> +	return ret;
> +}
> +
>
> ...
>
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	unsigned int vmai;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		struct map_files_info *info = NULL;
> +		unsigned long nr_files, used, i;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		used = 0;
> +
> +		/*
> +		 * We need two passes here:
> +		 *  1) Collect vmas of mapped files with mmap_sem taken
> +		 *  2) Release mmap_sem and instantiate entries
> +		 * othrewise we get lockdep complains since filldir

typo

> +		 * might sleep.
> +		 */

Why would lockdep complain about sleep-inside-mmap_sem?

> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +
> +		if (nr_files) {
> +			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);

I figure sizeof(*info) = 50 bytes.  nr_files can easily be >1000.

On large some applications the kmalloc attempt will be too large.  This
is a must-fix.  Using vmalloc() would be very lame.

> +			if (!info)
> +				ret = -ENOMEM;
> +			for (vma = mm->mmap, vmai = 2; vma && info; vma = vma->vm_next) {
> +				if (!vma->vm_file)
> +					continue;
> +				vmai++;

What is "vmai"?  "vma index"?  If so, please call it vma_index.  If
not, please call it something better anyway.

vmai/vma_index could be made local to this code block.

> +				if (vmai <= filp->f_pos)
> +					continue;
> +
> +				get_file(vma->vm_file);
> +				info[used].file	= vma->vm_file;
> +				info[used].len	= snprintf(info[used].name,
> +							   sizeof(info[used].name),
> +							   "%lx-%lx",
> +							   vma->vm_start,
> +							   vma->vm_end);
> +				used++;
> +			}
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used; i++) {
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      info[i].name, info[i].len,
> +					      proc_map_files_instantiate,
> +					      task, info[i].file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +		}
> +
> +		for (i = 0; i < used; i++)
> +			put_filp(info[i].file);

Why not do the put_filp() in the previous loop and avoid some cache
misses?

> +		kfree(info);
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
>
> ...
>


^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 14:26         ` Cyrill Gorcunov
  2011-08-31 22:10           ` Andrew Morton
@ 2011-08-31 22:50           ` Andrew Morton
  1 sibling, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-08-31 22:50 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Kirill A. Shutemov, Vasiliy Kulikov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Wed, 31 Aug 2011 18:26:22 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> fs, proc: Introduce the /proc/<pid>/map_files/ directory v8
> 
> From: Pavel Emelyanov <xemul@parallels.com>
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> the target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
> 
> This helps checkpointing process in three ways:
> 
> 1. When dumping a task mappings we do know exact file that is mapped by particular
>    region. We do this by opening /proc/pid/map_files/address symlink the way we do
>    with file descriptors.
> 
> 2. This also helps in determining which anonymous shared mappings are shared with
>    each other by comparing the inodes of them.
> 
> 3. When restoring a set of process in case two of them has a mapping shared, we map
>    the memory by the 1st one and then open its /proc/pid/map_files/address file and
>    map it by the 2nd task.

I'm reluctant to merge large hunks c/r infrastructure before knowing
that it will actually be useful and used.

What it the state of your c/r effort?

What additional kernel patches will be needed to enable it?

Where are those patches?



This particular patch is distressingly similar to /proc/pid/maps and
smaps.  To justify a merge the changelog should clearly describe why
maps/smaps are unsuitable for this application and why they are
unfixable.

>
> ...
>
> --- linux-2.6.git.orig/fs/proc/base.c
> +++ linux-2.6.git/fs/proc/base.c
> @@ -2170,6 +2170,373 @@ static const struct file_operations proc
>  	.llseek		= default_llseek,
>  };
>  
> +static struct vm_area_struct *
> +find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
> +{
> +	struct vm_area_struct *vma = find_vma(mm, vm_start);
> +	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
> +		vma = NULL;
> +	return vma;
> +}

This function would be better with a little documentation.

It is quite generic and it may be the case that other parts of the
kernel are already doing this in an open-coded fashion.  Perhaps it
should be placed in mm/mmap.c along with the other vma manipulation
library code.  This would add a small amount of dead code to
CONFIG_PROC_FS=n builds.

> +static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
> +{
> +	int ret = -EINVAL;
> +	char *endp;
> +
> +	if (unlikely(!name))
> +		goto err;
> +
> +	*start = simple_strtoul(name, &endp, 16);
> +	if (*endp != '-')
> +		goto err;
> +	*end = simple_strtoul(endp + 1, &endp, 16);
> +	if (*endp != 0)
> +		goto err;
> +
> +	ret = 0;
> +
> +err:
> +	return ret;
> +}

Again, documentation would improve this code.  At least describe the
parsed input format.

simple_strtoul() is obsolete.  Use kstrto*().  There is a new
checkpatch rule pending for this.

> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct vm_area_struct *vma = NULL;
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	const struct cred *cred;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
> +		down_read(&mm->mmap_sem);
> +		vma = find_exact_vma(mm, vm_start, vm_end);

This is nasty.  We have a pointer to a vma, but we drop the locks and
refcounts which protect that vma.  So we have a local pointer which we
cannot dereference.  It's a bit of a hand-grenade.

> +		up_read(&mm->mmap_sem);
> +	}
> +
> +	mmput(mm);
> +
> +	if (vma) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;

And indeed it was not dereferenced (yet).  It is treated as a bool.

Would it not be nicer, safer and clearer to turn the pointer-as-a-bool
into a bool?

	bool matching_vma_found = !!find_exact_vma(mm, vm_start, vm_end);

> +	}
> +out:
> +	d_drop(dentry);
> +	return 0;
> +}
> +
>
> ...
>
> +static struct dentry *
> +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
> +			   struct task_struct *task, const void *ptr)
> +{
> +	const struct file *file = ptr;
> +	struct proc_inode *ei;
> +	struct inode *inode;
> +
> +	if (!file)
> +		return ERR_PTR(-ENOENT);
> +
> +	inode = proc_pid_make_inode(dir->i_sb, task);
> +	if (!inode)
> +		return ERR_PTR(-ENOENT);
> +
> +	ei			= PROC_I(inode);
> +	ei->op.proc_get_link	= proc_map_files_get_link;
> +
> +	inode->i_op	= &proc_pid_link_inode_operations;
> +	inode->i_size	= 64;
> +	inode->i_mode	= S_IFLNK;

The fancy indenting is atypical for kernel code.

> +	if (file->f_mode & FMODE_READ)
> +		inode->i_mode |= S_IRUSR | S_IXUSR;
> +	if (file->f_mode & FMODE_WRITE)
> +		inode->i_mode |= S_IWUSR | S_IXUSR;
> +
> +	d_set_d_op(dentry, &tid_map_files_dentry_operations);
> +	d_add(dentry, inode);
> +
> +	return NULL;
> +}
> +
>
> ...
>
> +/*
> + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> + * /proc/$pid/fd seems to have share the code, so need to be
> + * unified and code duplication eliminated!

Why not do this now?

> + */
> +
> +int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)

static

> +{
> +	struct inode *inode = dentry->d_inode;
> +	struct task_struct *task;
> +	int ret = -EACCES;
> +
> +	task = get_proc_task(inode);
> +	if (!task)
> +		return -ESRCH;
> +
> +	if (!lock_trace(task)) {
> +		ret = proc_setattr(dentry, attr);
> +		unlock_trace(task);
> +	}
> +
> +	put_task_struct(task);
> +	return ret;
> +}
> +
>
> ...
>
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	unsigned int vmai;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		struct map_files_info *info = NULL;
> +		unsigned long nr_files, used, i;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		used = 0;
> +
> +		/*
> +		 * We need two passes here:
> +		 *  1) Collect vmas of mapped files with mmap_sem taken
> +		 *  2) Release mmap_sem and instantiate entries
> +		 * othrewise we get lockdep complains since filldir
> +		 * might sleep.

Why would a sleep-inside-mmap_sem cause a lockdep warning?

> +		 */
> +
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +
> +		if (nr_files) {
> +			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);

I calculate sizeof(*info) to be 50 bytes.  nr_files can easily be in
the thousands.

So this kmalloc will be far too large on large applications.  This is a
must-fix.  Fixing it with vmalloc() is lame.

> +			if (!info)
> +				ret = -ENOMEM;
> +			for (vma = mm->mmap, vmai = 2; vma && info; vma = vma->vm_next) {

What does "vmai" mean?  "vma index"?  If so, call it vma_index.  If
not, call it something else which is meaningful.

vmai/vma_index could be made local to this code block.

> +				if (!vma->vm_file)
> +					continue;
> +				vmai++;
> +				if (vmai <= filp->f_pos)
> +					continue;
> +
> +				get_file(vma->vm_file);
> +				info[used].file	= vma->vm_file;
> +				info[used].len	= snprintf(info[used].name,
> +							   sizeof(info[used].name),
> +							   "%lx-%lx",
> +							   vma->vm_start,
> +							   vma->vm_end);
> +				used++;
> +			}
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used; i++) {
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      info[i].name, info[i].len,
> +					      proc_map_files_instantiate,
> +					      task, info[i].file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +		}
> +
> +		for (i = 0; i < used; i++)
> +			put_filp(info[i].file);

Why not do the put_filp() in the previous loop and avoid some cache misses?

> +		kfree(info);
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
> +
> +static const struct file_operations proc_map_files_operations = {
> +	.read		= generic_read_dir,
> +	.readdir	= proc_map_files_readdir,
> +	.llseek		= default_llseek,
> +};
>
> ...
>


^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 22:10           ` Andrew Morton
@ 2011-09-01  3:07               ` Kyle Moffett
  2011-09-01  7:58             ` Pavel Emelyanov
                                 ` (2 subsequent siblings)
  3 siblings, 0 replies; 82+ messages in thread
From: Kyle Moffett @ 2011-09-01  3:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Kirill A. Shutemov, Vasiliy Kulikov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Aug 31, 2011 at 18:10, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Wed, 31 Aug 2011 18:26:22 +0400  Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
>> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
>> the target is the file. Opening a symlink results in a file that point exactly
>> to the same inode as them vma's one.
>>
>> For example the ls -l of some arbitrary /proc/<pid>/map_files/
>>
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
>
> This particular patch introduces a distressing amount of duplication of
> /proc/pid/maps.  The changelog should provide a really good
> justification for doing this: why is /proc/pid/maps (and smaps!)
> unsuitable and why cannot maps/smaps be fixed to be suitable?

Andrew,

This is way more useful than /proc/$PID/maps, because this allows you
to reliably obtain an FD to an arbitrary mapped memory segment.

EG:
  $ cp /bin/sleep ~/my-sleep
  $ ~/my-sleep 1000 & kid=$!
  $ rm ~/my-sleep
  $ mkdir ~/mapped_files
  $ for file in /proc/$kid/map_files/*; do cp "$file" ~/mapped_files/; done

This trivially gets me copies of every mapped file, including the deleted
"my-sleep" binary, it also trivially handles lazy-unmounted filesystems,
chroots, namespaces, and all sorts of similar kernel cleverness.

It's exactly the same kind of utility that /proc/$PID/fd/* has, but for
mmap().

With /proc/$kid/maps you simply can't work around some of those
kinds of issues (IE: deleted files, etc).

Cheers,
Kyle Moffett
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-01  3:07               ` Kyle Moffett
  0 siblings, 0 replies; 82+ messages in thread
From: Kyle Moffett @ 2011-09-01  3:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Kirill A. Shutemov, Vasiliy Kulikov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Aug 31, 2011 at 18:10, Andrew Morton <akpm@linux-foundation.org> wrote:
> On Wed, 31 Aug 2011 18:26:22 +0400  Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
>> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
>> the target is the file. Opening a symlink results in a file that point exactly
>> to the same inode as them vma's one.
>>
>> For example the ls -l of some arbitrary /proc/<pid>/map_files/
>>
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
>
> This particular patch introduces a distressing amount of duplication of
> /proc/pid/maps.  The changelog should provide a really good
> justification for doing this: why is /proc/pid/maps (and smaps!)
> unsuitable and why cannot maps/smaps be fixed to be suitable?

Andrew,

This is way more useful than /proc/$PID/maps, because this allows you
to reliably obtain an FD to an arbitrary mapped memory segment.

EG:
  $ cp /bin/sleep ~/my-sleep
  $ ~/my-sleep 1000 & kid=$!
  $ rm ~/my-sleep
  $ mkdir ~/mapped_files
  $ for file in /proc/$kid/map_files/*; do cp "$file" ~/mapped_files/; done

This trivially gets me copies of every mapped file, including the deleted
"my-sleep" binary, it also trivially handles lazy-unmounted filesystems,
chroots, namespaces, and all sorts of similar kernel cleverness.

It's exactly the same kind of utility that /proc/$PID/fd/* has, but for
mmap().

With /proc/$kid/maps you simply can't work around some of those
kinds of issues (IE: deleted files, etc).

Cheers,
Kyle Moffett

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 22:10           ` Andrew Morton
  2011-09-01  3:07               ` Kyle Moffett
@ 2011-09-01  7:58             ` Pavel Emelyanov
  2011-09-01 11:50               ` Tejun Heo
  2011-09-02  0:09               ` Matt Helsley
  2011-09-01  8:05             ` Cyrill Gorcunov
  2011-09-01 10:46             ` Cyrill Gorcunov
  3 siblings, 2 replies; 82+ messages in thread
From: Pavel Emelyanov @ 2011-09-01  7:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Kirill A. Shutemov, Vasiliy Kulikov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro

> I'm reluctant to merge something like this unless/until it has real
> use-cases.
> 
> What is the status of your c/r effort?

Working prototype that can dump and restore tree of processes with their
registers, memory of all kinds (anon/file, private/shared), open regular
files and pipes (with contents).

And we have an RFC set from Tejun that can suspend and resume a TCP connection.

> What additional kernel patches are required to bring that effort to a
> usable state and where are those patches?

* The one you've already accepted with ->statfs for pipefs.
* PTRACE_SEIZE set from Tejun (RFC was sent some time earlier)
* CLONE_USEPID flag for the clone() syscall (Cyrill will re-send a bit later)
* The binfmt handler for images (I've sent it earlier, but there's a discussion
  happening over it. We can do restore without one, but it will improve the
  situation significantly)

Once this is merged the prototype I described above starts working on the
out-of-the-box kernel.

> IOW, before starting to merge things I'd like to get an understanding
> of what *all* the patches look like and of what level of c/r
> functionality they provide.
> 
> This particular patch introduces a distressing amount of duplication of
> /proc/pid/maps.  The changelog should provide a really good
> justification for doing this: why is /proc/pid/maps (and smaps!)
> unsuitable and why cannot maps/smaps be fixed to be suitable?

This is not duplication, since with /proc/pid/s?maps you have no idea of what
file is being mapped by a vma. I mean - you know some "path" to it, but it's
not reliable at all.

Besides, without this set there's absolutely no way to find out which two 
mappings are shared and is extremely hard to restore shared mapping.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 22:10           ` Andrew Morton
  2011-09-01  3:07               ` Kyle Moffett
  2011-09-01  7:58             ` Pavel Emelyanov
@ 2011-09-01  8:05             ` Cyrill Gorcunov
  2011-09-02 16:37                 ` [kernel-hardening] " Vasiliy Kulikov
  2011-09-01 10:46             ` Cyrill Gorcunov
  3 siblings, 1 reply; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-01  8:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Kirill A. Shutemov, Vasiliy Kulikov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Wed, Aug 31, 2011 at 03:10:23PM -0700, Andrew Morton wrote:
...

Pavel has just addressed the intention of the patch in another
reply.

> >  
> > +static struct vm_area_struct *
> > +find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
> > +{
> > +	struct vm_area_struct *vma = find_vma(mm, vm_start);
> > +	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
> > +		vma = NULL;
> > +	return vma;
> > +}
> 
> This function would benefit from a code comment.

ok

> 
> Given that it's pretty generic (indeed there might be open-coded code
> which already does this elsewhere), perhaps it should be in mm/mmap.c
> as a kernel-wide utility function.  That will add a little overhead to
> CONFIG_PROC_FS=n builds, which doesn't seem terribly important.
> 

Will update.

> > +static int map_name_to_addr(const unsigned ...
> > +{
> 
...
> Again, a little bit of interface documentation would be nice.  Explain
> what the parsed input format is, at least.
> 
> simple_strtoul() is obsolete - use kstrto*().  A checkpatch rule for
> this is queued.

OK, will update.

> 
> > +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> > +{
> > +
...
> > +	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
> > +		down_read(&mm->mmap_sem);
> > +		vma = find_exact_vma(mm, vm_start, vm_end);
> 
> OK, this is nasty.  We have a local variable which points to a vma but
> then we release the locks and refcounts which protect that vma.  So we
> have a pointer which we cannot dereference.  That's dangerous.
...
> And we don't actually dereference it - at present.  We use it as a bool.
>
> Would it not be nicer, safer and clearer to turn this pointer-as-a-bool
> into a bool?
> 
> 	bool matching_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
> 

Yeah, thanks, will update.

...
> > +
> > +	inode->i_op	= &proc_pid_link_inode_operations;
> > +	inode->i_size	= 64;
> > +	inode->i_mode	= S_IFLNK;
> 
> The fancy indenting is not a thing we usually do in the kernel.

Fancy indenting really makes code easier to read, and in real
we do it in kernel as well, but ok, I'll drop it.

...
> > +/*
> > + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> > + * /proc/$pid/fd seems to have share the code, so need to be
> > + * unified and code duplication eliminated!
> 
> Why not do this now?

There are a couple of reasons. Yesterday I was talking to
Vasiliy Kulikov about this snippet, so he seems about to send
you patches related to /proc/$pid/fd update, and after those
patches will be merged we are to drop code duplication.
Vasiliy, what the status of the update?

Secondly, I've had a problems trying to download -mm patches
yesterday (I believe 'cause of kernel.org problem) so I rather
put this note here in patch just to *not* forget to do a cleanup
work once things calm down. And eventually, I believe this
cleanup should be separately in a sake of bisectability.

Again, this note is related to the future patch from Vasiliy
and mark for myself to not forget clean it up later.

> 
> > + */
> > +
> > +int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
> 
> static

Nod, thanks.

...
> > +		 * othrewise we get lockdep complains since filldir
> 
> typo

Thanks.

> 
> > +		 * might sleep.
> > +		 */
> 
> Why would lockdep complain about sleep-inside-mmap_sem?

filldir calls for might_fault and need mmap_sem as well
(when CONFIG_PROVE_LOCKING is set) and this makes lockdep
to complain. In real I revealed it during testing.

...
> > +		if (nr_files) {
> > +			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
> 
> I figure sizeof(*info) = 50 bytes.  nr_files can easily be >1000.
> 
> On large some applications the kmalloc attempt will be too large.  This
> is a must-fix.  Using vmalloc() would be very lame.

Yes, I'll add a test which one to use, either kmalloc, either vmalloc,
depending on size needed.

...
> > +				vmai++;
> 
> What is "vmai"?  "vma index"?  If so, please call it vma_index.  If
> not, please call it something better anyway.
> 
> vmai/vma_index could be made local to this code block.

ok

...
> > +		for (i = 0; i < used; i++) {
> > +			ret = proc_fill_cache(filp, dirent, filldir,
> > +					      info[i].name, info[i].len,
> > +					      proc_map_files_instantiate,
> > +					      task, info[i].file);
> > +			if (ret)
> > +				break;
> > +			filp->f_pos++;
> > +		}
> > +
> > +		for (i = 0; i < used; i++)
> > +			put_filp(info[i].file);
> 
> Why not do the put_filp() in the previous loop and avoid some cache
> misses?

Because first loop may fail (break) and I still have to drop
refs to filep -- which in turn means I'll have to continue
from broken position, ie like this

	for (; i < used; i++)
		put_filp(info[i].file);

But, OK, I'll update. Thanks a huge for comments, Andrew!

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31 22:10           ` Andrew Morton
                               ` (2 preceding siblings ...)
  2011-09-01  8:05             ` Cyrill Gorcunov
@ 2011-09-01 10:46             ` Cyrill Gorcunov
  2011-09-01 22:49               ` Andrew Morton
  3 siblings, 1 reply; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-01 10:46 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Kirill A. Shutemov, Vasiliy Kulikov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Wed, Aug 31, 2011 at 03:10:23PM -0700, Andrew Morton wrote:
> 
> This function would benefit from a code comment.
> 
> Given that it's pretty generic (indeed there might be open-coded code
> which already does this elsewhere), perhaps it should be in mm/mmap.c
> as a kernel-wide utility function.  That will add a little overhead to
> CONFIG_PROC_FS=n builds, which doesn't seem terribly important.
> 

Andrew, here is an attempt to address concerns. Please review.
Complains are welcome as always!

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v9

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member
   a bit on x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as
   an inline helper
 - proc_map_files_setattr uses either kmalloc or
   vmalloc depending on how many ojects are to be
   allocated
 - no more map_name_to_addr but dname_to_vma_addr
   introduced instead and it uses sscanf
 - because in one case the find_exact_vma() is used
   only to confirm existence of vma area the
   boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  353 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 365 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2171,6 +2171,358 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	bool exact_vma_exists = false;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		struct map_files_info *info = NULL;
+		unsigned long nr_files, used, pos, i;
+		unsigned long mem_size = 0;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			mem_size = nr_files * sizeof(*info);
+			if (mem_size <= KMALLOC_MAX_SIZE)
+				info = kmalloc(mem_size, GFP_KERNEL);
+			else
+				info = vmalloc(mem_size);
+			if (!info)
+				ret = -ENOMEM;
+			for (vma = mm->mmap, pos = 2; vma && info; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info[used].file	= vma->vm_file;
+				info[used].len	= snprintf(info[used].name,
+							   sizeof(info[used].name),
+							   "%lx-%lx",
+							   vma->vm_start,
+							   vma->vm_end);
+				used++;
+			}
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name, info[i].len,
+					      proc_map_files_instantiate,
+					      task, info[i].file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			put_filp(info[i].file);
+		}
+
+		for (; i < used; i++)
+			put_filp(info[i].file);
+
+		if (mem_size <= KMALLOC_MAX_SIZE)
+			kfree(info);
+		else
+			vfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3137,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01  7:58             ` Pavel Emelyanov
@ 2011-09-01 11:50               ` Tejun Heo
  2011-09-01 12:13                 ` Pavel Emelyanov
  2011-09-02  0:09               ` Matt Helsley
  1 sibling, 1 reply; 82+ messages in thread
From: Tejun Heo @ 2011-09-01 11:50 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andrew Morton, Cyrill Gorcunov, Kirill A. Shutemov,
	Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Alexey Dobriyan, Al Viro

Hello, Andrew, Pavel.

On Thu, Sep 01, 2011 at 11:58:29AM +0400, Pavel Emelyanov wrote:
> > What additional kernel patches are required to bring that effort to a
> > usable state and where are those patches?
> 
> * The one you've already accepted with ->statfs for pipefs.
> * PTRACE_SEIZE set from Tejun (RFC was sent some time earlier)

This one is already in mainline.  It's necessary to make the existing
debuggers (strace and gdb) interact properly with job control.

> * CLONE_USEPID flag for the clone() syscall (Cyrill will re-send a bit later)
> * The binfmt handler for images (I've sent it earlier, but there's a discussion
>   happening over it. We can do restore without one, but it will improve the
>   situation significantly)

I still can't see much point in binfmt handler.  The kernel pieces
should be pretty small no matter how this one gets resolved.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 11:50               ` Tejun Heo
@ 2011-09-01 12:13                 ` Pavel Emelyanov
  2011-09-01 17:13                   ` Tejun Heo
  0 siblings, 1 reply; 82+ messages in thread
From: Pavel Emelyanov @ 2011-09-01 12:13 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Cyrill Gorcunov, Kirill A. Shutemov,
	Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Alexey Dobriyan, Al Viro

On 09/01/2011 03:50 PM, Tejun Heo wrote:
> Hello, Andrew, Pavel.
> 
> On Thu, Sep 01, 2011 at 11:58:29AM +0400, Pavel Emelyanov wrote:
>>> What additional kernel patches are required to bring that effort to a
>>> usable state and where are those patches?
>>
>> * The one you've already accepted with ->statfs for pipefs.
>> * PTRACE_SEIZE set from Tejun (RFC was sent some time earlier)
> 
> This one is already in mainline.  It's necessary to make the existing
> debuggers (strace and gdb) interact properly with job control.
> 
>> * CLONE_USEPID flag for the clone() syscall (Cyrill will re-send a bit later)
>> * The binfmt handler for images (I've sent it earlier, but there's a discussion
>>   happening over it. We can do restore without one, but it will improve the
>>   situation significantly)
> 
> I still can't see much point in binfmt handler.  The kernel pieces
> should be pretty small no matter how this one gets resolved.

Because with the handler restore process looks very natural and simple - each
task does the following steps

1. restore task resources (open files, set IDs, restore connections, wire back timers, etc.)
2. call execve() to jump into new memory+registers context which is
   a. unmap all the user memory
   b. map required mappings
   c. populate them with data
   d. restore registers
   e. restore IP

Note, that steps a through e are what execve() is designed for from day 1. Also note,
that when talking about the binary handler I do not insist in having my own one - it's
perfectly fine with me if we can make the ELF handler do the job (and I'm going to investigate
this ability soon).

With SEIZE it looks worse (maybe I'm seeing it wrong, then correct me please):

1. restore task resources
2. freeze
3. some foreigner attaches a parasite to the frozen task and the parasite
   should do steps a through e from the previous list to restore mem+regs context,
   but when doing steps a and b it should care about not killing himself from the
   target task context

This SEIZE-d restoring looks very complex and not efficient to me.

Am I wrong at some point?
   

> Thanks.
> 


^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 12:13                 ` Pavel Emelyanov
@ 2011-09-01 17:13                   ` Tejun Heo
  2011-09-02 19:15                     ` Matt Helsley
  0 siblings, 1 reply; 82+ messages in thread
From: Tejun Heo @ 2011-09-01 17:13 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andrew Morton, Cyrill Gorcunov, Kirill A. Shutemov,
	Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Alexey Dobriyan, Al Viro

Hello,

On Thu, Sep 01, 2011 at 04:13:54PM +0400, Pavel Emelyanov wrote:
> Because with the handler restore process looks very natural and simple - each
> task does the following steps
> 
> 1. restore task resources (open files, set IDs, restore connections, wire back timers, etc.)
> 2. call execve() to jump into new memory+registers context which is
>    a. unmap all the user memory
>    b. map required mappings
>    c. populate them with data
>    d. restore registers
>    e. restore IP

But what about multiple threads?  exec is already scary enough as it
is and I don't think it would be wise to overload it for this.

I don't really think binfmt handler would be able to achieve
completeness without ending up with full de-serializer in kernel.
There are a lot of states which already have API to manipulate from
the userland thread itself and they all will need to be replicated in
the binfmt handler.  It really has to be full de-serializer if it
wants to function like you described.

And if we're gonna have interventions anyway, I can't see much point
in implementing something which seems simpler - it's not gonna be
actually simpler.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 10:46             ` Cyrill Gorcunov
@ 2011-09-01 22:49               ` Andrew Morton
  2011-09-01 23:04                 ` Tejun Heo
  2011-09-02  5:53                 ` Cyrill Gorcunov
  0 siblings, 2 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-01 22:49 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Kirill A. Shutemov, Vasiliy Kulikov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Thu, 1 Sep 2011 14:46:34 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Wed, Aug 31, 2011 at 03:10:23PM -0700, Andrew Morton wrote:
> > 
> > This function would benefit from a code comment.
> > 
> > Given that it's pretty generic (indeed there might be open-coded code
> > which already does this elsewhere), perhaps it should be in mm/mmap.c
> > as a kernel-wide utility function.  That will add a little overhead to
> > CONFIG_PROC_FS=n builds, which doesn't seem terribly important.
> > 
> 
> Andrew, here is an attempt to address concerns. Please review.
> Complains are welcome as always!

Changelog still doesn't explain why /proc/pid/maps is unfixably unsuitable.

>
> ...
>
> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	const struct cred *cred;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	bool exact_vma_exists = false;
>

Extraneous newline there.

> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
> +		down_read(&mm->mmap_sem);
> +		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
> +		up_read(&mm->mmap_sem);
> +	}
> +
> +	mmput(mm);
> +
> +	if (exact_vma_exists) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;
> +	}
> +out:
> +	d_drop(dentry);
> +	return 0;
> +}
> +
>
> ...
>
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		struct map_files_info *info = NULL;
> +		unsigned long nr_files, used, pos, i;
> +		unsigned long mem_size = 0;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		used = 0;
> +
> +		/*
> +		 * We need two passes here:
> +		 *
> +		 *  1) Collect vmas of mapped files with mmap_sem taken
> +		 *  2) Release mmap_sem and instantiate entries
> +		 *
> +		 * otherwise we get lockdep complained, since filldir()
> +		 * routine might require mmap_sem taken in might_fault().
> +		 */
> +
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +
> +		if (nr_files) {
> +			mem_size = nr_files * sizeof(*info);
> +			if (mem_size <= KMALLOC_MAX_SIZE)
> +				info = kmalloc(mem_size, GFP_KERNEL);
> +			else
> +				info = vmalloc(mem_size);

This still sucks :(

A KMALLOC_MAX_SIZE allocation is huuuuuuuuuuuuge!  I don't know how big
it is nowadays, but over 100 kbytes.  This will frequently send page
reclaim on a berzerk rampage freeing *thousands* of pages (or
relocating pages) until it manages to generate 20 or 30 physically
contiguous free pages.

Also, vmalloc sucks.  The more often we perform vmallocs (with a mix of
differently-sized ones), the more internally fragmented the vmalloc
arena will become.  With some workloads we'll run out of
sufficiently-large contiguous free spaces and things will start
failing.  This doesn't happen often.  Yet.  But the more vmalloc()
callsites we add, the more likely and the more frequent it becomes.  So
vmalloc is something we should only use as a last resort.

The most robust implementation here is to allocate a large number of
small objects - one per vma.  A list would be a suitable way of
managing them.

But do we *really* need to do it in two passes?  Avoiding the temporary
storage would involve doing more work under mmap_sem, and a put_filp()
under mmap_sem might be problematic.


^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 22:49               ` Andrew Morton
@ 2011-09-01 23:04                 ` Tejun Heo
  2011-09-02  5:54                   ` Cyrill Gorcunov
  2011-09-02  5:53                 ` Cyrill Gorcunov
  1 sibling, 1 reply; 82+ messages in thread
From: Tejun Heo @ 2011-09-01 23:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Kirill A. Shutemov, Vasiliy Kulikov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Thu, Sep 01, 2011 at 03:49:46PM -0700, Andrew Morton wrote:
> The most robust implementation here is to allocate a large number of
> small objects - one per vma.  A list would be a suitable way of
> managing them.

Or just use flex_array?

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01  7:58             ` Pavel Emelyanov
  2011-09-01 11:50               ` Tejun Heo
@ 2011-09-02  0:09               ` Matt Helsley
  1 sibling, 0 replies; 82+ messages in thread
From: Matt Helsley @ 2011-09-02  0:09 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andrew Morton, Cyrill Gorcunov, Kirill A. Shutemov,
	Vasiliy Kulikov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Serge Hallyn, Tejun Heo, Alexey Dobriyan,
	Al Viro

On Thu, Sep 01, 2011 at 11:58:29AM +0400, Pavel Emelyanov wrote:
> > What additional kernel patches are required to bring that effort to a
> > usable state and where are those patches?
> 
> * The one you've already accepted with ->statfs for pipefs.
> * PTRACE_SEIZE set from Tejun (RFC was sent some time earlier)
> * CLONE_USEPID flag for the clone() syscall (Cyrill will re-send a bit later)

Is that the last CLONE_ flag? Does it fully support nesting of pid namespaces?

What happened to the idea of using the existing eclone patches instead?

Cheers,
	-Matt Helsley

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-08-31  7:58 ` [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6 Cyrill Gorcunov
  2011-08-31  9:06   ` Vasiliy Kulikov
@ 2011-09-02  1:54   ` Nicholas Miell
  2011-09-02  1:58     ` Tejun Heo
  1 sibling, 1 reply; 82+ messages in thread
From: Nicholas Miell @ 2011-09-02  1:54 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Tejun Heo, Vasiliy Kulikov, Kirill A. Shutemov, Alexey Dobriyan,
	Al Viro, Andrew Morton, Pavel Emelyanov

On 08/31/2011 12:58 AM, Cyrill Gorcunov wrote:
> From: Pavel Emelyanov <xemul@parallels.com>
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> the target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This doesn't seem to fully export all the information passed to mmap(2)
-- it encompasses the addr, size, fd and (assuming the generated
symlink's permissions are accurate) prot parameters, but does nothing to
export the flags or offset parameters.


^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-02  1:54   ` Nicholas Miell
@ 2011-09-02  1:58     ` Tejun Heo
  2011-09-02  2:04       ` Nicholas Miell
  0 siblings, 1 reply; 82+ messages in thread
From: Tejun Heo @ 2011-09-02  1:58 UTC (permalink / raw)
  To: Nicholas Miell
  Cc: Cyrill Gorcunov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Vasiliy Kulikov, Kirill A. Shutemov,
	Alexey Dobriyan, Al Viro, Andrew Morton, Pavel Emelyanov

On Thu, Sep 01, 2011 at 06:54:32PM -0700, Nicholas Miell wrote:
> This doesn't seem to fully export all the information passed to mmap(2)
> -- it encompasses the addr, size, fd and (assuming the generated
> symlink's permissions are accurate) prot parameters, but does nothing to
> export the flags or offset parameters.

It looks like,

/proc/*/fd : fdinfo ~= map_files : maps

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-02  1:58     ` Tejun Heo
@ 2011-09-02  2:04       ` Nicholas Miell
  2011-09-02  2:29         ` Tejun Heo
  0 siblings, 1 reply; 82+ messages in thread
From: Nicholas Miell @ 2011-09-02  2:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Cyrill Gorcunov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Vasiliy Kulikov, Kirill A. Shutemov,
	Alexey Dobriyan, Al Viro, Andrew Morton, Pavel Emelyanov

On 09/01/2011 06:58 PM, Tejun Heo wrote:
> On Thu, Sep 01, 2011 at 06:54:32PM -0700, Nicholas Miell wrote:
>> This doesn't seem to fully export all the information passed to mmap(2)
>> -- it encompasses the addr, size, fd and (assuming the generated
>> symlink's permissions are accurate) prot parameters, but does nothing to
>> export the flags or offset parameters.
> 
> It looks like,
> 
> /proc/*/fd : fdinfo ~= map_files : maps
> 

So are the filenames in /proc/$PID/map_files/ guaranteed to be identical
to the first field (first two fields? the part before the first space)
in /proc/$PID/maps?

When I open a file in /proc/$PID/map_files/, is offset 0 the start of
the file on disk or is it the first byte of the mapping?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-02  2:04       ` Nicholas Miell
@ 2011-09-02  2:29         ` Tejun Heo
  2011-09-02  8:07           ` Kirill A. Shutemov
  0 siblings, 1 reply; 82+ messages in thread
From: Tejun Heo @ 2011-09-02  2:29 UTC (permalink / raw)
  To: Nicholas Miell
  Cc: Cyrill Gorcunov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Vasiliy Kulikov, Kirill A. Shutemov,
	Alexey Dobriyan, Al Viro, Andrew Morton, Pavel Emelyanov

Hello,

On Thu, Sep 01, 2011 at 07:04:42PM -0700, Nicholas Miell wrote:
> So are the filenames in /proc/$PID/map_files/ guaranteed to be identical
> to the first field (first two fields? the part before the first space)
> in /proc/$PID/maps?

Yeap, naturally.

> When I open a file in /proc/$PID/map_files/, is offset 0 the start of
> the file on disk or is it the first byte of the mapping?

It shows the same backing file so of course uses the same offset as
the file on disk.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 22:49               ` Andrew Morton
  2011-09-01 23:04                 ` Tejun Heo
@ 2011-09-02  5:53                 ` Cyrill Gorcunov
  1 sibling, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-02  5:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Kirill A. Shutemov, Vasiliy Kulikov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Thu, Sep 01, 2011 at 03:49:46PM -0700, Andrew Morton wrote:
> On Thu, 1 Sep 2011 14:46:34 +0400
> Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > On Wed, Aug 31, 2011 at 03:10:23PM -0700, Andrew Morton wrote:
> > > 
> > > This function would benefit from a code comment.
> > > 
> > > Given that it's pretty generic (indeed there might be open-coded code
> > > which already does this elsewhere), perhaps it should be in mm/mmap.c
> > > as a kernel-wide utility function.  That will add a little overhead to
> > > CONFIG_PROC_FS=n builds, which doesn't seem terribly important.
> > > 
> > 
> > Andrew, here is an attempt to address concerns. Please review.
> > Complains are welcome as always!
> 
> Changelog still doesn't explain why /proc/pid/maps is unfixably unsuitable.
> 

Will update.

> >
> > ...
> >
> > +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> > +{
> > +	unsigned long vm_start, vm_end;
> > +	struct task_struct *task;
> > +	const struct cred *cred;
> > +	struct mm_struct *mm;
> > +	struct inode *inode;
> > +
> > +	bool exact_vma_exists = false;
> >
> 
> Extraneous newline there.

ok, thanks

...
> > +
> > +		/*
> > +		 * We need two passes here:
> > +		 *
> > +		 *  1) Collect vmas of mapped files with mmap_sem taken
> > +		 *  2) Release mmap_sem and instantiate entries
> > +		 *
> > +		 * otherwise we get lockdep complained, since filldir()
> > +		 * routine might require mmap_sem taken in might_fault().
> > +		 */
> > +
> > +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > +			if (vma->vm_file)
> > +				nr_files++;
> > +		}
> > +
> > +		if (nr_files) {
> > +			mem_size = nr_files * sizeof(*info);
> > +			if (mem_size <= KMALLOC_MAX_SIZE)
> > +				info = kmalloc(mem_size, GFP_KERNEL);
> > +			else
> > +				info = vmalloc(mem_size);
> 
> This still sucks :(
> 
> A KMALLOC_MAX_SIZE allocation is huuuuuuuuuuuuge!  I don't know how big

kmalloc_sizes.h said it could be quite a big :(

> it is nowadays, but over 100 kbytes.  This will frequently send page
> reclaim on a berzerk rampage freeing *thousands* of pages (or
> relocating pages) until it manages to generate 20 or 30 physically
> contiguous free pages.
> 
> Also, vmalloc sucks.  The more often we perform vmallocs (with a mix of
> differently-sized ones), the more internally fragmented the vmalloc
> arena will become.  With some workloads we'll run out of
> sufficiently-large contiguous free spaces and things will start
> failing.  This doesn't happen often.  Yet.  But the more vmalloc()
> callsites we add, the more likely and the more frequent it becomes.  So
> vmalloc is something we should only use as a last resort.
> 
> The most robust implementation here is to allocate a large number of
> small objects - one per vma.  A list would be a suitable way of
> managing them.

Actually I though about slab-cache with 64 bytes per object, but it
requires more code to push here, that is why I stopped. Still this
doesn't justify me. So yes, bad idea. Thanks!

> 
> But do we *really* need to do it in two passes?  Avoiding the temporary
> storage would involve doing more work under mmap_sem, and a put_filp()
> under mmap_sem might be problematic.
> 

In real, for the particular case where we use this /proc/pid/map_files
it could be done in one pass without mmap_sem taken (since we use it
when task is frozen and we know noone is poking vmas) but the problem
is that people might start using it not for c/r but in various different
cases when task is pretty running and I wanna give them more-less robust
result in ls -l over this directory.

Andrew, I'll think some more, probably I'll find a way to drop this two
passes requirement.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 23:04                 ` Tejun Heo
@ 2011-09-02  5:54                   ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-02  5:54 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, Kirill A. Shutemov, Vasiliy Kulikov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Fri, Sep 02, 2011 at 01:04:40AM +0200, Tejun Heo wrote:
> On Thu, Sep 01, 2011 at 03:49:46PM -0700, Andrew Morton wrote:
> > The most robust implementation here is to allocate a large number of
> > small objects - one per vma.  A list would be a suitable way of
> > managing them.
> 
> Or just use flex_array?
> 

Hey, looks interesting! I'll check. Thanks for the point!

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-02  2:29         ` Tejun Heo
@ 2011-09-02  8:07           ` Kirill A. Shutemov
  0 siblings, 0 replies; 82+ messages in thread
From: Kirill A. Shutemov @ 2011-09-02  8:07 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Nicholas Miell, Cyrill Gorcunov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Vasiliy Kulikov, Alexey Dobriyan,
	Al Viro, Andrew Morton, Pavel Emelyanov

On Fri, Sep 02, 2011 at 11:29:57AM +0900, Tejun Heo wrote:
> Hello,
> 
> On Thu, Sep 01, 2011 at 07:04:42PM -0700, Nicholas Miell wrote:
> > So are the filenames in /proc/$PID/map_files/ guaranteed to be identical
> > to the first field (first two fields? the part before the first space)
> > in /proc/$PID/maps?
> 
> Yeap, naturally.

Except zero padding. %08lx in maps and %lx in map_files.

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01  8:05             ` Cyrill Gorcunov
@ 2011-09-02 16:37                 ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-02 16:37 UTC (permalink / raw)
  To: Cyrill Gorcunov, Andrew Morton
  Cc: Kirill A. Shutemov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, kernel-hardening, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

Hi,

On Thu, Sep 01, 2011 at 12:05 +0400, Cyrill Gorcunov wrote:
> ...
> > > +/*
> > > + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> > > + * /proc/$pid/fd seems to have share the code, so need to be
> > > + * unified and code duplication eliminated!
> > 
> > Why not do this now?
> 
> There are a couple of reasons. Yesterday I was talking to
> Vasiliy Kulikov about this snippet, so he seems about to send
> you patches related to /proc/$pid/fd update, and after those
> patches will be merged we are to drop code duplication.
> Vasiliy, what the status of the update?

It looks like protecting directories with sensible contents is a nasty
thing.  The problem here is that if the dentry is present in the cache,
->lookup() is not called at all and the permissions can be checked in
fop/dop/iop specific handler (getattr(), readlink(), etc.).  However, it
would be much simplier to hook ->lookup() only.  Otherwise, we have to
define procfs handlers for all operations, which don't call
->d_revalidate().

Is it possible to disable caching dentry for specific files?  It is not
performace critical thing in fd and map_files and it would much simplify
the task.  Creating handlers for all these op handler bloats procfs.

Also I'm not sure what other handlers might reveal dentry presence.
Besides ->getattr() I could find only one thing - ->link() (Cyrill,
AFAICS ->setattr() doesn't reveal files' presence).  Someone more
familiar with vfs than me - please, help to identify all infoleak
sources!


Thank you,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-02 16:37                 ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-02 16:37 UTC (permalink / raw)
  To: Cyrill Gorcunov, Andrew Morton
  Cc: Kirill A. Shutemov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, kernel-hardening, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

Hi,

On Thu, Sep 01, 2011 at 12:05 +0400, Cyrill Gorcunov wrote:
> ...
> > > +/*
> > > + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> > > + * /proc/$pid/fd seems to have share the code, so need to be
> > > + * unified and code duplication eliminated!
> > 
> > Why not do this now?
> 
> There are a couple of reasons. Yesterday I was talking to
> Vasiliy Kulikov about this snippet, so he seems about to send
> you patches related to /proc/$pid/fd update, and after those
> patches will be merged we are to drop code duplication.
> Vasiliy, what the status of the update?

It looks like protecting directories with sensible contents is a nasty
thing.  The problem here is that if the dentry is present in the cache,
->lookup() is not called at all and the permissions can be checked in
fop/dop/iop specific handler (getattr(), readlink(), etc.).  However, it
would be much simplier to hook ->lookup() only.  Otherwise, we have to
define procfs handlers for all operations, which don't call
->d_revalidate().

Is it possible to disable caching dentry for specific files?  It is not
performace critical thing in fd and map_files and it would much simplify
the task.  Creating handlers for all these op handler bloats procfs.

Also I'm not sure what other handlers might reveal dentry presence.
Besides ->getattr() I could find only one thing - ->link() (Cyrill,
AFAICS ->setattr() doesn't reveal files' presence).  Someone more
familiar with vfs than me - please, help to identify all infoleak
sources!


Thank you,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-01 17:13                   ` Tejun Heo
@ 2011-09-02 19:15                     ` Matt Helsley
  0 siblings, 0 replies; 82+ messages in thread
From: Matt Helsley @ 2011-09-02 19:15 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pavel Emelyanov, Daniel Lezcano, linux-kernel, James Bottomley,
	Cyrill Gorcunov, linux-fsdevel, Nathan Lynch, Alexey Dobriyan,
	containers, Andrew Morton, Vasiliy Kulikov, Al Viro

On Fri, Sep 02, 2011 at 02:13:55AM +0900, Tejun Heo wrote:
> Hello,
> 
> On Thu, Sep 01, 2011 at 04:13:54PM +0400, Pavel Emelyanov wrote:
> > Because with the handler restore process looks very natural and simple - each
> > task does the following steps
> > 
> > 1. restore task resources (open files, set IDs, restore connections, wire back timers, etc.)
> > 2. call execve() to jump into new memory+registers context which is
> >    a. unmap all the user memory
> >    b. map required mappings
> >    c. populate them with data
> >    d. restore registers
> >    e. restore IP
> 
> But what about multiple threads?  exec is already scary enough as it
> is and I don't think it would be wise to overload it for this.
> 
> I don't really think binfmt handler would be able to achieve
> completeness without ending up with full de-serializer in kernel.

Yeah, I think you'd have to remove the de_thread()-based assumptions 
in exec to avoid this. And then there's the can of worms opened by
non-pthread threads where files, signals, etc. may or may not be shared..

> There are a lot of states which already have API to manipulate from
> the userland thread itself and they all will need to be replicated in
> the binfmt handler.  It really has to be full de-serializer if it

Re-creating all possible states from userspace is not necessarily going
to be any prettier. I'd really like to see SEIZE used to restart threaded
tasks before I'll be convinced it's any more or less pretty than a binfmt
handler.

> wants to function like you described.
> 
> And if we're gonna have interventions anyway, I can't see much point
> in implementing something which seems simpler - it's not gonna be
> actually simpler.

You may have lost me here -- I have no idea whether this is a summary
or some new point you're trying to make.

Cheers,
	-Matt Helsley

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-02 16:37                 ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-09-05 18:53                   ` Vasiliy Kulikov
  -1 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-05 18:53 UTC (permalink / raw)
  To: Cyrill Gorcunov, Andrew Morton
  Cc: Kirill A. Shutemov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, kernel-hardening, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Fri, Sep 02, 2011 at 20:37 +0400, Vasiliy Kulikov wrote:
> On Thu, Sep 01, 2011 at 12:05 +0400, Cyrill Gorcunov wrote:
> > ...
> > > > +/*
> > > > + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> > > > + * /proc/$pid/fd seems to have share the code, so need to be
> > > > + * unified and code duplication eliminated!
> > > 
> > > Why not do this now?
> > 
> > There are a couple of reasons. Yesterday I was talking to
> > Vasiliy Kulikov about this snippet, so he seems about to send
> > you patches related to /proc/$pid/fd update, and after those
> > patches will be merged we are to drop code duplication.
> > Vasiliy, what the status of the update?
> 
> It looks like protecting directories with sensible contents is a nasty
> thing.  The problem here is that if the dentry is present in the cache,
> ->lookup() is not called at all and the permissions can be checked in
> fop/dop/iop specific handler (getattr(), readlink(), etc.).  However, it
> would be much simplier to hook ->lookup() only.  Otherwise, we have to
> define procfs handlers for all operations, which don't call
> ->d_revalidate().
> 
> Is it possible to disable caching dentry for specific files?  It is not
> performace critical thing in fd and map_files and it would much simplify
> the task.  Creating handlers for all these op handler bloats procfs.

Looks like the following patch solves the problem.  Tested on stat() and
link().

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d44c701..219588b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1665,46 +1665,12 @@ out:
 	return error;
 }
 
-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		struct kstat *stat)
-{
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
-	int rc;
-
-	if (task == NULL)
-		return -ESRCH;
-
-	rc = -EACCES;
-	if (lock_trace(task))
-		goto out_task;
-
-	generic_fillattr(inode, stat);
-	unlock_trace(task);
-	rc = 0;
-out_task:
-	put_task_struct(task);
-	return rc;
-}
-
 static const struct inode_operations proc_pid_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
 	.follow_link	= proc_pid_follow_link,
 	.setattr	= proc_setattr,
 };
 
-static const struct inode_operations proc_fdinfo_link_inode_operations = {
-	.setattr	= proc_setattr,
-	.getattr	= proc_pid_fd_link_getattr,
-};
-
-static const struct inode_operations proc_fd_link_inode_operations = {
-	.readlink	= proc_pid_readlink,
-	.follow_link	= proc_pid_follow_link,
-	.setattr	= proc_setattr,
-	.getattr	= proc_pid_fd_link_getattr,
-};
-
 
 /* building an inode */
 
@@ -2044,9 +2010,18 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
+static int pid_no_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	d_drop(dentry);
+	return 0;
+}
+
 static const struct dentry_operations tid_fd_dentry_operations =
 {
-	.d_revalidate	= tid_fd_revalidate,
+	.d_revalidate	= pid_no_revalidate,
 	.d_delete	= pid_delete_dentry,
 };
 
@@ -2085,7 +2060,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
 
-	inode->i_op = &proc_fd_link_inode_operations;
+	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
 	ei->op.proc_get_link = proc_fd_link;
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2267,7 +2242,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 	ei->fd = fd;
 	inode->i_mode = S_IFREG | S_IRUSR;
 	inode->i_fop = &proc_fdinfo_file_operations;
-	inode->i_op = &proc_fdinfo_link_inode_operations;
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-05 18:53                   ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-05 18:53 UTC (permalink / raw)
  To: Cyrill Gorcunov, Andrew Morton
  Cc: Kirill A. Shutemov, containers, linux-kernel, linux-fsdevel,
	Nathan Lynch, kernel-hardening, Oren Laadan, Daniel Lezcano,
	Glauber Costa, James Bottomley, Tejun Heo, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Fri, Sep 02, 2011 at 20:37 +0400, Vasiliy Kulikov wrote:
> On Thu, Sep 01, 2011 at 12:05 +0400, Cyrill Gorcunov wrote:
> > ...
> > > > +/*
> > > > + * NOTE: The getattr/setattr for both /proc/$pid/map_files and
> > > > + * /proc/$pid/fd seems to have share the code, so need to be
> > > > + * unified and code duplication eliminated!
> > > 
> > > Why not do this now?
> > 
> > There are a couple of reasons. Yesterday I was talking to
> > Vasiliy Kulikov about this snippet, so he seems about to send
> > you patches related to /proc/$pid/fd update, and after those
> > patches will be merged we are to drop code duplication.
> > Vasiliy, what the status of the update?
> 
> It looks like protecting directories with sensible contents is a nasty
> thing.  The problem here is that if the dentry is present in the cache,
> ->lookup() is not called at all and the permissions can be checked in
> fop/dop/iop specific handler (getattr(), readlink(), etc.).  However, it
> would be much simplier to hook ->lookup() only.  Otherwise, we have to
> define procfs handlers for all operations, which don't call
> ->d_revalidate().
> 
> Is it possible to disable caching dentry for specific files?  It is not
> performace critical thing in fd and map_files and it would much simplify
> the task.  Creating handlers for all these op handler bloats procfs.

Looks like the following patch solves the problem.  Tested on stat() and
link().

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d44c701..219588b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1665,46 +1665,12 @@ out:
 	return error;
 }
 
-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		struct kstat *stat)
-{
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *task = get_proc_task(inode);
-	int rc;
-
-	if (task == NULL)
-		return -ESRCH;
-
-	rc = -EACCES;
-	if (lock_trace(task))
-		goto out_task;
-
-	generic_fillattr(inode, stat);
-	unlock_trace(task);
-	rc = 0;
-out_task:
-	put_task_struct(task);
-	return rc;
-}
-
 static const struct inode_operations proc_pid_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
 	.follow_link	= proc_pid_follow_link,
 	.setattr	= proc_setattr,
 };
 
-static const struct inode_operations proc_fdinfo_link_inode_operations = {
-	.setattr	= proc_setattr,
-	.getattr	= proc_pid_fd_link_getattr,
-};
-
-static const struct inode_operations proc_fd_link_inode_operations = {
-	.readlink	= proc_pid_readlink,
-	.follow_link	= proc_pid_follow_link,
-	.setattr	= proc_setattr,
-	.getattr	= proc_pid_fd_link_getattr,
-};
-
 
 /* building an inode */
 
@@ -2044,9 +2010,18 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
+static int pid_no_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	d_drop(dentry);
+	return 0;
+}
+
 static const struct dentry_operations tid_fd_dentry_operations =
 {
-	.d_revalidate	= tid_fd_revalidate,
+	.d_revalidate	= pid_no_revalidate,
 	.d_delete	= pid_delete_dentry,
 };
 
@@ -2085,7 +2060,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
 	spin_unlock(&files->file_lock);
 	put_files_struct(files);
 
-	inode->i_op = &proc_fd_link_inode_operations;
+	inode->i_op = &proc_pid_link_inode_operations;
 	inode->i_size = 64;
 	ei->op.proc_get_link = proc_fd_link;
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2267,7 +2242,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
 	ei->fd = fd;
 	inode->i_mode = S_IFREG | S_IRUSR;
 	inode->i_fop = &proc_fdinfo_file_operations;
-	inode->i_op = &proc_fdinfo_link_inode_operations;
 	d_set_d_op(dentry, &tid_fd_dentry_operations);
 	d_add(dentry, inode);
 	/* Close the race of the process dying before we return the dentry */
-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-05 18:53                   ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-09-05 19:20                     ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-05 19:20 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Mon, Sep 05, 2011 at 10:53:58PM +0400, Vasiliy Kulikov wrote:
...
>  
> +static int pid_no_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	d_drop(dentry);
> +	return 0;
> +}
> +

Thanks Vasiliy! So every lookup will cause dcache to drop previous cached
entry and alloc and hash new one instead, pretty dramatic, espec in case
of huge number of files mapped ;) Still since it's not time critical operation
(at least for now) I tend to agree.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-05 19:20                     ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-05 19:20 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Mon, Sep 05, 2011 at 10:53:58PM +0400, Vasiliy Kulikov wrote:
...
>  
> +static int pid_no_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	d_drop(dentry);
> +	return 0;
> +}
> +

Thanks Vasiliy! So every lookup will cause dcache to drop previous cached
entry and alloc and hash new one instead, pretty dramatic, espec in case
of huge number of files mapped ;) Still since it's not time critical operation
(at least for now) I tend to agree.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-05 19:20                     ` [kernel-hardening] " Cyrill Gorcunov
@ 2011-09-05 19:49                       ` Vasiliy Kulikov
  -1 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-05 19:49 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Mon, Sep 05, 2011 at 23:20 +0400, Cyrill Gorcunov wrote:
> On Mon, Sep 05, 2011 at 10:53:58PM +0400, Vasiliy Kulikov wrote:
> ...
> >  
> > +static int pid_no_revalidate(struct dentry *dentry, struct nameidata *nd)
> > +{
> > +	if (nd && nd->flags & LOOKUP_RCU)
> > +		return -ECHILD;
> > +
> > +	d_drop(dentry);
> > +	return 0;
> > +}
> > +
> 
> Thanks Vasiliy! So every lookup will cause dcache to drop previous cached
> entry and alloc and hash new one instead, pretty dramatic, espec in case
> of huge number of files mapped ;) Still since it's not time critical operation
> (at least for now) I tend to agree.

Actually, it can be speed up by introducing the same ptrace check.  If
ptrace check fails, then just drop the dentry, otherwise continue to use
it.  Then each revalidate would trigger ptrace check instead of full
drop-lookup-alloc cycle.  If one process actively looks into
map_files/ or fd/, it will not become significantly slower.  However, it
will trigger 2 capable() fail alerts in ptrace_may_access() instead of
one :)


But I still see one very nasty issue - one may trigger this ptrace check,
trigger d_drop() and then look at /proc/slabinfo at "dentry" row.  If
the number has changed, then the interested dentry existed before the
revalidate call.  This infoleak is tricky to fix without any race.

Probably it's time to close /proc/slabinfo infoleak?


Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-05 19:49                       ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-05 19:49 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Mon, Sep 05, 2011 at 23:20 +0400, Cyrill Gorcunov wrote:
> On Mon, Sep 05, 2011 at 10:53:58PM +0400, Vasiliy Kulikov wrote:
> ...
> >  
> > +static int pid_no_revalidate(struct dentry *dentry, struct nameidata *nd)
> > +{
> > +	if (nd && nd->flags & LOOKUP_RCU)
> > +		return -ECHILD;
> > +
> > +	d_drop(dentry);
> > +	return 0;
> > +}
> > +
> 
> Thanks Vasiliy! So every lookup will cause dcache to drop previous cached
> entry and alloc and hash new one instead, pretty dramatic, espec in case
> of huge number of files mapped ;) Still since it's not time critical operation
> (at least for now) I tend to agree.

Actually, it can be speed up by introducing the same ptrace check.  If
ptrace check fails, then just drop the dentry, otherwise continue to use
it.  Then each revalidate would trigger ptrace check instead of full
drop-lookup-alloc cycle.  If one process actively looks into
map_files/ or fd/, it will not become significantly slower.  However, it
will trigger 2 capable() fail alerts in ptrace_may_access() instead of
one :)


But I still see one very nasty issue - one may trigger this ptrace check,
trigger d_drop() and then look at /proc/slabinfo at "dentry" row.  If
the number has changed, then the interested dentry existed before the
revalidate call.  This infoleak is tricky to fix without any race.

Probably it's time to close /proc/slabinfo infoleak?


Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-05 19:49                       ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-09-05 20:36                         ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-05 20:36 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Mon, Sep 05, 2011 at 11:49:08PM +0400, Vasiliy Kulikov wrote:
...
> 
> Actually, it can be speed up by introducing the same ptrace check.  If
> ptrace check fails, then just drop the dentry, otherwise continue to use
> it.  Then each revalidate would trigger ptrace check instead of full
> drop-lookup-alloc cycle.  If one process actively looks into
> map_files/ or fd/, it will not become significantly slower.  However, it
> will trigger 2 capable() fail alerts in ptrace_may_access() instead of
> one :)

Hmm, at least it's better than trashing dcache I think.

> 
> But I still see one very nasty issue - one may trigger this ptrace check,
> trigger d_drop() and then look at /proc/slabinfo at "dentry" row.  If
> the number has changed, then the interested dentry existed before the
> revalidate call.  This infoleak is tricky to fix without any race.
> 
> Probably it's time to close /proc/slabinfo infoleak? 
> 

Actually I miss to see how exactly this infoleak can be used by attacker
or whoever. So, Vasiliy, what the security issue there?

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-05 20:36                         ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-05 20:36 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Mon, Sep 05, 2011 at 11:49:08PM +0400, Vasiliy Kulikov wrote:
...
> 
> Actually, it can be speed up by introducing the same ptrace check.  If
> ptrace check fails, then just drop the dentry, otherwise continue to use
> it.  Then each revalidate would trigger ptrace check instead of full
> drop-lookup-alloc cycle.  If one process actively looks into
> map_files/ or fd/, it will not become significantly slower.  However, it
> will trigger 2 capable() fail alerts in ptrace_may_access() instead of
> one :)

Hmm, at least it's better than trashing dcache I think.

> 
> But I still see one very nasty issue - one may trigger this ptrace check,
> trigger d_drop() and then look at /proc/slabinfo at "dentry" row.  If
> the number has changed, then the interested dentry existed before the
> revalidate call.  This infoleak is tricky to fix without any race.
> 
> Probably it's time to close /proc/slabinfo infoleak? 
> 

Actually I miss to see how exactly this infoleak can be used by attacker
or whoever. So, Vasiliy, what the security issue there?

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-05 20:36                         ` [kernel-hardening] " Cyrill Gorcunov
@ 2011-09-06 10:15                           ` Vasiliy Kulikov
  -1 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-06 10:15 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Tue, Sep 06, 2011 at 00:36 +0400, Cyrill Gorcunov wrote:
> > But I still see one very nasty issue - one may trigger this ptrace check,
> > trigger d_drop() and then look at /proc/slabinfo at "dentry" row.  If
> > the number has changed, then the interested dentry existed before the
> > revalidate call.  This infoleak is tricky to fix without any race.
> > 
> > Probably it's time to close /proc/slabinfo infoleak? 
> > 
> 
> Actually I miss to see how exactly this infoleak can be used by attacker
> or whoever. So, Vasiliy, what the security issue there?

The security model of procfs is: /proc/PID/fd/ is available to users
that may ptrace PID only.  Particularly, the number of opened file
descriptors is a private information.  If other task that may not ptrace
PID is able to get this information, this is an issue.  Keeping opened
file descriptor of /proc/PID/fd/ and exec'ing some setxid binary as PID
might lead to the infoleak.  It can be used in certain rare cases when
the knowledge of whether specific fd is opened/closed gains some
important information, e.g. whether some security check has
failed/succeeded (which is indirectly signaled by the kept fd).  As for
map_files/ it may reveal ASLR offsets (but only some bits, not all of
them, I guess).

Without dropping denries it can be identified by calling stat() or
link() against dentries existing in the cache.  In more details:

1) an attacker has a task with pid=PID with many opened fds.

2) Other task (PID2) opens /proc/PID/fd/ and fills the dentry cache.
Now dcache contains procfs entries for file descriptors of PID.

3) PID execve's setxid binary.  (From this point PID2 should not get
_any_ information about /proc/PID/fd/, but this rule is violated in (4).)

4) PID2 does something to learn whether any fd of PID is opened/closed.

  a) before "proc: fix races against execve() of /proc/PID/fd**" patch
     PID2 could simply do getdents() against kept file descriptor of
     /proc/PID/fd and get the list of opened fds.

  b) Without dentry dropping on each access PID2 could use link(2) to
     read /proc/PID/fd/* dentries from dcache.  As they are in the
     dcache since (2), ptrace check from ->lookup() is not applied.

  c) If dentry is lazily dropped on each access attempt (or each illegal
     access) then PID2 can:

     i) read dentry line of /proc/slabinfo
     ii) call link(2) against /proc/PID/fd, which invalidates the
         specific dentry
     iii) re-read dentry line of /proc/slabinfo.  If it has decreased by
         one, the dentry existed before (ii).


Is it possible to either allocate already dropped dentry or to force
->lookup() without invalidating dentry?  The latter would potentially
pollute the dchache, though.


Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-06 10:15                           ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-06 10:15 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Andrew Morton, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Tejun Heo,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Tue, Sep 06, 2011 at 00:36 +0400, Cyrill Gorcunov wrote:
> > But I still see one very nasty issue - one may trigger this ptrace check,
> > trigger d_drop() and then look at /proc/slabinfo at "dentry" row.  If
> > the number has changed, then the interested dentry existed before the
> > revalidate call.  This infoleak is tricky to fix without any race.
> > 
> > Probably it's time to close /proc/slabinfo infoleak? 
> > 
> 
> Actually I miss to see how exactly this infoleak can be used by attacker
> or whoever. So, Vasiliy, what the security issue there?

The security model of procfs is: /proc/PID/fd/ is available to users
that may ptrace PID only.  Particularly, the number of opened file
descriptors is a private information.  If other task that may not ptrace
PID is able to get this information, this is an issue.  Keeping opened
file descriptor of /proc/PID/fd/ and exec'ing some setxid binary as PID
might lead to the infoleak.  It can be used in certain rare cases when
the knowledge of whether specific fd is opened/closed gains some
important information, e.g. whether some security check has
failed/succeeded (which is indirectly signaled by the kept fd).  As for
map_files/ it may reveal ASLR offsets (but only some bits, not all of
them, I guess).

Without dropping denries it can be identified by calling stat() or
link() against dentries existing in the cache.  In more details:

1) an attacker has a task with pid=PID with many opened fds.

2) Other task (PID2) opens /proc/PID/fd/ and fills the dentry cache.
Now dcache contains procfs entries for file descriptors of PID.

3) PID execve's setxid binary.  (From this point PID2 should not get
_any_ information about /proc/PID/fd/, but this rule is violated in (4).)

4) PID2 does something to learn whether any fd of PID is opened/closed.

  a) before "proc: fix races against execve() of /proc/PID/fd**" patch
     PID2 could simply do getdents() against kept file descriptor of
     /proc/PID/fd and get the list of opened fds.

  b) Without dentry dropping on each access PID2 could use link(2) to
     read /proc/PID/fd/* dentries from dcache.  As they are in the
     dcache since (2), ptrace check from ->lookup() is not applied.

  c) If dentry is lazily dropped on each access attempt (or each illegal
     access) then PID2 can:

     i) read dentry line of /proc/slabinfo
     ii) call link(2) against /proc/PID/fd, which invalidates the
         specific dentry
     iii) re-read dentry line of /proc/slabinfo.  If it has decreased by
         one, the dentry existed before (ii).


Is it possible to either allocate already dropped dentry or to force
->lookup() without invalidating dentry?  The latter would potentially
pollute the dchache, though.


Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-06 10:15                           ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-09-06 16:51                             ` Tejun Heo
  -1 siblings, 0 replies; 82+ messages in thread
From: Tejun Heo @ 2011-09-06 16:51 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hello, Vasiliy.

On Tue, Sep 06, 2011 at 02:15:18PM +0400, Vasiliy Kulikov wrote:
>   c) If dentry is lazily dropped on each access attempt (or each illegal
>      access) then PID2 can:
> 
>      i) read dentry line of /proc/slabinfo
>      ii) call link(2) against /proc/PID/fd, which invalidates the
>          specific dentry
>      iii) re-read dentry line of /proc/slabinfo.  If it has decreased by
>          one, the dentry existed before (ii).

If we really worry about this, probably the right thing to do is
hiding slabinfo from mortal UIDs instead of worrying about what
exactly are freed or not from each user.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-06 16:51                             ` Tejun Heo
  0 siblings, 0 replies; 82+ messages in thread
From: Tejun Heo @ 2011-09-06 16:51 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hello, Vasiliy.

On Tue, Sep 06, 2011 at 02:15:18PM +0400, Vasiliy Kulikov wrote:
>   c) If dentry is lazily dropped on each access attempt (or each illegal
>      access) then PID2 can:
> 
>      i) read dentry line of /proc/slabinfo
>      ii) call link(2) against /proc/PID/fd, which invalidates the
>          specific dentry
>      iii) re-read dentry line of /proc/slabinfo.  If it has decreased by
>          one, the dentry existed before (ii).

If we really worry about this, probably the right thing to do is
hiding slabinfo from mortal UIDs instead of worrying about what
exactly are freed or not from each user.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-06 16:51                             ` [kernel-hardening] " Tejun Heo
@ 2011-09-06 17:29                               ` Vasiliy Kulikov
  -1 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-06 17:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hi Tejun,

On Wed, Sep 07, 2011 at 01:51 +0900, Tejun Heo wrote:
> On Tue, Sep 06, 2011 at 02:15:18PM +0400, Vasiliy Kulikov wrote:
> >   c) If dentry is lazily dropped on each access attempt (or each illegal
> >      access) then PID2 can:
> > 
> >      i) read dentry line of /proc/slabinfo
> >      ii) call link(2) against /proc/PID/fd, which invalidates the
> >          specific dentry
> >      iii) re-read dentry line of /proc/slabinfo.  If it has decreased by
> >          one, the dentry existed before (ii).
> 
> If we really worry about this, probably the right thing to do is
> hiding slabinfo from mortal UIDs instead of worrying about what
> exactly are freed or not from each user.

I agree with you.  I don't think that showing system-global debug
information to all users by default is the right thing.  But some people
doesn't agree with this point of view:

http://thread.gmane.org/gmane.linux.kernel/1108378

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-06 17:29                               ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-06 17:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hi Tejun,

On Wed, Sep 07, 2011 at 01:51 +0900, Tejun Heo wrote:
> On Tue, Sep 06, 2011 at 02:15:18PM +0400, Vasiliy Kulikov wrote:
> >   c) If dentry is lazily dropped on each access attempt (or each illegal
> >      access) then PID2 can:
> > 
> >      i) read dentry line of /proc/slabinfo
> >      ii) call link(2) against /proc/PID/fd, which invalidates the
> >          specific dentry
> >      iii) re-read dentry line of /proc/slabinfo.  If it has decreased by
> >          one, the dentry existed before (ii).
> 
> If we really worry about this, probably the right thing to do is
> hiding slabinfo from mortal UIDs instead of worrying about what
> exactly are freed or not from each user.

I agree with you.  I don't think that showing system-global debug
information to all users by default is the right thing.  But some people
doesn't agree with this point of view:

http://thread.gmane.org/gmane.linux.kernel/1108378

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-06 17:29                               ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-09-06 17:33                                 ` Tejun Heo
  -1 siblings, 0 replies; 82+ messages in thread
From: Tejun Heo @ 2011-09-06 17:33 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hello,

On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> I agree with you.  I don't think that showing system-global debug
> information to all users by default is the right thing.  But some people
> doesn't agree with this point of view:
> 
> http://thread.gmane.org/gmane.linux.kernel/1108378

Yeap, I know there are two sides of the discussion but if one takes
the position that hiding such global debug info is more harmful, it's
only crazier to hide such information from each individual users of
the said global facility.  So, let's just forget about information
leak via freeing or not freeing here.  It's the wrong battle field.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-06 17:33                                 ` Tejun Heo
  0 siblings, 0 replies; 82+ messages in thread
From: Tejun Heo @ 2011-09-06 17:33 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hello,

On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> I agree with you.  I don't think that showing system-global debug
> information to all users by default is the right thing.  But some people
> doesn't agree with this point of view:
> 
> http://thread.gmane.org/gmane.linux.kernel/1108378

Yeap, I know there are two sides of the discussion but if one takes
the position that hiding such global debug info is more harmful, it's
only crazier to hide such information from each individual users of
the said global facility.  So, let's just forget about information
leak via freeing or not freeing here.  It's the wrong battle field.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-06 17:33                                 ` [kernel-hardening] " Tejun Heo
@ 2011-09-06 18:15                                   ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-06 18:15 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 02:33:41AM +0900, Tejun Heo wrote:
...
> 
> Yeap, I know there are two sides of the discussion but if one takes
> the position that hiding such global debug info is more harmful, it's
> only crazier to hide such information from each individual users of
> the said global facility.  So, let's just forget about information
> leak via freeing or not freeing here.  It's the wrong battle field.
>

Heh, I definitely didn't expect all this would turn out into slabinfo ;)
 
	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-06 18:15                                   ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-06 18:15 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 02:33:41AM +0900, Tejun Heo wrote:
...
> 
> Yeap, I know there are two sides of the discussion but if one takes
> the position that hiding such global debug info is more harmful, it's
> only crazier to hide such information from each individual users of
> the said global facility.  So, let's just forget about information
> leak via freeing or not freeing here.  It's the wrong battle field.
>

Heh, I definitely didn't expect all this would turn out into slabinfo ;)
 
	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-06 17:33                                 ` [kernel-hardening] " Tejun Heo
@ 2011-09-07 11:23                                     ` Vasiliy Kulikov
  -1 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-07 11:23 UTC (permalink / raw)
  To: Tejun Heo
  Cc: kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8,
	Pavel Emelyanov, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	James Bottomley, Cyrill Gorcunov,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Nathan Lynch,
	Alexey Dobriyan, containers-qjLDD68F18O7TbgM5vRIOg,
	Andrew Morton, Daniel Lezcano, Al Viro

Hi,

On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > I agree with you.  I don't think that showing system-global debug
> > information to all users by default is the right thing.  But some people
> > doesn't agree with this point of view:
> > 
> > http://thread.gmane.org/gmane.linux.kernel/1108378
> 
> Yeap, I know there are two sides of the discussion but if one takes
> the position that hiding such global debug info is more harmful, it's
> only crazier to hide such information from each individual users of
> the said global facility.  So, let's just forget about information
> leak via freeing or not freeing here.  It's the wrong battle field.

Andrew, are you OK with closing the hole with pid_no_revalidate()
and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
with people participating in the discussion above: Theodore, Dan, Linus, etc.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 11:23                                     ` Vasiliy Kulikov
  0 siblings, 0 replies; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-07 11:23 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Cyrill Gorcunov, Andrew Morton, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

Hi,

On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > I agree with you.  I don't think that showing system-global debug
> > information to all users by default is the right thing.  But some people
> > doesn't agree with this point of view:
> > 
> > http://thread.gmane.org/gmane.linux.kernel/1108378
> 
> Yeap, I know there are two sides of the discussion but if one takes
> the position that hiding such global debug info is more harmful, it's
> only crazier to hide such information from each individual users of
> the said global facility.  So, let's just forget about information
> leak via freeing or not freeing here.  It's the wrong battle field.

Andrew, are you OK with closing the hole with pid_no_revalidate()
and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
with people participating in the discussion above: Theodore, Dan, Linus, etc.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-07 11:23                                     ` [kernel-hardening] " Vasiliy Kulikov
@ 2011-09-07 21:53                                       ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-07 21:53 UTC (permalink / raw)
  To: Vasiliy Kulikov, Andrew Morton
  Cc: Tejun Heo, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 03:23:01PM +0400, Vasiliy Kulikov wrote:
> Hi,
> 
> On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> > On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > > I agree with you.  I don't think that showing system-global debug
> > > information to all users by default is the right thing.  But some people
> > > doesn't agree with this point of view:
> > > 
> > > http://thread.gmane.org/gmane.linux.kernel/1108378
> > 
> > Yeap, I know there are two sides of the discussion but if one takes
> > the position that hiding such global debug info is more harmful, it's
> > only crazier to hide such information from each individual users of
> > the said global facility.  So, let's just forget about information
> > leak via freeing or not freeing here.  It's the wrong battle field.
> 
> Andrew, are you OK with closing the hole with pid_no_revalidate()
> and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> with people participating in the discussion above: Theodore, Dan, Linus, etc.
> 
> Thanks,

Since kernel.org is still down (and Andrew, I can't download -mm bundle
as well for this very reason), here is an updated version for review.
I've updated map_files_d_revalidate to include ptrace hook, and switched
to flex_array. So while there is uncertainty in would we use pid_no_revalidate
or we wouldn't (seems calling ptrace hook there instead would help) i remain
the get/setattr untouched for a while. Also I hope an updated changelog
would make it more clear why we need this feature.

> By Andrew Morton
>
> But do we *really* need to do it in two passes?  Avoiding the temporary
> storage would involve doing more work under mmap_sem, and a put_filp()
> under mmap_sem might be problematic.

I fear we still need to use two passes in proc_map_files_readdir, I found no way
to escape lockdep complains when doing all work in one pass with mmap_sem taken.
The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
robust data. And I'm not really sure what you mean with problematic put_filp?

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v10

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/$pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/$pid/map_files/address file and
   map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procesure
significantly. Also as being pointed in (3) it is a way easier to use top level
shared mapping in children as /proc/$pid/map_files/address when needed.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member a bit on
   x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as an inline helper
 - proc_map_files_setattr uses either kmalloc or vmalloc depending
   on how many ojects are to be allocated
 - no more map_name_to_addr but dname_to_vma_addr introduced instead
   and it uses sscanf because in one case the find_exact_vma() is used
   only to confirm existence of vma area the boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

v10: (feedback by Andrew Morton)
 - flex_arrays are used instead of kmalloc/vmalloc calls
 - map_files_d_revalidate use ptrace_may_access for
   security reason (by Vasiliy Kulikov)

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  366 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 378 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -2171,6 +2172,370 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, used, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			ret = -ENOMEM;
+			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+			if (!fa)
+				goto err;
+			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL))
+				goto err;
+			for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name, sizeof(info.name),
+						    "%lx-%lx", vma->vm_start,
+						    vma->vm_end);
+				if (flex_array_put(fa, used, &info, GFP_KERNEL)) {
+					/*
+					 * This must never happen on preallocated array,
+					 * but just to be sure.
+					 */
+					WARN_ON_ONCE(1);
+					put_filp(vma->vm_file);
+					goto err;
+				}
+				used++;
+			}
+			ret = 0;
+		}
+err:
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used && !ret; i++) {
+			p = flex_array_get(fa, i);
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      p->name, p->len,
+					      proc_map_files_instantiate,
+					      task, p->file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			put_filp(p->file);
+		}
+
+		for (; i < used; i++) {
+			p = flex_array_get(fa, i);
+			put_filp(p->file);
+		}
+
+		if (fa)
+			flex_array_free(fa);
+
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3150,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 21:53                                       ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-07 21:53 UTC (permalink / raw)
  To: Vasiliy Kulikov, Andrew Morton
  Cc: Tejun Heo, Kirill A. Shutemov, containers, linux-kernel,
	linux-fsdevel, Nathan Lynch, kernel-hardening, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 03:23:01PM +0400, Vasiliy Kulikov wrote:
> Hi,
> 
> On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> > On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > > I agree with you.  I don't think that showing system-global debug
> > > information to all users by default is the right thing.  But some people
> > > doesn't agree with this point of view:
> > > 
> > > http://thread.gmane.org/gmane.linux.kernel/1108378
> > 
> > Yeap, I know there are two sides of the discussion but if one takes
> > the position that hiding such global debug info is more harmful, it's
> > only crazier to hide such information from each individual users of
> > the said global facility.  So, let's just forget about information
> > leak via freeing or not freeing here.  It's the wrong battle field.
> 
> Andrew, are you OK with closing the hole with pid_no_revalidate()
> and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> with people participating in the discussion above: Theodore, Dan, Linus, etc.
> 
> Thanks,

Since kernel.org is still down (and Andrew, I can't download -mm bundle
as well for this very reason), here is an updated version for review.
I've updated map_files_d_revalidate to include ptrace hook, and switched
to flex_array. So while there is uncertainty in would we use pid_no_revalidate
or we wouldn't (seems calling ptrace hook there instead would help) i remain
the get/setattr untouched for a while. Also I hope an updated changelog
would make it more clear why we need this feature.

> By Andrew Morton
>
> But do we *really* need to do it in two passes?  Avoiding the temporary
> storage would involve doing more work under mmap_sem, and a put_filp()
> under mmap_sem might be problematic.

I fear we still need to use two passes in proc_map_files_readdir, I found no way
to escape lockdep complains when doing all work in one pass with mmap_sem taken.
The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
robust data. And I'm not really sure what you mean with problematic put_filp?

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v10

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/$pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/$pid/map_files/address file and
   map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procesure
significantly. Also as being pointed in (3) it is a way easier to use top level
shared mapping in children as /proc/$pid/map_files/address when needed.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member a bit on
   x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as an inline helper
 - proc_map_files_setattr uses either kmalloc or vmalloc depending
   on how many ojects are to be allocated
 - no more map_name_to_addr but dname_to_vma_addr introduced instead
   and it uses sscanf because in one case the find_exact_vma() is used
   only to confirm existence of vma area the boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

v10: (feedback by Andrew Morton)
 - flex_arrays are used instead of kmalloc/vmalloc calls
 - map_files_d_revalidate use ptrace_may_access for
   security reason (by Vasiliy Kulikov)

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  366 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 378 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -2171,6 +2172,370 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, used, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			ret = -ENOMEM;
+			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+			if (!fa)
+				goto err;
+			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL))
+				goto err;
+			for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name, sizeof(info.name),
+						    "%lx-%lx", vma->vm_start,
+						    vma->vm_end);
+				if (flex_array_put(fa, used, &info, GFP_KERNEL)) {
+					/*
+					 * This must never happen on preallocated array,
+					 * but just to be sure.
+					 */
+					WARN_ON_ONCE(1);
+					put_filp(vma->vm_file);
+					goto err;
+				}
+				used++;
+			}
+			ret = 0;
+		}
+err:
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used && !ret; i++) {
+			p = flex_array_get(fa, i);
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      p->name, p->len,
+					      proc_map_files_instantiate,
+					      task, p->file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			put_filp(p->file);
+		}
+
+		for (; i < used; i++) {
+			p = flex_array_get(fa, i);
+			put_filp(p->file);
+		}
+
+		if (fa)
+			flex_array_free(fa);
+
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3150,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-07 21:53                                       ` [kernel-hardening] " Cyrill Gorcunov
  (?)
@ 2011-09-07 22:13                                         ` Andrew Morton
  -1 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-07 22:13 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Pavel Emelyanov,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, James Bottomley,
	containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch, Alexey Dobriyan,
	Tejun Heo, Daniel Lezcano, Vasiliy Kulikov, Al Viro

On Thu, 8 Sep 2011 01:53:29 +0400
Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> On Wed, Sep 07, 2011 at 03:23:01PM +0400, Vasiliy Kulikov wrote:
> > Hi,
> > 
> > On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> > > On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > > > I agree with you.  I don't think that showing system-global debug
> > > > information to all users by default is the right thing.  But some people
> > > > doesn't agree with this point of view:
> > > > 
> > > > http://thread.gmane.org/gmane.linux.kernel/1108378
> > > 
> > > Yeap, I know there are two sides of the discussion but if one takes
> > > the position that hiding such global debug info is more harmful, it's
> > > only crazier to hide such information from each individual users of
> > > the said global facility.  So, let's just forget about information
> > > leak via freeing or not freeing here.  It's the wrong battle field.
> > 
> > Andrew, are you OK with closing the hole with pid_no_revalidate()
> > and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> > with people participating in the discussion above: Theodore, Dan, Linus, etc.

I fell asleep a long time ago and don't know what pid_no_revalidate()
and slabinfo permissions have to do with this.  Perhaps summarising the
issues in the changelog would be appropriate, dunno.

> > By Andrew Morton
> >
> > But do we *really* need to do it in two passes?  Avoiding the temporary
> > storage would involve doing more work under mmap_sem, and a put_filp()
> > under mmap_sem might be problematic.
> 
> I fear we still need to use two passes in proc_map_files_readdir, I found no way
> to escape lockdep complains when doing all work in one pass with mmap_sem taken.
> The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
> robust data.

The code's using three passes.

> And I'm not really sure what you mean with problematic put_filp?

I was thinking fput(), which can do a hell of a lot of stuff if it's
the final put on the inode.

>
> ...
>
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		unsigned long nr_files, used, pos, i;
> +		struct flex_array *fa = NULL;
> +		struct map_files_info info;
> +		struct map_files_info *p;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		used = 0;
> +
> +		/*
> +		 * We need two passes here:
> +		 *
> +		 *  1) Collect vmas of mapped files with mmap_sem taken
> +		 *  2) Release mmap_sem and instantiate entries
> +		 *
> +		 * otherwise we get lockdep complained, since filldir()
> +		 * routine might require mmap_sem taken in might_fault().
> +		 */
> +
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +
> +		if (nr_files) {
> +			ret = -ENOMEM;
> +			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
> +			if (!fa)
> +				goto err;
> +			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL))
> +				goto err;
> +			for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
> +				if (!vma->vm_file)
> +					continue;
> +				if (++pos <= filp->f_pos)
> +					continue;
> +
> +				get_file(vma->vm_file);
> +				info.file = vma->vm_file;
> +				info.len = snprintf(info.name, sizeof(info.name),
> +						    "%lx-%lx", vma->vm_start,
> +						    vma->vm_end);
> +				if (flex_array_put(fa, used, &info, GFP_KERNEL)) {
> +					/*
> +					 * This must never happen on preallocated array,
> +					 * but just to be sure.
> +					 */
> +					WARN_ON_ONCE(1);
> +					put_filp(vma->vm_file);
> +					goto err;
> +				}
> +				used++;
> +			}
> +			ret = 0;
> +		}
> +err:
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used && !ret; i++) {

The "&& !ret" is unneeded?

> +			p = flex_array_get(fa, i);
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      p->name, p->len,
> +					      proc_map_files_instantiate,
> +					      task, p->file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +			put_filp(p->file);
> +		}
> +
> +		for (; i < used; i++) {
> +			p = flex_array_get(fa, i);
> +			put_filp(p->file);
> +		}

Still unclear why we need the third loop.

> +		if (fa)
> +			flex_array_free(fa);
> +
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
>
> ...
>

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 22:13                                         ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-07 22:13 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, 8 Sep 2011 01:53:29 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Wed, Sep 07, 2011 at 03:23:01PM +0400, Vasiliy Kulikov wrote:
> > Hi,
> > 
> > On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> > > On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > > > I agree with you.  I don't think that showing system-global debug
> > > > information to all users by default is the right thing.  But some people
> > > > doesn't agree with this point of view:
> > > > 
> > > > http://thread.gmane.org/gmane.linux.kernel/1108378
> > > 
> > > Yeap, I know there are two sides of the discussion but if one takes
> > > the position that hiding such global debug info is more harmful, it's
> > > only crazier to hide such information from each individual users of
> > > the said global facility.  So, let's just forget about information
> > > leak via freeing or not freeing here.  It's the wrong battle field.
> > 
> > Andrew, are you OK with closing the hole with pid_no_revalidate()
> > and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> > with people participating in the discussion above: Theodore, Dan, Linus, etc.

I fell asleep a long time ago and don't know what pid_no_revalidate()
and slabinfo permissions have to do with this.  Perhaps summarising the
issues in the changelog would be appropriate, dunno.

> > By Andrew Morton
> >
> > But do we *really* need to do it in two passes?  Avoiding the temporary
> > storage would involve doing more work under mmap_sem, and a put_filp()
> > under mmap_sem might be problematic.
> 
> I fear we still need to use two passes in proc_map_files_readdir, I found no way
> to escape lockdep complains when doing all work in one pass with mmap_sem taken.
> The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
> robust data.

The code's using three passes.

> And I'm not really sure what you mean with problematic put_filp?

I was thinking fput(), which can do a hell of a lot of stuff if it's
the final put on the inode.

>
> ...
>
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		unsigned long nr_files, used, pos, i;
> +		struct flex_array *fa = NULL;
> +		struct map_files_info info;
> +		struct map_files_info *p;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		used = 0;
> +
> +		/*
> +		 * We need two passes here:
> +		 *
> +		 *  1) Collect vmas of mapped files with mmap_sem taken
> +		 *  2) Release mmap_sem and instantiate entries
> +		 *
> +		 * otherwise we get lockdep complained, since filldir()
> +		 * routine might require mmap_sem taken in might_fault().
> +		 */
> +
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +
> +		if (nr_files) {
> +			ret = -ENOMEM;
> +			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
> +			if (!fa)
> +				goto err;
> +			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL))
> +				goto err;
> +			for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
> +				if (!vma->vm_file)
> +					continue;
> +				if (++pos <= filp->f_pos)
> +					continue;
> +
> +				get_file(vma->vm_file);
> +				info.file = vma->vm_file;
> +				info.len = snprintf(info.name, sizeof(info.name),
> +						    "%lx-%lx", vma->vm_start,
> +						    vma->vm_end);
> +				if (flex_array_put(fa, used, &info, GFP_KERNEL)) {
> +					/*
> +					 * This must never happen on preallocated array,
> +					 * but just to be sure.
> +					 */
> +					WARN_ON_ONCE(1);
> +					put_filp(vma->vm_file);
> +					goto err;
> +				}
> +				used++;
> +			}
> +			ret = 0;
> +		}
> +err:
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used && !ret; i++) {

The "&& !ret" is unneeded?

> +			p = flex_array_get(fa, i);
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      p->name, p->len,
> +					      proc_map_files_instantiate,
> +					      task, p->file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +			put_filp(p->file);
> +		}
> +
> +		for (; i < used; i++) {
> +			p = flex_array_get(fa, i);
> +			put_filp(p->file);
> +		}

Still unclear why we need the third loop.

> +		if (fa)
> +			flex_array_free(fa);
> +
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
>
> ...
>

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 22:13                                         ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-07 22:13 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, 8 Sep 2011 01:53:29 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On Wed, Sep 07, 2011 at 03:23:01PM +0400, Vasiliy Kulikov wrote:
> > Hi,
> > 
> > On Wed, Sep 07, 2011 at 02:33 +0900, Tejun Heo wrote:
> > > On Tue, Sep 06, 2011 at 09:29:52PM +0400, Vasiliy Kulikov wrote:
> > > > I agree with you.  I don't think that showing system-global debug
> > > > information to all users by default is the right thing.  But some people
> > > > doesn't agree with this point of view:
> > > > 
> > > > http://thread.gmane.org/gmane.linux.kernel/1108378
> > > 
> > > Yeap, I know there are two sides of the discussion but if one takes
> > > the position that hiding such global debug info is more harmful, it's
> > > only crazier to hide such information from each individual users of
> > > the said global facility.  So, let's just forget about information
> > > leak via freeing or not freeing here.  It's the wrong battle field.
> > 
> > Andrew, are you OK with closing the hole with pid_no_revalidate()
> > and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> > with people participating in the discussion above: Theodore, Dan, Linus, etc.

I fell asleep a long time ago and don't know what pid_no_revalidate()
and slabinfo permissions have to do with this.  Perhaps summarising the
issues in the changelog would be appropriate, dunno.

> > By Andrew Morton
> >
> > But do we *really* need to do it in two passes?  Avoiding the temporary
> > storage would involve doing more work under mmap_sem, and a put_filp()
> > under mmap_sem might be problematic.
> 
> I fear we still need to use two passes in proc_map_files_readdir, I found no way
> to escape lockdep complains when doing all work in one pass with mmap_sem taken.
> The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
> robust data.

The code's using three passes.

> And I'm not really sure what you mean with problematic put_filp?

I was thinking fput(), which can do a hell of a lot of stuff if it's
the final put on the inode.

>
> ...
>
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		unsigned long nr_files, used, pos, i;
> +		struct flex_array *fa = NULL;
> +		struct map_files_info info;
> +		struct map_files_info *p;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		used = 0;
> +
> +		/*
> +		 * We need two passes here:
> +		 *
> +		 *  1) Collect vmas of mapped files with mmap_sem taken
> +		 *  2) Release mmap_sem and instantiate entries
> +		 *
> +		 * otherwise we get lockdep complained, since filldir()
> +		 * routine might require mmap_sem taken in might_fault().
> +		 */
> +
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +
> +		if (nr_files) {
> +			ret = -ENOMEM;
> +			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
> +			if (!fa)
> +				goto err;
> +			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL))
> +				goto err;
> +			for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
> +				if (!vma->vm_file)
> +					continue;
> +				if (++pos <= filp->f_pos)
> +					continue;
> +
> +				get_file(vma->vm_file);
> +				info.file = vma->vm_file;
> +				info.len = snprintf(info.name, sizeof(info.name),
> +						    "%lx-%lx", vma->vm_start,
> +						    vma->vm_end);
> +				if (flex_array_put(fa, used, &info, GFP_KERNEL)) {
> +					/*
> +					 * This must never happen on preallocated array,
> +					 * but just to be sure.
> +					 */
> +					WARN_ON_ONCE(1);
> +					put_filp(vma->vm_file);
> +					goto err;
> +				}
> +				used++;
> +			}
> +			ret = 0;
> +		}
> +err:
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used && !ret; i++) {

The "&& !ret" is unneeded?

> +			p = flex_array_get(fa, i);
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      p->name, p->len,
> +					      proc_map_files_instantiate,
> +					      task, p->file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +			put_filp(p->file);
> +		}
> +
> +		for (; i < used; i++) {
> +			p = flex_array_get(fa, i);
> +			put_filp(p->file);
> +		}

Still unclear why we need the third loop.

> +		if (fa)
> +			flex_array_free(fa);
> +
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
>
> ...
>

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-07 22:13                                         ` Andrew Morton
@ 2011-09-07 22:42                                           ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-07 22:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 03:13:23PM -0700, Andrew Morton wrote:
...
> > > 
> > > Andrew, are you OK with closing the hole with pid_no_revalidate()
> > > and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> > > with people participating in the discussion above: Theodore, Dan, Linus, etc.
> 
> I fell asleep a long time ago and don't know what pid_no_revalidate()
> and slabinfo permissions have to do with this.  Perhaps summarising the
> issues in the changelog would be appropriate, dunno.

Well, time to poke Vasiliy ;)

...
> > 
> > I fear we still need to use two passes in proc_map_files_readdir, I found no way
> > to escape lockdep complains when doing all work in one pass with mmap_sem taken.
> > The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
> > robust data.
> 
> The code's using three passes.

Yes, and I didn't find thy way to escape it (actually if there would not
be filldir+might_fault tuple I would create this all under mmap_sem and
would not need this flex_array or any temporary storage at all and code
would be a way simplier).

> 
> > And I'm not really sure what you mean with problematic put_filp?
> 
> I was thinking fput(), which can do a hell of a lot of stuff if it's
> the final put on the inode.

Ouch, somehow missed it, thanks!

> > +err:
> > +		up_read(&mm->mmap_sem);
> > +
> > +		for (i = 0; i < used && !ret; i++) {
> 
> The "&& !ret" is unneeded?

No, it's needed, since it makes sure that if "impossible"
scenario happens and flex-arrays fails with preallocated
data so we will reach this point with used > 0 and ret = -ENOMEM
and thus will not call for proc_map_files_instantiate as needed.

> 
> > +			p = flex_array_get(fa, i);
> > +			ret = proc_fill_cache(filp, dirent, filldir,
> > +					      p->name, p->len,
> > +					      proc_map_files_instantiate,
> > +					      task, p->file);
> > +			if (ret)
> > +				break;

1: Say we failed here

> > +			filp->f_pos++;
> > +			put_filp(p->file);
> > +		}
> > +
> > +		for (; i < used; i++) {
> > +			p = flex_array_get(fa, i);
> > +			put_filp(p->file);
> > +		}
> 
> Still unclear why we need the third loop.

Due to (1) -- so we will have a number of files reference
taken and need to put them back.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 22:42                                           ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-07 22:42 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 03:13:23PM -0700, Andrew Morton wrote:
...
> > > 
> > > Andrew, are you OK with closing the hole with pid_no_revalidate()
> > > and 0600 /proc/slabinfo?  If so, I feel I have to start this discussion
> > > with people participating in the discussion above: Theodore, Dan, Linus, etc.
> 
> I fell asleep a long time ago and don't know what pid_no_revalidate()
> and slabinfo permissions have to do with this.  Perhaps summarising the
> issues in the changelog would be appropriate, dunno.

Well, time to poke Vasiliy ;)

...
> > 
> > I fear we still need to use two passes in proc_map_files_readdir, I found no way
> > to escape lockdep complains when doing all work in one pass with mmap_sem taken.
> > The /maps does the same thing -- ie it fills maps file with mmap_sem taken to produce
> > robust data.
> 
> The code's using three passes.

Yes, and I didn't find thy way to escape it (actually if there would not
be filldir+might_fault tuple I would create this all under mmap_sem and
would not need this flex_array or any temporary storage at all and code
would be a way simplier).

> 
> > And I'm not really sure what you mean with problematic put_filp?
> 
> I was thinking fput(), which can do a hell of a lot of stuff if it's
> the final put on the inode.

Ouch, somehow missed it, thanks!

> > +err:
> > +		up_read(&mm->mmap_sem);
> > +
> > +		for (i = 0; i < used && !ret; i++) {
> 
> The "&& !ret" is unneeded?

No, it's needed, since it makes sure that if "impossible"
scenario happens and flex-arrays fails with preallocated
data so we will reach this point with used > 0 and ret = -ENOMEM
and thus will not call for proc_map_files_instantiate as needed.

> 
> > +			p = flex_array_get(fa, i);
> > +			ret = proc_fill_cache(filp, dirent, filldir,
> > +					      p->name, p->len,
> > +					      proc_map_files_instantiate,
> > +					      task, p->file);
> > +			if (ret)
> > +				break;

1: Say we failed here

> > +			filp->f_pos++;
> > +			put_filp(p->file);
> > +		}
> > +
> > +		for (; i < used; i++) {
> > +			p = flex_array_get(fa, i);
> > +			put_filp(p->file);
> > +		}
> 
> Still unclear why we need the third loop.

Due to (1) -- so we will have a number of files reference
taken and need to put them back.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-07 22:42                                           ` [kernel-hardening] " Cyrill Gorcunov
  (?)
@ 2011-09-07 22:53                                             ` Andrew Morton
  -1 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-07 22:53 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Pavel Emelyanov,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, James Bottomley,
	containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch, Alexey Dobriyan,
	Tejun Heo, Daniel Lezcano, Vasiliy Kulikov, Al Viro

On Thu, 8 Sep 2011 02:42:34 +0400
Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> > > +err:
> > > +		up_read(&mm->mmap_sem);
> > > +
> > > +		for (i = 0; i < used && !ret; i++) {
> > 
> > The "&& !ret" is unneeded?
> 
> No, it's needed, since it makes sure that if "impossible"
> scenario happens and flex-arrays fails with preallocated
> data so we will reach this point with used > 0 and ret = -ENOMEM
> and thus will not call for proc_map_files_instantiate as needed.

Well, it doesn't need to be tested on each pass around the loop - that's
misleading and inefficient (unless the compiler is being particularly clever).

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 22:53                                             ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-07 22:53 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, 8 Sep 2011 02:42:34 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> > > +err:
> > > +		up_read(&mm->mmap_sem);
> > > +
> > > +		for (i = 0; i < used && !ret; i++) {
> > 
> > The "&& !ret" is unneeded?
> 
> No, it's needed, since it makes sure that if "impossible"
> scenario happens and flex-arrays fails with preallocated
> data so we will reach this point with used > 0 and ret = -ENOMEM
> and thus will not call for proc_map_files_instantiate as needed.

Well, it doesn't need to be tested on each pass around the loop - that's
misleading and inefficient (unless the compiler is being particularly clever).

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-07 22:53                                             ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-07 22:53 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, 8 Sep 2011 02:42:34 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> > > +err:
> > > +		up_read(&mm->mmap_sem);
> > > +
> > > +		for (i = 0; i < used && !ret; i++) {
> > 
> > The "&& !ret" is unneeded?
> 
> No, it's needed, since it makes sure that if "impossible"
> scenario happens and flex-arrays fails with preallocated
> data so we will reach this point with used > 0 and ret = -ENOMEM
> and thus will not call for proc_map_files_instantiate as needed.

Well, it doesn't need to be tested on each pass around the loop - that's
misleading and inefficient (unless the compiler is being particularly clever).

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-07 22:53                                             ` Andrew Morton
@ 2011-09-08  5:48                                               ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-08  5:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 03:53:32PM -0700, Andrew Morton wrote:
...
> 
> > > > +err:
> > > > +		up_read(&mm->mmap_sem);
> > > > +
> > > > +		for (i = 0; i < used && !ret; i++) {
> > > 
> > > The "&& !ret" is unneeded?
> > 
> > No, it's needed, since it makes sure that if "impossible"
> > scenario happens and flex-arrays fails with preallocated
> > data so we will reach this point with used > 0 and ret = -ENOMEM
> > and thus will not call for proc_map_files_instantiate as needed.
> 
> Well, it doesn't need to be tested on each pass around the loop - that's
> misleading and inefficient (unless the compiler is being particularly clever).

Here an update is coming, please take a look. Note I simply added BUG() in case
if preallocated array failed since found that it's what other kernel code
does as well (ie grep for flex_array_put_ptr). Other notes are at v11: mark
below. Thanks.

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/$pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/$pid/map_files/address file and
   map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procesure
significantly. Also as being pointed in (3) it is a way easier to use top level
shared mapping in children as /proc/$pid/map_files/address when needed.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member a bit on
   x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as an inline helper
 - proc_map_files_setattr uses either kmalloc or vmalloc depending
   on how many ojects are to be allocated
 - no more map_name_to_addr but dname_to_vma_addr introduced instead
   and it uses sscanf because in one case the find_exact_vma() is used
   only to confirm existence of vma area the boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

v10: (feedback by Andrew Morton)
 - flex_arrays are used instead of kmalloc/vmalloc calls
 - map_files_d_revalidate use ptrace_may_access for
   security reason (by Vasiliy Kulikov)

v11:
 - should use fput and drop !ret test from a loop code
   (feedback by Andrew Morton)
 - no need for 'used' variable, use existing
   nr_files with file->pos predicate

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  365 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 377 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -2171,6 +2172,369 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+			if (vma->vm_file && ++pos > filp->f_pos)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			ret = -ENOMEM;
+			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+			if (!fa)
+				goto nomem;
+			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
+				flex_array_free(fa);
+				fa = NULL;
+				goto nomem;
+			}
+			for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name, sizeof(info.name),
+						    "%lx-%lx", vma->vm_start,
+						    vma->vm_end);
+				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+					BUG();
+			}
+			ret = 0;
+		}
+nomem:
+		up_read(&mm->mmap_sem);
+
+		if (fa) {
+			for (i = 0; i < nr_files; i++) {
+				p = flex_array_get(fa, i);
+				ret = proc_fill_cache(filp, dirent, filldir,
+						      p->name, p->len,
+						      proc_map_files_instantiate,
+						      task, p->file);
+				if (ret)
+					break;
+				filp->f_pos++;
+				fput(p->file);
+			}
+
+			for (; i < nr_files; i++) {
+				/*
+				 * In case of error don't forget
+				 * to put rest of file refs.
+				 */
+				p = flex_array_get(fa, i);
+				fput(p->file);
+			}
+
+			flex_array_free(fa);
+		}
+
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3149,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-08  5:48                                               ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-08  5:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Wed, Sep 07, 2011 at 03:53:32PM -0700, Andrew Morton wrote:
...
> 
> > > > +err:
> > > > +		up_read(&mm->mmap_sem);
> > > > +
> > > > +		for (i = 0; i < used && !ret; i++) {
> > > 
> > > The "&& !ret" is unneeded?
> > 
> > No, it's needed, since it makes sure that if "impossible"
> > scenario happens and flex-arrays fails with preallocated
> > data so we will reach this point with used > 0 and ret = -ENOMEM
> > and thus will not call for proc_map_files_instantiate as needed.
> 
> Well, it doesn't need to be tested on each pass around the loop - that's
> misleading and inefficient (unless the compiler is being particularly clever).

Here an update is coming, please take a look. Note I simply added BUG() in case
if preallocated array failed since found that it's what other kernel code
does as well (ie grep for flex_array_put_ptr). Other notes are at v11: mark
below. Thanks.

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/$pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/$pid/map_files/address file and
   map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procesure
significantly. Also as being pointed in (3) it is a way easier to use top level
shared mapping in children as /proc/$pid/map_files/address when needed.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member a bit on
   x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as an inline helper
 - proc_map_files_setattr uses either kmalloc or vmalloc depending
   on how many ojects are to be allocated
 - no more map_name_to_addr but dname_to_vma_addr introduced instead
   and it uses sscanf because in one case the find_exact_vma() is used
   only to confirm existence of vma area the boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

v10: (feedback by Andrew Morton)
 - flex_arrays are used instead of kmalloc/vmalloc calls
 - map_files_d_revalidate use ptrace_may_access for
   security reason (by Vasiliy Kulikov)

v11:
 - should use fput and drop !ret test from a loop code
   (feedback by Andrew Morton)
 - no need for 'used' variable, use existing
   nr_files with file->pos predicate

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  365 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 377 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -2171,6 +2172,369 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+			if (vma->vm_file && ++pos > filp->f_pos)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			ret = -ENOMEM;
+			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+			if (!fa)
+				goto nomem;
+			if (flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
+				flex_array_free(fa);
+				fa = NULL;
+				goto nomem;
+			}
+			for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name, sizeof(info.name),
+						    "%lx-%lx", vma->vm_start,
+						    vma->vm_end);
+				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+					BUG();
+			}
+			ret = 0;
+		}
+nomem:
+		up_read(&mm->mmap_sem);
+
+		if (fa) {
+			for (i = 0; i < nr_files; i++) {
+				p = flex_array_get(fa, i);
+				ret = proc_fill_cache(filp, dirent, filldir,
+						      p->name, p->len,
+						      proc_map_files_instantiate,
+						      task, p->file);
+				if (ret)
+					break;
+				filp->f_pos++;
+				fput(p->file);
+			}
+
+			for (; i < nr_files; i++) {
+				/*
+				 * In case of error don't forget
+				 * to put rest of file refs.
+				 */
+				p = flex_array_get(fa, i);
+				fput(p->file);
+			}
+
+			flex_array_free(fa);
+		}
+
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3149,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-08  5:48                                               ` [kernel-hardening] " Cyrill Gorcunov
@ 2011-09-08  5:50                                                 ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-08  5:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 09:48:26AM +0400, Cyrill Gorcunov wrote:
...
> 
> Here an update is coming, please take a look. Note I simply added BUG() in case
> if preallocated array failed since found that it's what other kernel code
> does as well (ie grep for flex_array_put_ptr). Other notes are at v11: mark
> below. Thanks.
> 
> 	Cyrill
> ---
> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 

Crap. Andrew drop this one please.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-08  5:50                                                 ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-08  5:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 09:48:26AM +0400, Cyrill Gorcunov wrote:
...
> 
> Here an update is coming, please take a look. Note I simply added BUG() in case
> if preallocated array failed since found that it's what other kernel code
> does as well (ie grep for flex_array_put_ptr). Other notes are at v11: mark
> below. Thanks.
> 
> 	Cyrill
> ---
> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 

Crap. Andrew drop this one please.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-08  5:50                                                 ` [kernel-hardening] " Cyrill Gorcunov
@ 2011-09-08  6:04                                                   ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-08  6:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 09:50:25AM +0400, Cyrill Gorcunov wrote:
> On Thu, Sep 08, 2011 at 09:48:26AM +0400, Cyrill Gorcunov wrote:
> ...
> > 
> > Here an update is coming, please take a look. Note I simply added BUG() in case
> > if preallocated array failed since found that it's what other kernel code
> > does as well (ie grep for flex_array_put_ptr). Other notes are at v11: mark
> > below. Thanks.
> > 
> > 	Cyrill
> > ---
> > fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> > 
> 
> Crap. Andrew drop this one please.
> 

This one should pass better. The changes from previous version
are at proc_map_files_readdir().

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/$pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/$pid/map_files/address file and
   map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procesure
significantly. Also as being pointed in (3) it is a way easier to use top level
shared mapping in children as /proc/$pid/map_files/address when needed.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member a bit on
   x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as an inline helper
 - proc_map_files_setattr uses either kmalloc or vmalloc depending
   on how many ojects are to be allocated
 - no more map_name_to_addr but dname_to_vma_addr introduced instead
   and it uses sscanf because in one case the find_exact_vma() is used
   only to confirm existence of vma area the boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

v10: (feedback by Andrew Morton)
 - flex_arrays are used instead of kmalloc/vmalloc calls
 - map_files_d_revalidate use ptrace_may_access for
   security reason (by Vasiliy Kulikov)

v11:
 - should use fput and drop !ret test from a loop code
   (feedback by Andrew Morton)
 - no need for 'used' variable, use existing
   nr_files with file->pos predicate
 - if preallocation fails no need to go further,
   simply release mmap semaphore and jump out

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  359 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 371 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -2171,6 +2172,363 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+			if (vma->vm_file && ++pos > filp->f_pos)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+			if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
+				ret = -ENOMEM;
+				if (fa)
+					flex_array_free(fa);
+				up_read(&mm->mmap_sem);
+				mmput(mm);
+				goto out;
+			}
+			for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name, sizeof(info.name),
+						    "%lx-%lx", vma->vm_start,
+						    vma->vm_end);
+				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+					BUG();
+			}
+		}
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < nr_files; i++) {
+			p = flex_array_get(fa, i);
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      p->name, p->len,
+					      proc_map_files_instantiate,
+					      task, p->file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			fput(p->file);
+		}
+		for (; i < nr_files; i++) {
+			/*
+			 * In case of error don't forget
+			 * to put rest of file refs.
+			 */
+			p = flex_array_get(fa, i);
+			fput(p->file);
+		}
+		if (fa)
+			flex_array_free(fa);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3143,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-08  6:04                                                   ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-08  6:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 09:50:25AM +0400, Cyrill Gorcunov wrote:
> On Thu, Sep 08, 2011 at 09:48:26AM +0400, Cyrill Gorcunov wrote:
> ...
> > 
> > Here an update is coming, please take a look. Note I simply added BUG() in case
> > if preallocated array failed since found that it's what other kernel code
> > does as well (ie grep for flex_array_put_ptr). Other notes are at v11: mark
> > below. Thanks.
> > 
> > 	Cyrill
> > ---
> > fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> > 
> 
> Crap. Andrew drop this one please.
> 

This one should pass better. The changes from previous version
are at proc_map_files_readdir().

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/$pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/$pid/map_files/address file and
   map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procesure
significantly. Also as being pointed in (3) it is a way easier to use top level
shared mapping in children as /proc/$pid/map_files/address when needed.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

v6:
 - sizeof get used in map_files_info which shrink member a bit on
   x86-32 (by Kirill A. Shutemov)
 - map_name_to_addr returns -EINVAL instead of -1
   which is more appropriate (by Tejun Heo)

v7:
 - add [get/set]attr handlers for
   proc_map_files_inode_operations (by Vasiliy Kulikov)

v8:
 - Kirill A. Shutemov spotted a parasite semicolon
   which ruined the ptrace_check call, fixed.

v9: (feedback by Andrew Morton)
 - find_exact_vma moved into include/linux/mm.h as an inline helper
 - proc_map_files_setattr uses either kmalloc or vmalloc depending
   on how many ojects are to be allocated
 - no more map_name_to_addr but dname_to_vma_addr introduced instead
   and it uses sscanf because in one case the find_exact_vma() is used
   only to confirm existence of vma area the boolean flag is used
 - fancy justification dropped
 - still the proc_map_files_get/setattr leaved untouched
   until additional fd/ patches applied first.

v10: (feedback by Andrew Morton)
 - flex_arrays are used instead of kmalloc/vmalloc calls
 - map_files_d_revalidate use ptrace_may_access for
   security reason (by Vasiliy Kulikov)

v11:
 - should use fput and drop !ret test from a loop code
   (feedback by Andrew Morton)
 - no need for 'used' variable, use existing
   nr_files with file->pos predicate
 - if preallocation fails no need to go further,
   simply release mmap semaphore and jump out

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c     |  359 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mm.h |   12 +
 2 files changed, 371 insertions(+)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -2171,6 +2172,363 @@ static const struct file_operations proc
 };
 
 /*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_map_files_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		ret = proc_setattr(dentry, attr);
+		unlock_trace(task);
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static int proc_map_files_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				  struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task;
+	int ret = -EACCES;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	if (!lock_trace(task)) {
+		generic_fillattr(inode, stat);
+		unlock_trace(task);
+		ret = 0;
+	}
+
+	put_task_struct(task);
+	return ret;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_map_files_setattr,
+	.getattr	= proc_map_files_getattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+			if (vma->vm_file && ++pos > filp->f_pos)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+			if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
+				ret = -ENOMEM;
+				if (fa)
+					flex_array_free(fa);
+				up_read(&mm->mmap_sem);
+				mmput(mm);
+				goto out;
+			}
+			for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name, sizeof(info.name),
+						    "%lx-%lx", vma->vm_start,
+						    vma->vm_end);
+				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+					BUG();
+			}
+		}
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < nr_files; i++) {
+			p = flex_array_get(fa, i);
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      p->name, p->len,
+					      proc_map_files_instantiate,
+					      task, p->file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			fput(p->file);
+		}
+		for (; i < nr_files; i++) {
+			/*
+			 * In case of error don't forget
+			 * to put rest of file refs.
+			 */
+			p = flex_array_get(fa, i);
+			fput(p->file);
+		}
+		if (fa)
+			flex_array_free(fa);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+/*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
  */
@@ -2785,6 +3143,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
+/* Look up the first VMA which exactly match the interval vm_start ... vm_end */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+
+	return vma;
+}
+
 #ifdef CONFIG_MMU
 pgprot_t vm_get_page_prot(unsigned long vm_flags);
 #else

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-08  6:04                                                   ` [kernel-hardening] " Cyrill Gorcunov
  (?)
@ 2011-09-08 23:52                                                     ` Andrew Morton
  -1 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-08 23:52 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: kernel-hardening-ZwoEplunGu1jrUoiu81ncdBPR1lH4CV8,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, Pavel Emelyanov,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, James Bottomley,
	containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch, Alexey Dobriyan,
	Tejun Heo, Daniel Lezcano, Vasiliy Kulikov, Al Viro

On Thu, 8 Sep 2011 10:04:05 +0400
Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

Ho hum, I've pretty much run out of excuses to avoid merging this.

except...

We don't really want to bloat fs/proc/base.o by 4k until all the other
things which support c/r are mergeable and we know that the whole
project is actually useful.  When will we be at this stage?

<looks at the warning>

fs/proc/base.c: In function 'proc_map_files_instantiate':
fs/proc/base.c:2348: warning: assignment from incompatible pointer type

err, that code will crash at runtime and it isn't trivial to fix. 
How could this happen?

>
> ...
>
> +				if (fa)
> +					flex_array_free(fa);
>
> ...
>
> +		if (fa)
> +			flex_array_free(fa);

I think I'll do this:

From: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>

Lots of callers are avoiding passing NULL into flex_array_free().  Move
the check into flex_array_free() in the usual fashion.

Cc: Stephen Smalley <sds-+05T5uksL2qpZYMLLGbcSA@public.gmane.org>
Cc: James Morris <jmorris-gx6/JNMH7DfYtjvyW6yDsg@public.gmane.org>
Cc: Cyrill Gorcunov <gorcunov-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Signed-off-by: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
---

 fs/proc/base.c                 |    6 ++----
 lib/flex_array.c               |    2 ++
 security/selinux/ss/policydb.c |    9 +++------
 3 files changed, 7 insertions(+), 10 deletions(-)

diff -puN lib/flex_array.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free lib/flex_array.c
--- a/lib/flex_array.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/lib/flex_array.c
@@ -142,6 +142,8 @@ EXPORT_SYMBOL(flex_array_free_parts);
 
 void flex_array_free(struct flex_array *fa)
 {
+	if (!fa)
+		return;
 	flex_array_free_parts(fa);
 	kfree(fa);
 }
diff -puN fs/proc/base.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free fs/proc/base.c
--- a/fs/proc/base.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/fs/proc/base.c
@@ -2514,8 +2514,7 @@ static int proc_map_files_readdir(struct
 			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
 			if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
 				ret = -ENOMEM;
-				if (fa)
-					flex_array_free(fa);
+				flex_array_free(fa);
 				up_read(&mm->mmap_sem);
 				mmput(mm);
 				goto out;
@@ -2556,8 +2555,7 @@ static int proc_map_files_readdir(struct
 			p = flex_array_get(fa, i);
 			fput(p->file);
 		}
-		if (fa)
-			flex_array_free(fa);
+		flex_array_free(fa);
 		mmput(mm);
 	}
 	}
diff -puN security/selinux/ss/policydb.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free security/selinux/ss/policydb.c
--- a/security/selinux/ss/policydb.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/security/selinux/ss/policydb.c
@@ -769,16 +769,13 @@ void policydb_destroy(struct policydb *p
 		hashtab_destroy(p->symtab[i].table);
 	}
 
-	for (i = 0; i < SYM_NUM; i++) {
-		if (p->sym_val_to_name[i])
-			flex_array_free(p->sym_val_to_name[i]);
-	}
+	for (i = 0; i < SYM_NUM; i++)
+		flex_array_free(p->sym_val_to_name[i]);
 
 	kfree(p->class_val_to_struct);
 	kfree(p->role_val_to_struct);
 	kfree(p->user_val_to_struct);
-	if (p->type_val_to_struct_array)
-		flex_array_free(p->type_val_to_struct_array);
+	flex_array_free(p->type_val_to_struct_array);
 
 	avtab_destroy(&p->te_avtab);
 
_

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-08 23:52                                                     ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-08 23:52 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, 8 Sep 2011 10:04:05 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

Ho hum, I've pretty much run out of excuses to avoid merging this.

except...

We don't really want to bloat fs/proc/base.o by 4k until all the other
things which support c/r are mergeable and we know that the whole
project is actually useful.  When will we be at this stage?

<looks at the warning>

fs/proc/base.c: In function 'proc_map_files_instantiate':
fs/proc/base.c:2348: warning: assignment from incompatible pointer type

err, that code will crash at runtime and it isn't trivial to fix. 
How could this happen?

>
> ...
>
> +				if (fa)
> +					flex_array_free(fa);
>
> ...
>
> +		if (fa)
> +			flex_array_free(fa);

I think I'll do this:

From: Andrew Morton <akpm@linux-foundation.org>

Lots of callers are avoiding passing NULL into flex_array_free().  Move
the check into flex_array_free() in the usual fashion.

Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 fs/proc/base.c                 |    6 ++----
 lib/flex_array.c               |    2 ++
 security/selinux/ss/policydb.c |    9 +++------
 3 files changed, 7 insertions(+), 10 deletions(-)

diff -puN lib/flex_array.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free lib/flex_array.c
--- a/lib/flex_array.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/lib/flex_array.c
@@ -142,6 +142,8 @@ EXPORT_SYMBOL(flex_array_free_parts);
 
 void flex_array_free(struct flex_array *fa)
 {
+	if (!fa)
+		return;
 	flex_array_free_parts(fa);
 	kfree(fa);
 }
diff -puN fs/proc/base.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free fs/proc/base.c
--- a/fs/proc/base.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/fs/proc/base.c
@@ -2514,8 +2514,7 @@ static int proc_map_files_readdir(struct
 			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
 			if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
 				ret = -ENOMEM;
-				if (fa)
-					flex_array_free(fa);
+				flex_array_free(fa);
 				up_read(&mm->mmap_sem);
 				mmput(mm);
 				goto out;
@@ -2556,8 +2555,7 @@ static int proc_map_files_readdir(struct
 			p = flex_array_get(fa, i);
 			fput(p->file);
 		}
-		if (fa)
-			flex_array_free(fa);
+		flex_array_free(fa);
 		mmput(mm);
 	}
 	}
diff -puN security/selinux/ss/policydb.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free security/selinux/ss/policydb.c
--- a/security/selinux/ss/policydb.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/security/selinux/ss/policydb.c
@@ -769,16 +769,13 @@ void policydb_destroy(struct policydb *p
 		hashtab_destroy(p->symtab[i].table);
 	}
 
-	for (i = 0; i < SYM_NUM; i++) {
-		if (p->sym_val_to_name[i])
-			flex_array_free(p->sym_val_to_name[i]);
-	}
+	for (i = 0; i < SYM_NUM; i++)
+		flex_array_free(p->sym_val_to_name[i]);
 
 	kfree(p->class_val_to_struct);
 	kfree(p->role_val_to_struct);
 	kfree(p->user_val_to_struct);
-	if (p->type_val_to_struct_array)
-		flex_array_free(p->type_val_to_struct_array);
+	flex_array_free(p->type_val_to_struct_array);
 
 	avtab_destroy(&p->te_avtab);
 
_


^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-08 23:52                                                     ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-08 23:52 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, 8 Sep 2011 10:04:05 +0400
Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11

Ho hum, I've pretty much run out of excuses to avoid merging this.

except...

We don't really want to bloat fs/proc/base.o by 4k until all the other
things which support c/r are mergeable and we know that the whole
project is actually useful.  When will we be at this stage?

<looks at the warning>

fs/proc/base.c: In function 'proc_map_files_instantiate':
fs/proc/base.c:2348: warning: assignment from incompatible pointer type

err, that code will crash at runtime and it isn't trivial to fix. 
How could this happen?

>
> ...
>
> +				if (fa)
> +					flex_array_free(fa);
>
> ...
>
> +		if (fa)
> +			flex_array_free(fa);

I think I'll do this:

From: Andrew Morton <akpm@linux-foundation.org>

Lots of callers are avoiding passing NULL into flex_array_free().  Move
the check into flex_array_free() in the usual fashion.

Cc: Stephen Smalley <sds@tycho.nsa.gov>
Cc: James Morris <jmorris@namei.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 fs/proc/base.c                 |    6 ++----
 lib/flex_array.c               |    2 ++
 security/selinux/ss/policydb.c |    9 +++------
 3 files changed, 7 insertions(+), 10 deletions(-)

diff -puN lib/flex_array.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free lib/flex_array.c
--- a/lib/flex_array.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/lib/flex_array.c
@@ -142,6 +142,8 @@ EXPORT_SYMBOL(flex_array_free_parts);
 
 void flex_array_free(struct flex_array *fa)
 {
+	if (!fa)
+		return;
 	flex_array_free_parts(fa);
 	kfree(fa);
 }
diff -puN fs/proc/base.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free fs/proc/base.c
--- a/fs/proc/base.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/fs/proc/base.c
@@ -2514,8 +2514,7 @@ static int proc_map_files_readdir(struct
 			fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
 			if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
 				ret = -ENOMEM;
-				if (fa)
-					flex_array_free(fa);
+				flex_array_free(fa);
 				up_read(&mm->mmap_sem);
 				mmput(mm);
 				goto out;
@@ -2556,8 +2555,7 @@ static int proc_map_files_readdir(struct
 			p = flex_array_get(fa, i);
 			fput(p->file);
 		}
-		if (fa)
-			flex_array_free(fa);
+		flex_array_free(fa);
 		mmput(mm);
 	}
 	}
diff -puN security/selinux/ss/policydb.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free security/selinux/ss/policydb.c
--- a/security/selinux/ss/policydb.c~lib-flex_arrayc-accept-null-arg-to-flex_array_free
+++ a/security/selinux/ss/policydb.c
@@ -769,16 +769,13 @@ void policydb_destroy(struct policydb *p
 		hashtab_destroy(p->symtab[i].table);
 	}
 
-	for (i = 0; i < SYM_NUM; i++) {
-		if (p->sym_val_to_name[i])
-			flex_array_free(p->sym_val_to_name[i]);
-	}
+	for (i = 0; i < SYM_NUM; i++)
+		flex_array_free(p->sym_val_to_name[i]);
 
 	kfree(p->class_val_to_struct);
 	kfree(p->role_val_to_struct);
 	kfree(p->user_val_to_struct);
-	if (p->type_val_to_struct_array)
-		flex_array_free(p->type_val_to_struct_array);
+	flex_array_free(p->type_val_to_struct_array);
 
 	avtab_destroy(&p->te_avtab);
 
_

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-08 23:52                                                     ` Andrew Morton
  (?)
@ 2011-09-09  0:24                                                       ` Pavel Emelyanov
  -1 siblings, 0 replies; 82+ messages in thread
From: Pavel Emelyanov @ 2011-09-09  0:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov,
	containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	kernel-hardening, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Alexey Dobriyan, Al Viro

On 09/09/2011 03:52 AM, Andrew Morton wrote:
> On Thu, 8 Sep 2011 10:04:05 +0400
> Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
>> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 
> Ho hum, I've pretty much run out of excuses to avoid merging this.
> 
> except...
> 
> We don't really want to bloat fs/proc/base.o by 4k until all the other
> things which support c/r are mergeable and we know that the whole
> project is actually useful.  When will we be at this stage?

Well, I see no other stuff that will be required for us in the nearest
future (well, and in the not-so-near future as well) in the fs/proc/base.o
to checkpoint or restore a task.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-09  0:24                                                       ` Pavel Emelyanov
  0 siblings, 0 replies; 82+ messages in thread
From: Pavel Emelyanov @ 2011-09-09  0:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov,
	containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	kernel-hardening, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Alexey Dobriyan, Al Viro

On 09/09/2011 03:52 AM, Andrew Morton wrote:
> On Thu, 8 Sep 2011 10:04:05 +0400
> Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
>> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 
> Ho hum, I've pretty much run out of excuses to avoid merging this.
> 
> except...
> 
> We don't really want to bloat fs/proc/base.o by 4k until all the other
> things which support c/r are mergeable and we know that the whole
> project is actually useful.  When will we be at this stage?

Well, I see no other stuff that will be required for us in the nearest
future (well, and in the not-so-near future as well) in the fs/proc/base.o
to checkpoint or restore a task.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-09  0:24                                                       ` Pavel Emelyanov
  0 siblings, 0 replies; 82+ messages in thread
From: Pavel Emelyanov @ 2011-09-09  0:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Cyrill Gorcunov, Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov,
	containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	kernel-hardening, Oren Laadan, Daniel Lezcano, Glauber Costa,
	James Bottomley, Alexey Dobriyan, Al Viro

On 09/09/2011 03:52 AM, Andrew Morton wrote:
> On Thu, 8 Sep 2011 10:04:05 +0400
> Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
>> fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 
> Ho hum, I've pretty much run out of excuses to avoid merging this.
> 
> except...
> 
> We don't really want to bloat fs/proc/base.o by 4k until all the other
> things which support c/r are mergeable and we know that the whole
> project is actually useful.  When will we be at this stage?

Well, I see no other stuff that will be required for us in the nearest
future (well, and in the not-so-near future as well) in the fs/proc/base.o
to checkpoint or restore a task.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-08 23:52                                                     ` Andrew Morton
@ 2011-09-09  5:48                                                       ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-09  5:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 04:52:01PM -0700, Andrew Morton wrote:
> On Thu, 8 Sep 2011 10:04:05 +0400
> Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 
> Ho hum, I've pretty much run out of excuses to avoid merging this.
> 
> except...
> 
> We don't really want to bloat fs/proc/base.o by 4k until all the other
> things which support c/r are mergeable and we know that the whole
> project is actually useful.  When will we be at this stage?

I hope we will bring in a final set in a couple of weeks.

> 
> <looks at the warning>
> 
> fs/proc/base.c: In function 'proc_map_files_instantiate':
> fs/proc/base.c:2348: warning: assignment from incompatible pointer type
> 
> err, that code will crash at runtime and it isn't trivial to fix. 
> How could this happen?
> 

Hmm. I never saw this warning. (Andrew, I'm still unable to fetch
your current -mm tree, is there some place other than kernel.org?
So the patch is done on top of 3.1-rc3). I guess this warrning is
from p = flex_array_get(fa, i); ? (since I don't have any warning
at all).

> >
> > ...
> >
> > +				if (fa)
> > +					flex_array_free(fa);
> >
> > ...
> >
> > +		if (fa)
> > +			flex_array_free(fa);
> 
> I think I'll do this:
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> 
> Lots of callers are avoiding passing NULL into flex_array_free().  Move
> the check into flex_array_free() in the usual fashion.
> 
> Cc: Stephen Smalley <sds@tycho.nsa.gov>
> Cc: James Morris <jmorris@namei.org>
> Cc: Cyrill Gorcunov <gorcunov@gmail.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---

Yeah, great. Moreover, flex_array_free calls for kfree which
support NULL argument so it's natural to make this one NULL
capable as well.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-09  5:48                                                       ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-09  5:48 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 04:52:01PM -0700, Andrew Morton wrote:
> On Thu, 8 Sep 2011 10:04:05 +0400
> Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> 
> > fs, proc: Introduce the /proc/<pid>/map_files/ directory v11
> 
> Ho hum, I've pretty much run out of excuses to avoid merging this.
> 
> except...
> 
> We don't really want to bloat fs/proc/base.o by 4k until all the other
> things which support c/r are mergeable and we know that the whole
> project is actually useful.  When will we be at this stage?

I hope we will bring in a final set in a couple of weeks.

> 
> <looks at the warning>
> 
> fs/proc/base.c: In function 'proc_map_files_instantiate':
> fs/proc/base.c:2348: warning: assignment from incompatible pointer type
> 
> err, that code will crash at runtime and it isn't trivial to fix. 
> How could this happen?
> 

Hmm. I never saw this warning. (Andrew, I'm still unable to fetch
your current -mm tree, is there some place other than kernel.org?
So the patch is done on top of 3.1-rc3). I guess this warrning is
from p = flex_array_get(fa, i); ? (since I don't have any warning
at all).

> >
> > ...
> >
> > +				if (fa)
> > +					flex_array_free(fa);
> >
> > ...
> >
> > +		if (fa)
> > +			flex_array_free(fa);
> 
> I think I'll do this:
> 
> From: Andrew Morton <akpm@linux-foundation.org>
> 
> Lots of callers are avoiding passing NULL into flex_array_free().  Move
> the check into flex_array_free() in the usual fashion.
> 
> Cc: Stephen Smalley <sds@tycho.nsa.gov>
> Cc: James Morris <jmorris@namei.org>
> Cc: Cyrill Gorcunov <gorcunov@gmail.com>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> ---

Yeah, great. Moreover, flex_array_free calls for kfree which
support NULL argument so it's natural to make this one NULL
capable as well.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-09  5:48                                                       ` [kernel-hardening] " Cyrill Gorcunov
@ 2011-09-09  6:00                                                         ` Andrew Morton
  -1 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-09  6:00 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Fri, 9 Sep 2011 09:48:19 +0400 Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> > 
> > <looks at the warning>
> > 
> > fs/proc/base.c: In function 'proc_map_files_instantiate':
> > fs/proc/base.c:2348: warning: assignment from incompatible pointer type
> > 
> > err, that code will crash at runtime and it isn't trivial to fix. 
> > How could this happen?
> > 
> 
> Hmm. I never saw this warning. (Andrew, I'm still unable to fetch
> your current -mm tree, is there some place other than kernel.org?

Nope, sorry - we're dead in the water at present.

> So the patch is done on top of 3.1-rc3). I guess this warrning is
> from p = flex_array_get(fa, i); ? (since I don't have any warning
> at all).

The warning is from

	ei->op.proc_get_link = proc_map_files_get_link;

The lhs has type

union proc_op {
	int (*proc_get_link)(struct inode *, struct path *);

and the rhs has type

static int proc_map_files_get_link(struct dentry *dentry, struct path *path)

So we end up passing an inode* to a function which expects a dentry*.

That's in 3.1-rc4.  proc_op.proc_get_link() hasn't changed since 3.0 (at least).

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-09  6:00                                                         ` Andrew Morton
  0 siblings, 0 replies; 82+ messages in thread
From: Andrew Morton @ 2011-09-09  6:00 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Fri, 9 Sep 2011 09:48:19 +0400 Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> > 
> > <looks at the warning>
> > 
> > fs/proc/base.c: In function 'proc_map_files_instantiate':
> > fs/proc/base.c:2348: warning: assignment from incompatible pointer type
> > 
> > err, that code will crash at runtime and it isn't trivial to fix. 
> > How could this happen?
> > 
> 
> Hmm. I never saw this warning. (Andrew, I'm still unable to fetch
> your current -mm tree, is there some place other than kernel.org?

Nope, sorry - we're dead in the water at present.

> So the patch is done on top of 3.1-rc3). I guess this warrning is
> from p = flex_array_get(fa, i); ? (since I don't have any warning
> at all).

The warning is from

	ei->op.proc_get_link = proc_map_files_get_link;

The lhs has type

union proc_op {
	int (*proc_get_link)(struct inode *, struct path *);

and the rhs has type

static int proc_map_files_get_link(struct dentry *dentry, struct path *path)

So we end up passing an inode* to a function which expects a dentry*.

That's in 3.1-rc4.  proc_op.proc_get_link() hasn't changed since 3.0 (at least).

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-09  6:00                                                         ` [kernel-hardening] " Andrew Morton
@ 2011-09-09  6:22                                                           ` Cyrill Gorcunov
  -1 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-09  6:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 11:00:20PM -0700, Andrew Morton wrote:
...
> > 
> > Hmm. I never saw this warning. (Andrew, I'm still unable to fetch
> > your current -mm tree, is there some place other than kernel.org?
> 
> Nope, sorry - we're dead in the water at present.
> 
> > So the patch is done on top of 3.1-rc3). I guess this warrning is
> > from p = flex_array_get(fa, i); ? (since I don't have any warning
> > at all).
> 
> The warning is from
> 
> 	ei->op.proc_get_link = proc_map_files_get_link;
> 
> The lhs has type
> 
> union proc_op {
> 	int (*proc_get_link)(struct inode *, struct path *);
> 
> and the rhs has type
> 
> static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
> 
> So we end up passing an inode* to a function which expects a dentry*.
> 
> That's in 3.1-rc4.  proc_op.proc_get_link() hasn't changed since 3.0 (at least).

Crap. I know what happened. At first proposal time Tejun (iirc ;) said that
is might be better to separate proc_get_link change. I did so... and of
course I forgot to send it out ;) Ie it's in my queue and I dont see any
warnings for that reason. Sorry for that. Attached below.
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: fs, proc: Make proc_get_link to use dentry instead of inode

This patch prepares the ground for the next "map_files"
patch which needs a name of a link file to analyse.

So instead of squashing this change into one big
patch the separate one is done.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c          |   20 ++++++++++----------
 include/linux/proc_fs.h |    2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,9 +165,9 @@ static int get_task_root(struct task_str
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -1580,13 +1580,13 @@ static const struct file_operations proc
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct file *exe_file;
 
-	task = get_proc_task(inode);
+	task = get_proc_task(dentry->d_inode);
 	if (!task)
 		return -ENOENT;
 	mm = get_task_mm(task);
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
 	if (error)
 		goto out;
 
@@ -1947,9 +1947,9 @@ static int proc_fd_info(struct inode *in
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(inode, path, NULL);
+	return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-	int (*proc_get_link)(struct inode *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
@ 2011-09-09  6:22                                                           ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-09  6:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Vasiliy Kulikov, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, kernel-hardening,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Thu, Sep 08, 2011 at 11:00:20PM -0700, Andrew Morton wrote:
...
> > 
> > Hmm. I never saw this warning. (Andrew, I'm still unable to fetch
> > your current -mm tree, is there some place other than kernel.org?
> 
> Nope, sorry - we're dead in the water at present.
> 
> > So the patch is done on top of 3.1-rc3). I guess this warrning is
> > from p = flex_array_get(fa, i); ? (since I don't have any warning
> > at all).
> 
> The warning is from
> 
> 	ei->op.proc_get_link = proc_map_files_get_link;
> 
> The lhs has type
> 
> union proc_op {
> 	int (*proc_get_link)(struct inode *, struct path *);
> 
> and the rhs has type
> 
> static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
> 
> So we end up passing an inode* to a function which expects a dentry*.
> 
> That's in 3.1-rc4.  proc_op.proc_get_link() hasn't changed since 3.0 (at least).

Crap. I know what happened. At first proposal time Tejun (iirc ;) said that
is might be better to separate proc_get_link change. I did so... and of
course I forgot to send it out ;) Ie it's in my queue and I dont see any
warnings for that reason. Sorry for that. Attached below.
---
From: Cyrill Gorcunov <gorcunov@openvz.org>
Subject: fs, proc: Make proc_get_link to use dentry instead of inode

This patch prepares the ground for the next "map_files"
patch which needs a name of a link file to analyse.

So instead of squashing this change into one big
patch the separate one is done.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/base.c          |   20 ++++++++++----------
 include/linux/proc_fs.h |    2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,9 +165,9 @@ static int get_task_root(struct task_str
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -1580,13 +1580,13 @@ static const struct file_operations proc
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct file *exe_file;
 
-	task = get_proc_task(inode);
+	task = get_proc_task(dentry->d_inode);
 	if (!task)
 		return -ENOENT;
 	mm = get_task_mm(task);
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
 	if (error)
 		goto out;
 
@@ -1947,9 +1947,9 @@ static int proc_fd_info(struct inode *in
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(inode, path, NULL);
+	return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-	int (*proc_get_link)(struct inode *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-08  6:04                                                   ` [kernel-hardening] " Cyrill Gorcunov
  (?)
  (?)
@ 2011-09-10 13:21                                                   ` Vasiliy Kulikov
  2011-09-10 13:49                                                     ` Cyrill Gorcunov
  -1 siblings, 1 reply; 82+ messages in thread
From: Vasiliy Kulikov @ 2011-09-10 13:21 UTC (permalink / raw)
  To: kernel-hardening
  Cc: Andrew Morton, Tejun Heo, Kirill A. Shutemov, containers,
	linux-kernel, linux-fsdevel, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Glauber Costa, James Bottomley, Alexey Dobriyan,
	Al Viro, Pavel Emelyanov

Hi Cyrill,

On Thu, Sep 08, 2011 at 10:04 +0400, Cyrill Gorcunov wrote:
> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	unsigned long vm_start, vm_end;
> +	bool exact_vma_exists = false;
> +	struct task_struct *task;
> +	const struct cred *cred;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))

        put_task_struct(task) belongs here.

> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
> +		down_read(&mm->mmap_sem);
> +		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
> +		up_read(&mm->mmap_sem);
> +	}
> +
> +	mmput(mm);
> +
> +	if (exact_vma_exists) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;
> +	}
> +out:
> +	d_drop(dentry);
> +	return 0;
> +}

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [kernel-hardening] Re: [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6
  2011-09-10 13:21                                                   ` Vasiliy Kulikov
@ 2011-09-10 13:49                                                     ` Cyrill Gorcunov
  0 siblings, 0 replies; 82+ messages in thread
From: Cyrill Gorcunov @ 2011-09-10 13:49 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: kernel-hardening, Andrew Morton, Tejun Heo, Kirill A. Shutemov,
	containers, linux-kernel, linux-fsdevel, Nathan Lynch,
	Oren Laadan, Daniel Lezcano, Glauber Costa, James Bottomley,
	Alexey Dobriyan, Al Viro, Pavel Emelyanov

On Sat, Sep 10, 2011 at 05:21:01PM +0400, Vasiliy Kulikov wrote:
...
> > +
> > +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> 
>         put_task_struct(task) belongs here.
> 

Yeah, thanks! I'll update.

	Cyrill

^ permalink raw reply	[flat|nested] 82+ messages in thread

end of thread, other threads:[~2011-09-10 13:49 UTC | newest]

Thread overview: 82+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-31  7:58 [patch 0/2] Introduce /proc/pid/map_files v6 Cyrill Gorcunov
2011-08-31  7:58 ` [patch 1/2] fs, proc: Make proc_get_link to use dentry instead of inode Cyrill Gorcunov
2011-08-31  7:58 ` [patch 2/2] fs, proc: Introduce the /proc/<pid>/map_files/ directory v6 Cyrill Gorcunov
2011-08-31  9:06   ` Vasiliy Kulikov
2011-08-31 10:12     ` Cyrill Gorcunov
2011-08-31 11:26     ` Cyrill Gorcunov
2011-08-31 14:04       ` Kirill A. Shutemov
2011-08-31 14:09         ` Cyrill Gorcunov
2011-08-31 14:26         ` Cyrill Gorcunov
2011-08-31 22:10           ` Andrew Morton
2011-09-01  3:07             ` Kyle Moffett
2011-09-01  3:07               ` Kyle Moffett
2011-09-01  7:58             ` Pavel Emelyanov
2011-09-01 11:50               ` Tejun Heo
2011-09-01 12:13                 ` Pavel Emelyanov
2011-09-01 17:13                   ` Tejun Heo
2011-09-02 19:15                     ` Matt Helsley
2011-09-02  0:09               ` Matt Helsley
2011-09-01  8:05             ` Cyrill Gorcunov
2011-09-02 16:37               ` Vasiliy Kulikov
2011-09-02 16:37                 ` [kernel-hardening] " Vasiliy Kulikov
2011-09-05 18:53                 ` Vasiliy Kulikov
2011-09-05 18:53                   ` [kernel-hardening] " Vasiliy Kulikov
2011-09-05 19:20                   ` Cyrill Gorcunov
2011-09-05 19:20                     ` [kernel-hardening] " Cyrill Gorcunov
2011-09-05 19:49                     ` Vasiliy Kulikov
2011-09-05 19:49                       ` [kernel-hardening] " Vasiliy Kulikov
2011-09-05 20:36                       ` Cyrill Gorcunov
2011-09-05 20:36                         ` [kernel-hardening] " Cyrill Gorcunov
2011-09-06 10:15                         ` Vasiliy Kulikov
2011-09-06 10:15                           ` [kernel-hardening] " Vasiliy Kulikov
2011-09-06 16:51                           ` Tejun Heo
2011-09-06 16:51                             ` [kernel-hardening] " Tejun Heo
2011-09-06 17:29                             ` Vasiliy Kulikov
2011-09-06 17:29                               ` [kernel-hardening] " Vasiliy Kulikov
2011-09-06 17:33                               ` Tejun Heo
2011-09-06 17:33                                 ` [kernel-hardening] " Tejun Heo
2011-09-06 18:15                                 ` Cyrill Gorcunov
2011-09-06 18:15                                   ` [kernel-hardening] " Cyrill Gorcunov
     [not found]                                 ` <20110906173341.GM18425-9pTldWuhBndy/B6EtB590w@public.gmane.org>
2011-09-07 11:23                                   ` Vasiliy Kulikov
2011-09-07 11:23                                     ` [kernel-hardening] " Vasiliy Kulikov
2011-09-07 21:53                                     ` Cyrill Gorcunov
2011-09-07 21:53                                       ` [kernel-hardening] " Cyrill Gorcunov
2011-09-07 22:13                                       ` Andrew Morton
2011-09-07 22:13                                         ` [kernel-hardening] " Andrew Morton
2011-09-07 22:13                                         ` Andrew Morton
2011-09-07 22:42                                         ` Cyrill Gorcunov
2011-09-07 22:42                                           ` [kernel-hardening] " Cyrill Gorcunov
2011-09-07 22:53                                           ` Andrew Morton
2011-09-07 22:53                                             ` [kernel-hardening] " Andrew Morton
2011-09-07 22:53                                             ` Andrew Morton
2011-09-08  5:48                                             ` Cyrill Gorcunov
2011-09-08  5:48                                               ` [kernel-hardening] " Cyrill Gorcunov
2011-09-08  5:50                                               ` Cyrill Gorcunov
2011-09-08  5:50                                                 ` [kernel-hardening] " Cyrill Gorcunov
2011-09-08  6:04                                                 ` Cyrill Gorcunov
2011-09-08  6:04                                                   ` [kernel-hardening] " Cyrill Gorcunov
2011-09-08 23:52                                                   ` Andrew Morton
2011-09-08 23:52                                                     ` [kernel-hardening] " Andrew Morton
2011-09-08 23:52                                                     ` Andrew Morton
2011-09-09  0:24                                                     ` Pavel Emelyanov
2011-09-09  0:24                                                       ` [kernel-hardening] " Pavel Emelyanov
2011-09-09  0:24                                                       ` Pavel Emelyanov
2011-09-09  5:48                                                     ` Cyrill Gorcunov
2011-09-09  5:48                                                       ` [kernel-hardening] " Cyrill Gorcunov
2011-09-09  6:00                                                       ` Andrew Morton
2011-09-09  6:00                                                         ` [kernel-hardening] " Andrew Morton
2011-09-09  6:22                                                         ` Cyrill Gorcunov
2011-09-09  6:22                                                           ` [kernel-hardening] " Cyrill Gorcunov
2011-09-10 13:21                                                   ` Vasiliy Kulikov
2011-09-10 13:49                                                     ` Cyrill Gorcunov
2011-09-01 10:46             ` Cyrill Gorcunov
2011-09-01 22:49               ` Andrew Morton
2011-09-01 23:04                 ` Tejun Heo
2011-09-02  5:54                   ` Cyrill Gorcunov
2011-09-02  5:53                 ` Cyrill Gorcunov
2011-08-31 22:50           ` Andrew Morton
2011-09-02  1:54   ` Nicholas Miell
2011-09-02  1:58     ` Tejun Heo
2011-09-02  2:04       ` Nicholas Miell
2011-09-02  2:29         ` Tejun Heo
2011-09-02  8:07           ` Kirill A. Shutemov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.