All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
@ 2011-08-24  8:53 Cyrill Gorcunov
  2011-08-24  9:21 ` Pekka Enberg
                   ` (4 more replies)
  0 siblings, 5 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24  8:53 UTC (permalink / raw)
  To: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo, Andrew Morton
  Cc: Glauber Costa, containers, linux-kernel, Pavel Emelyanov,
	Serge Hallyn, LINUXFS-ML, James Bottomley

From: Pavel Emelyanov <xemul@parallels.com>

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is vma->vm_start, the
target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

This thing is aimed to help checkpointing processes.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1b000 -> /lib64/ld-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1c000 -> /lib64/ld-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73e00000 -> /lib64/libc-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73f4e000 -> /lib64/libc-2.5.so

v2:
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - .d_revalidate is set now

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
 fs/proc/base.c          |  191 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/proc_fs.h |    5 +
 2 files changed, 195 insertions(+), 1 deletion(-)

Please review. This is a part of previous c/r patchset as well
but I guess it might be treated separately.

Objections, complains, comments are highly appreciated!

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2170,6 +2170,196 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= pid_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct inode *inode, struct path *path)
+{
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	unsigned long vm_start;
+	int rc = -ENOENT;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+
+	if (!mm)
+		goto out;
+
+	vm_start = PROC_I(inode)->vm_start;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, vm_start);
+	if (vma && vma->vm_start == vm_start && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+	mmput(mm);
+
+out:
+	return rc;
+}
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct vm_area_struct *vma = ptr;
+	struct file *file = vma->vm_file;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->vm_start		= vma->vm_start;
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	struct task_struct *task;
+	unsigned long vm_start;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+	char *endp;
+
+	result = ERR_PTR(-ENOENT);
+
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	vm_start = simple_strtoul(dentry->d_name.name, &endp, 16);
+	if (*endp != '\0')
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, vm_start);
+	if (!vma || vma->vm_start != vm_start)
+		goto out_no_vma;
+	result = proc_map_files_instantiate(dir, dentry, task, vma);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_setattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap, vmai = 2; vma; vma = vma->vm_next) {
+			char name[2 + 16 + 1];
+			int len;
+
+			if (!vma->vm_file)
+				continue;
+
+			vmai++;
+			if (vmai < filp->f_pos)
+				continue;
+
+			filp->f_pos++;
+			len = snprintf(name, sizeof(name), "0x%lx", vma->vm_start);
+			if (proc_fill_cache(filp, dirent, filldir,
+					    name, len, proc_map_files_instantiate,
+					    task, vma) < 0)
+				break;
+		}
+		up_read(&mm->mmap_sem);
+		mmput(mm);
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +2975,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -265,7 +265,10 @@ struct ctl_table;
 
 struct proc_inode {
 	struct pid *pid;
-	int fd;
+	union {
+		int fd;
+		unsigned long vm_start;
+	};
 	union proc_op op;
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  8:53 [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2 Cyrill Gorcunov
@ 2011-08-24  9:21 ` Pekka Enberg
  2011-08-24  9:33   ` Pavel Emelyanov
  2011-08-24  9:34 ` Tejun Heo
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 48+ messages in thread
From: Pekka Enberg @ 2011-08-24  9:21 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Glauber Costa, containers, linux-kernel,
	Pavel Emelyanov, Serge Hallyn, LINUXFS-ML, James Bottomley

On Wed, Aug 24, 2011 at 11:53 AM, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> From: Pavel Emelyanov <xemul@parallels.com>
>
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is vma->vm_start, the
> target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
>
> This thing is aimed to help checkpointing processes.

OK, but you really should explain _how_ this will help with checkpointing.

                        Pekka

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  9:21 ` Pekka Enberg
@ 2011-08-24  9:33   ` Pavel Emelyanov
  0 siblings, 0 replies; 48+ messages in thread
From: Pavel Emelyanov @ 2011-08-24  9:33 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Cyrill Gorcunov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Tejun Heo, Andrew Morton, Glauber Costa, containers,
	linux-kernel, Serge Hallyn, LINUXFS-ML, James Bottomley

On 08/24/2011 01:21 PM, Pekka Enberg wrote:
> On Wed, Aug 24, 2011 at 11:53 AM, Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>> From: Pavel Emelyanov <xemul@parallels.com>
>>
>> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
>> one for each mapping with file, the name of a symlink is vma->vm_start, the
>> target is the file. Opening a symlink results in a file that point exactly
>> to the same inode as them vma's one.
>>
>> This thing is aimed to help checkpointing processes.
> 
> OK, but you really should explain _how_ this will help with checkpointing.

This helps in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.
   
2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

>                         Pekka
> .
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  8:53 [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2 Cyrill Gorcunov
  2011-08-24  9:21 ` Pekka Enberg
@ 2011-08-24  9:34 ` Tejun Heo
  2011-08-24  9:37   ` Cyrill Gorcunov
  2011-08-24 11:18 ` Vasiliy Kulikov
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2011-08-24  9:34 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Andrew Morton,
	Glauber Costa, containers, linux-kernel, Pavel Emelyanov,
	Serge Hallyn, LINUXFS-ML, James Bottomley

Hello,

On Wed, Aug 24, 2011 at 12:53:29PM +0400, Cyrill Gorcunov wrote:
> +static const struct dentry_operations tid_map_files_dentry_operations = {
> +	.d_revalidate	= pid_revalidate,
> +	.d_delete	= pid_delete_dentry,
> +};

Why pid_revalidate?  Shouldn't it be verifying the entry against the
current vmas?  vmas (of course) can change while the process is
running.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  9:34 ` Tejun Heo
@ 2011-08-24  9:37   ` Cyrill Gorcunov
  2011-08-24  9:41     ` Cyrill Gorcunov
  2011-08-24  9:41       ` Cyrill Gorcunov
  0 siblings, 2 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24  9:37 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Andrew Morton,
	Glauber Costa, containers, linux-kernel, Pavel Emelyanov,
	Serge Hallyn, LINUXFS-ML, James Bottomley

On Wed, Aug 24, 2011 at 11:34:31AM +0200, Tejun Heo wrote:
> Hello,
> 
> On Wed, Aug 24, 2011 at 12:53:29PM +0400, Cyrill Gorcunov wrote:
> > +static const struct dentry_operations tid_map_files_dentry_operations = {
> > +	.d_revalidate	= pid_revalidate,
> > +	.d_delete	= pid_delete_dentry,
> > +};
> 
> Why pid_revalidate?  Shouldn't it be verifying the entry against the
> current vmas?  vmas (of course) can change while the process is
> running.
> 
> Thanks.
> 

Hmm, good point, letme check...

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  9:37   ` Cyrill Gorcunov
@ 2011-08-24  9:41     ` Cyrill Gorcunov
  2011-08-24  9:41       ` Cyrill Gorcunov
  1 sibling, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24  9:41 UTC (permalink / raw)
  To: Tejun Heo, Nathan Lynch, Oren Laadan, Daniel Lezcano, Andrew Morton

On Wed, Aug 24, 2011 at 01:37:52PM +0400, Cyrill Gorcunov wrote:
...
> 
> Hmm, good point, letme check...
> 

Yes, Tejun, we need an own revalidate here! So'll update the patch
anyway (will put Pavel's comments into changelog). I somehow forgot
in first place that this entries might be referred even if process is
not stopped. Thanks for chatching it!

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  9:37   ` Cyrill Gorcunov
@ 2011-08-24  9:41       ` Cyrill Gorcunov
  2011-08-24  9:41       ` Cyrill Gorcunov
  1 sibling, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24  9:41 UTC (permalink / raw)
  To: Tejun Heo, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Glauber Costa, containers, linux-kernel,
	Pavel Emelyanov, Serge Hallyn, LINUXFS-ML, James Bottomley

On Wed, Aug 24, 2011 at 01:37:52PM +0400, Cyrill Gorcunov wrote:
...
> 
> Hmm, good point, letme check...
> 

Yes, Tejun, we need an own revalidate here! So'll update the patch
anyway (will put Pavel's comments into changelog). I somehow forgot
in first place that this entries might be referred even if process is
not stopped. Thanks for chatching it!

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
@ 2011-08-24  9:41       ` Cyrill Gorcunov
  0 siblings, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24  9:41 UTC (permalink / raw)
  To: Tejun Heo, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Glauber

On Wed, Aug 24, 2011 at 01:37:52PM +0400, Cyrill Gorcunov wrote:
...
> 
> Hmm, good point, letme check...
> 

Yes, Tejun, we need an own revalidate here! So'll update the patch
anyway (will put Pavel's comments into changelog). I somehow forgot
in first place that this entries might be referred even if process is
not stopped. Thanks for chatching it!

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  8:53 [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2 Cyrill Gorcunov
  2011-08-24  9:21 ` Pekka Enberg
  2011-08-24  9:34 ` Tejun Heo
@ 2011-08-24 11:18 ` Vasiliy Kulikov
  2011-08-24 11:31   ` Cyrill Gorcunov
  2011-08-25  8:29   ` Cyrill Gorcunov
  2011-08-24 15:05   ` Zan Lynx
  2011-09-13 14:14 ` Pavel Machek
  4 siblings, 2 replies; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-24 11:18 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers

Hi Cyrill,

On Wed, Aug 24, 2011 at 12:53 +0400, Cyrill Gorcunov wrote:
> +static struct dentry *proc_map_files_lookup(struct inode *dir,
> +		struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct task_struct *task;
> +	unsigned long vm_start;
> +	struct vm_area_struct *vma;
> +	struct mm_struct *mm;
> +	struct dentry *result;
> +	char *endp;
> +
> +	result = ERR_PTR(-ENOENT);
> +
> +	task = get_proc_task(dir);
> +	if (!task)
> +		goto out_no_task;
> +
> +	vm_start = simple_strtoul(dentry->d_name.name, &endp, 16);
> +	if (*endp != '\0')
> +		goto out_no_mm;
> +
> +	mm = get_task_mm(task);
> +	if (!mm)
> +		goto out_no_mm;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_vma(mm, vm_start);
> +	if (!vma || vma->vm_start != vm_start)
> +		goto out_no_vma;
> +	result = proc_map_files_instantiate(dir, dentry, task, vma);
> +
> +out_no_vma:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +out_no_mm:
> +	put_task_struct(task);
> +out_no_task:
> +	return result;
> +}

You have no ptrace_may_access() check in ->lookup(), only in ->readdir().

> +static const struct inode_operations proc_map_files_inode_operations = {
> +	.lookup		= proc_map_files_lookup,
> +	.setattr	= proc_setattr,
> +};
> +
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
...
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
...
> +	}

Thanks,

-- 
Vasiliy

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24 11:18 ` Vasiliy Kulikov
@ 2011-08-24 11:31   ` Cyrill Gorcunov
  2011-08-25  8:29   ` Cyrill Gorcunov
  1 sibling, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24 11:31 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers

On Wed, Aug 24, 2011 at 03:18:06PM +0400, Vasiliy Kulikov wrote:
...
> 
> You have no ptrace_may_access() check in ->lookup(), only in ->readdir().
> 

Thanks Vasiliy! I'm reworking it anyway ;)

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  8:53 [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2 Cyrill Gorcunov
@ 2011-08-24 15:05   ` Zan Lynx
  2011-08-24  9:34 ` Tejun Heo
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 48+ messages in thread
From: Zan Lynx @ 2011-08-24 15:05 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Pavel Emelyanov, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	James Bottomley, containers-qjLDD68F18O7TbgM5vRIOg, Nathan Lynch,
	Tejun Heo, LINUXFS-ML, Daniel Lezcano, Andrew Morton

On 8/24/2011 2:53 AM, Cyrill Gorcunov wrote:
> From: Pavel Emelyanov <xemul-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is vma->vm_start, the
> target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> This thing is aimed to help checkpointing processes.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
[snip]

Just curious: How do these symlinks work when the process reading a
/proc file is in a chroot or a different namespace?

For example, a chroot environment might have independent copies of
/lib64/ld-2.5.so and a bind mount of /proc. Does the symlink then point
to the wrong file?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
@ 2011-08-24 15:05   ` Zan Lynx
  0 siblings, 0 replies; 48+ messages in thread
From: Zan Lynx @ 2011-08-24 15:05 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Glauber Costa, containers, linux-kernel,
	Pavel Emelyanov, Serge Hallyn, LINUXFS-ML, James Bottomley

On 8/24/2011 2:53 AM, Cyrill Gorcunov wrote:
> From: Pavel Emelyanov <xemul@parallels.com>
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is vma->vm_start, the
> target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> This thing is aimed to help checkpointing processes.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
[snip]

Just curious: How do these symlinks work when the process reading a
/proc file is in a chroot or a different namespace?

For example, a chroot environment might have independent copies of
/lib64/ld-2.5.so and a bind mount of /proc. Does the symlink then point
to the wrong file?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24 15:05   ` Zan Lynx
  (?)
@ 2011-08-24 15:19   ` Pavel Emelyanov
  2011-08-24 17:36       ` Andi Kleen
  -1 siblings, 1 reply; 48+ messages in thread
From: Pavel Emelyanov @ 2011-08-24 15:19 UTC (permalink / raw)
  To: Zan Lynx
  Cc: Cyrill Gorcunov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Tejun Heo, Andrew Morton, Glauber Costa, containers,
	linux-kernel, Serge Hallyn, LINUXFS-ML, James Bottomley

On 08/24/2011 07:05 PM, Zan Lynx wrote:
> On 8/24/2011 2:53 AM, Cyrill Gorcunov wrote:
>> From: Pavel Emelyanov <xemul@parallels.com>
>>
>> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
>> one for each mapping with file, the name of a symlink is vma->vm_start, the
>> target is the file. Opening a symlink results in a file that point exactly
>> to the same inode as them vma's one.
>>
>> This thing is aimed to help checkpointing processes.
>>
>> For example the ls -l of some arbitrary /proc/<pid>/map_files/
>>
>>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
> [snip]
> 
> Just curious: How do these symlinks work when the process reading a
> /proc file is in a chroot or a different namespace?
> 
> For example, a chroot environment might have independent copies of
> /lib64/ld-2.5.so and a bind mount of /proc. Does the symlink then point
> to the wrong file?

No and this is the trick - when you readlink it - it give you trash, but
when you open one - you get exactly the same file as the map points to.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24 15:05   ` Zan Lynx
  (?)
  (?)
@ 2011-08-24 15:22   ` Cyrill Gorcunov
  -1 siblings, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-24 15:22 UTC (permalink / raw)
  To: Zan Lynx
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Glauber Costa, containers, linux-kernel,
	Pavel Emelyanov, Serge Hallyn, LINUXFS-ML, James Bottomley

On Wed, Aug 24, 2011 at 09:05:21AM -0600, Zan Lynx wrote:
> On 8/24/2011 2:53 AM, Cyrill Gorcunov wrote:
> > From: Pavel Emelyanov <xemul@parallels.com>
> > 
> > This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> > one for each mapping with file, the name of a symlink is vma->vm_start, the
> > target is the file. Opening a symlink results in a file that point exactly
> > to the same inode as them vma's one.
> > 
> > This thing is aimed to help checkpointing processes.
> > 
> > For example the ls -l of some arbitrary /proc/<pid>/map_files/
> > 
> >  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
> [snip]
> 
> Just curious: How do these symlinks work when the process reading a
> /proc file is in a chroot or a different namespace?
> 
> For example, a chroot environment might have independent copies of
> /lib64/ld-2.5.so and a bind mount of /proc. Does the symlink then point
> to the wrong file?

To be fair, I personally didn't tried such scenario but it should be
the same behaviour as for /proc/$pid/fd since data is taken from
a task pointed by pid. I'll check though, good question.

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24 15:19   ` Pavel Emelyanov
@ 2011-08-24 17:36       ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-24 17:36 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Zan Lynx, Cyrill Gorcunov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Tejun Heo, Andrew Morton, Glauber Costa,
	containers, linux-kernel, Serge Hallyn, LINUXFS-ML,
	James Bottomley

Pavel Emelyanov <xemul@parallels.com> writes:
>
> No and this is the trick - when you readlink it - it give you trash, but
> when you open one - you get exactly the same file as the map points to.

Isn't that a minor security hole?

For example if I pass a file descriptor into a chroot process for
reading, and with this interface you can open it for writing too.
I could see this causing problems.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
@ 2011-08-24 17:36       ` Andi Kleen
  0 siblings, 0 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-24 17:36 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Zan Lynx, Cyrill Gorcunov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Tejun Heo, Andrew Morton, Glauber Costa,
	containers, linux-kernel, Serge Hallyn, LINUXFS-ML,
	James Bottomley

Pavel Emelyanov <xemul@parallels.com> writes:
>
> No and this is the trick - when you readlink it - it give you trash, but
> when you open one - you get exactly the same file as the map points to.

Isn't that a minor security hole?

For example if I pass a file descriptor into a chroot process for
reading, and with this interface you can open it for writing too.
I could see this causing problems.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24 17:36       ` Andi Kleen
  (?)
@ 2011-08-25  6:42       ` Pavel Emelyanov
  2011-08-25 14:04         ` Andi Kleen
  -1 siblings, 1 reply; 48+ messages in thread
From: Pavel Emelyanov @ 2011-08-25  6:42 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Zan Lynx, Cyrill Gorcunov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Tejun Heo, Andrew Morton, Glauber Costa,
	containers, linux-kernel, Serge Hallyn, LINUXFS-ML,
	James Bottomley

On 08/24/2011 09:36 PM, Andi Kleen wrote:
> Pavel Emelyanov <xemul@parallels.com> writes:
>>
>> No and this is the trick - when you readlink it - it give you trash, but
>> when you open one - you get exactly the same file as the map points to.
> 
> Isn't that a minor security hole?
> 
> For example if I pass a file descriptor into a chroot process for
> reading, and with this interface you can open it for writing too.
> I could see this causing problems.

How does it differ from the /proc/pid/fd links?

> -Andi
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24 11:18 ` Vasiliy Kulikov
  2011-08-24 11:31   ` Cyrill Gorcunov
@ 2011-08-25  8:29   ` Cyrill Gorcunov
  2011-08-25 17:01     ` Tejun Heo
  2011-08-25 17:36     ` Vasiliy Kulikov
  1 sibling, 2 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25  8:29 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Wed, Aug 24, 2011 at 03:18:06PM +0400, Vasiliy Kulikov wrote:
...
> 
> You have no ptrace_may_access() check in ->lookup(), only in ->readdir().
> 

A huge thanks to all for feedback! Please review this updated version.
The main changes are at v3 mark in changelog. Complains (as always) are
welcome ;)

	Cyrill
---
From: Pavel Emelyanov <xemul@parallels.com>
fs, proc: Introduce the /proc/<pid>/map_files/ directory v3

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is vma->vm_start, the
target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1b000 -> /lib64/ld-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1c000 -> /lib64/ld-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73e00000 -> /lib64/libc-2.5.so
 | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73f4e000 -> /lib64/libc-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2:
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries
 - ptrace_may_access added into proc_map_files_lookup
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
 fs/proc/base.c          |  278 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/proc_fs.h |    5 
 2 files changed, 282 insertions(+), 1 deletion(-)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2170,6 +2170,283 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+	if (vma && vma->vm_start != vm_start)
+		vma = NULL;
+	return vma;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, PROC_I(inode)->vm_start);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	if (vma)
+		return 1;
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct inode *inode, struct path *path)
+{
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, PROC_I(inode)->vm_start);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+	mmput(mm);
+
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	vm_start;
+	unsigned char	name[24];
+	unsigned long	len;
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct map_files_info *info = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!info->file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->vm_start		= info->vm_start;
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (info->file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (info->file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	struct task_struct *task;
+	unsigned long vm_start;
+	struct vm_area_struct *vma;
+	struct map_files_info info;
+	struct mm_struct *mm;
+	struct dentry *result;
+	char *endp;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ));
+		goto out_no_mm;
+
+	vm_start = simple_strtoul(dentry->d_name.name, &endp, 16);
+	if (*endp != '\0')
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start);
+	if (!vma)
+		goto out_no_vma;
+
+	memset(&info, 0, sizeof(info));
+	info.file	= vma->vm_file;
+	info.vm_start	= vm_start;
+
+	result = proc_map_files_instantiate(dir, dentry, task, &info);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_setattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, used, i;
+		struct map_files_info *info;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+		if (!nr_files)
+			goto out;
+
+		info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
+		if (!info) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		used = 0;
+		for (vma = mm->mmap, vmai = 2; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			vmai++;
+			if (vmai <= filp->f_pos)
+				continue;
+
+			get_file(vma->vm_file);
+			info[used].file	= vma->vm_file;
+			info[used].vm_start= vma->vm_start;
+
+			info[used].len = snprintf(info[used].name,
+						  sizeof(info[used].name),
+						  "0x%lx", vma->vm_start);
+			used++;
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name,
+					      info[i].len,
+					      proc_map_files_instantiate,
+					      task, &info[i]);
+			if (ret)
+				break;
+			filp->f_pos++;
+		}
+
+		for (i = 0; i < used; i++)
+			put_filp(info[i].file);
+
+		kfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +3062,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -265,7 +265,10 @@ struct ctl_table;
 
 struct proc_inode {
 	struct pid *pid;
-	int fd;
+	union {
+		int fd;
+		unsigned long vm_start;
+	};
 	union proc_op op;
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25  6:42       ` Pavel Emelyanov
@ 2011-08-25 14:04         ` Andi Kleen
  2011-08-25 14:30           ` Cyrill Gorcunov
  2011-08-25 14:47           ` Pavel Emelyanov
  0 siblings, 2 replies; 48+ messages in thread
From: Andi Kleen @ 2011-08-25 14:04 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Andi Kleen, Zan Lynx, Cyrill Gorcunov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Tejun Heo, Andrew Morton, Glauber Costa,
	containers, linux-kernel, Serge Hallyn, LINUXFS-ML,
	James Bottomley

On Thu, Aug 25, 2011 at 10:42:44AM +0400, Pavel Emelyanov wrote:
> On 08/24/2011 09:36 PM, Andi Kleen wrote:
> > Pavel Emelyanov <xemul@parallels.com> writes:
> >>
> >> No and this is the trick - when you readlink it - it give you trash, but
> >> when you open one - you get exactly the same file as the map points to.
> > 
> > Isn't that a minor security hole?
> > 
> > For example if I pass a file descriptor into a chroot process for
> > reading, and with this interface you can open it for writing too.
> > I could see this causing problems.
> 
> How does it differ from the /proc/pid/fd links?

Those cannot be opened I thought.
-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 14:04         ` Andi Kleen
@ 2011-08-25 14:30           ` Cyrill Gorcunov
  2011-08-25 14:47           ` Pavel Emelyanov
  1 sibling, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 14:30 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Pavel Emelyanov, Zan Lynx, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Tejun Heo, Andrew Morton, Glauber Costa,
	containers, linux-kernel, Serge Hallyn, LINUXFS-ML,
	James Bottomley

On Thu, Aug 25, 2011 at 04:04:35PM +0200, Andi Kleen wrote:
...
> > > 
> > > For example if I pass a file descriptor into a chroot process for
> > > reading, and with this interface you can open it for writing too.
> > > I could see this causing problems.
> > 
> > How does it differ from the /proc/pid/fd links?
> 
> Those cannot be opened I thought.
> -Andi
> 

We have ptrace_may_access in lookup so I don't think you'll be
able to open it from another namespace or/and if you have not
enough credentials to.

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 14:04         ` Andi Kleen
  2011-08-25 14:30           ` Cyrill Gorcunov
@ 2011-08-25 14:47           ` Pavel Emelyanov
  1 sibling, 0 replies; 48+ messages in thread
From: Pavel Emelyanov @ 2011-08-25 14:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Zan Lynx, Cyrill Gorcunov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Tejun Heo, Andrew Morton, Glauber Costa,
	containers, linux-kernel, Serge Hallyn, LINUXFS-ML,
	James Bottomley

On 08/25/2011 06:04 PM, Andi Kleen wrote:
> On Thu, Aug 25, 2011 at 10:42:44AM +0400, Pavel Emelyanov wrote:
>> On 08/24/2011 09:36 PM, Andi Kleen wrote:
>>> Pavel Emelyanov <xemul@parallels.com> writes:
>>>>
>>>> No and this is the trick - when you readlink it - it give you trash, but
>>>> when you open one - you get exactly the same file as the map points to.
>>>
>>> Isn't that a minor security hole?
>>>
>>> For example if I pass a file descriptor into a chroot process for
>>> reading, and with this interface you can open it for writing too.
>>> I could see this causing problems.
>>
>> How does it differ from the /proc/pid/fd links?
> 
> Those cannot be opened I thought.

Neither can be these links then - I use the same access checks in my code.

> -Andi
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25  8:29   ` Cyrill Gorcunov
@ 2011-08-25 17:01     ` Tejun Heo
  2011-08-25 17:05       ` Pavel Emelyanov
                         ` (2 more replies)
  2011-08-25 17:36     ` Vasiliy Kulikov
  1 sibling, 3 replies; 48+ messages in thread
From: Tejun Heo @ 2011-08-25 17:01 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

Hello,

On Thu, Aug 25, 2011 at 12:29:44PM +0400, Cyrill Gorcunov wrote:
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1b000 -> /lib64/ld-2.5.so
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1c000 -> /lib64/ld-2.5.so
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73e00000 -> /lib64/libc-2.5.so
>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73f4e000 -> /lib64/libc-2.5.so

I would prefer if the filename included both start and end addresses
so that it matches the first column of /proc/PID/maps and ls'ing the
directory is more useful.  What do other people think?

> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct task_struct *task;
> +	struct vm_area_struct *vma;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_exact_vma(mm, PROC_I(inode)->vm_start);
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +
> +	if (vma)
> +		return 1;

Hmm... don't we need the same credential update as
tid_fd_revalidate()?  If the task seteuid's, we want the permissions
to change accordingly, right?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:01     ` Tejun Heo
@ 2011-08-25 17:05       ` Pavel Emelyanov
  2011-08-25 17:21         ` Cyrill Gorcunov
  2011-08-25 17:07       ` Cyrill Gorcunov
  2011-08-25 17:11       ` Cyrill Gorcunov
  2 siblings, 1 reply; 48+ messages in thread
From: Pavel Emelyanov @ 2011-08-25 17:05 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Cyrill Gorcunov, Vasiliy Kulikov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Andrew Morton, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On 08/25/2011 09:01 PM, Tejun Heo wrote:
> Hello,
> 
> On Thu, Aug 25, 2011 at 12:29:44PM +0400, Cyrill Gorcunov wrote:
>>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
>>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1b000 -> /lib64/ld-2.5.so
>>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1c000 -> /lib64/ld-2.5.so
>>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73e00000 -> /lib64/libc-2.5.so
>>  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73f4e000 -> /lib64/libc-2.5.so
> 
> I would prefer if the filename included both start and end addresses
> so that it matches the first column of /proc/PID/maps and ls'ing the
> directory is more useful.  What do other people think?

Without 0x at the beginning this is not very convenient, but once we add them it no
longer matches the contents of the /proc/pid/maps.

Just an opinion, do not mind adding <end> to the name.

>> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
>> +{
>> +	struct task_struct *task;
>> +	struct vm_area_struct *vma;
>> +	struct mm_struct *mm;
>> +	struct inode *inode;
>> +
>> +	if (nd && nd->flags & LOOKUP_RCU)
>> +		return -ECHILD;
>> +
>> +	inode = dentry->d_inode;
>> +	task = get_proc_task(inode);
>> +	if (!task)
>> +		goto out;
>> +
>> +	mm = get_task_mm(task);
>> +	put_task_struct(task);
>> +	if (!mm)
>> +		goto out;
>> +
>> +	down_read(&mm->mmap_sem);
>> +	vma = find_exact_vma(mm, PROC_I(inode)->vm_start);
>> +	up_read(&mm->mmap_sem);
>> +	mmput(mm);
>> +
>> +	if (vma)
>> +		return 1;
> 
> Hmm... don't we need the same credential update as
> tid_fd_revalidate()?  If the task seteuid's, we want the permissions
> to change accordingly, right?
> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:01     ` Tejun Heo
  2011-08-25 17:05       ` Pavel Emelyanov
@ 2011-08-25 17:07       ` Cyrill Gorcunov
  2011-08-25 20:54         ` Tejun Heo
  2011-08-25 17:11       ` Cyrill Gorcunov
  2 siblings, 1 reply; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 17:07 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 07:01:47PM +0200, Tejun Heo wrote:
> Hello,
> 
> On Thu, Aug 25, 2011 at 12:29:44PM +0400, Cyrill Gorcunov wrote:
> >  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73a00000 -> /lib64/ld-2.5.so
> >  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1b000 -> /lib64/ld-2.5.so
> >  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73c1c000 -> /lib64/ld-2.5.so
> >  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73e00000 -> /lib64/libc-2.5.so
> >  | lr-x------ 1 cyrill cyrill 64 Aug  9 15:25 0x3d73f4e000 -> /lib64/libc-2.5.so
> 
> I would prefer if the filename included both start and end addresses
> so that it matches the first column of /proc/PID/maps and ls'ing the
> directory is more useful.  What do other people think?
> 

This will force us to bloat proc_inode a bit more, since at moment we
have a union such as

struct proc_inode {
	struct pid *pid;
	union {
		int fd;
		unsigned long vm_start;
	};
	...

also it will require to make a buffer for names twice bigger as well.
If all this not a problem -- then sure we can do that ;)

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:01     ` Tejun Heo
  2011-08-25 17:05       ` Pavel Emelyanov
  2011-08-25 17:07       ` Cyrill Gorcunov
@ 2011-08-25 17:11       ` Cyrill Gorcunov
  2 siblings, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 17:11 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 07:01:47PM +0200, Tejun Heo wrote:
...
> 
> Hmm... don't we need the same credential update as
> tid_fd_revalidate()?  If the task seteuid's, we want the permissions
> to change accordingly, right?
> 
> Thanks.
>

Exactly! Thanks Tejun. I'll be reworking it anyway (acoording to what
we decide to do with start-end range in names).
 
	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:05       ` Pavel Emelyanov
@ 2011-08-25 17:21         ` Cyrill Gorcunov
  2011-08-25 17:25           ` Pavel Emelyanov
  0 siblings, 1 reply; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 17:21 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Tejun Heo, Vasiliy Kulikov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Andrew Morton, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 09:05:22PM +0400, Pavel Emelyanov wrote:
...
> > 
> > I would prefer if the filename included both start and end addresses
> > so that it matches the first column of /proc/PID/maps and ls'ing the
> > directory is more useful.  What do other people think?
> 
> Without 0x at the beginning this is not very convenient, but once we add them it no
> longer matches the contents of the /proc/pid/maps.
> 
> Just an opinion, do not mind adding <end> to the name.
> 

So Pavel, we can make the names to look as say "3d73a00000-3d73a01000"
leaving proc-inode structure unchanged, that's an idea, right?

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:21         ` Cyrill Gorcunov
@ 2011-08-25 17:25           ` Pavel Emelyanov
  2011-08-25 17:27             ` Tejun Heo
  0 siblings, 1 reply; 48+ messages in thread
From: Pavel Emelyanov @ 2011-08-25 17:25 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Tejun Heo, Vasiliy Kulikov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Andrew Morton, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On 08/25/2011 09:21 PM, Cyrill Gorcunov wrote:
> On Thu, Aug 25, 2011 at 09:05:22PM +0400, Pavel Emelyanov wrote:
> ...
>>>
>>> I would prefer if the filename included both start and end addresses
>>> so that it matches the first column of /proc/PID/maps and ls'ing the
>>> directory is more useful.  What do other people think?
>>
>> Without 0x at the beginning this is not very convenient, but once we add them it no
>> longer matches the contents of the /proc/pid/maps.
>>
>> Just an opinion, do not mind adding <end> to the name.
>>
> 
> So Pavel, we can make the names to look as say "3d73a00000-3d73a01000"
> leaving proc-inode structure unchanged, that's an idea, right?

Not exactly - as I said if we leave the name without 0x (3d000000 instead of 0x3d000000)
this makes it hard to use strtol() on these strings to convert names to addresses.

Not big deal, just my 5 cents on the "leaving links' names as is" side.

> 	Cyrill
> .
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:25           ` Pavel Emelyanov
@ 2011-08-25 17:27             ` Tejun Heo
  2011-08-25 17:34               ` Cyrill Gorcunov
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2011-08-25 17:27 UTC (permalink / raw)
  To: Pavel Emelyanov
  Cc: Cyrill Gorcunov, Vasiliy Kulikov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Andrew Morton, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

Hello,

On Thu, Aug 25, 2011 at 7:25 PM, Pavel Emelyanov <xemul@parallels.com> wrote:
> Not exactly - as I said if we leave the name without 0x (3d000000 instead of 0x3d000000)
> this makes it hard to use strtol() on these strings to convert names to addresses.

But strtol() has @base.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:27             ` Tejun Heo
@ 2011-08-25 17:34               ` Cyrill Gorcunov
  0 siblings, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 17:34 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Pavel Emelyanov, Vasiliy Kulikov, Nathan Lynch, Oren Laadan,
	Daniel Lezcano, Andrew Morton, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 07:27:34PM +0200, Tejun Heo wrote:
> Hello,
> 
> On Thu, Aug 25, 2011 at 7:25 PM, Pavel Emelyanov <xemul@parallels.com> wrote:
> > Not exactly - as I said if we leave the name without 0x (3d000000 instead of 0x3d000000)
> > this makes it hard to use strtol() on these strings to convert names to addresses.
> 
> But strtol() has @base.
> 

That's ok. Tejun, i'll think a bit more (probably tomorrow already, since I'll
be reworking it and adding a check for permissions nit you've pointed) and will
check if there is a nice way to make the names longer ;)

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25  8:29   ` Cyrill Gorcunov
  2011-08-25 17:01     ` Tejun Heo
@ 2011-08-25 17:36     ` Vasiliy Kulikov
  2011-08-25 17:39       ` Cyrill Gorcunov
  1 sibling, 1 reply; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-25 17:36 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 12:29 +0400, Cyrill Gorcunov wrote:
> +static struct dentry *proc_map_files_lookup(struct inode *dir,
> +		struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct task_struct *task;
> +	unsigned long vm_start;
> +	struct vm_area_struct *vma;
> +	struct map_files_info info;
> +	struct mm_struct *mm;
> +	struct dentry *result;
> +	char *endp;
> +
> +	result = ERR_PTR(-ENOENT);
> +	task = get_proc_task(dir);
> +	if (!task)
> +		goto out_no_task;
> +
> +	result = ERR_PTR(-EPERM);
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ));
> +		goto out_no_mm;

Probably

    result = ERR_PTR(-ENOENT);

belongs here?

> +	vm_start = simple_strtoul(dentry->d_name.name, &endp, 16);
> +	if (*endp != '\0')
> +		goto out_no_mm;
> +
> +	mm = get_task_mm(task);
> +	if (!mm)
> +		goto out_no_mm;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_exact_vma(mm, vm_start);
> +	if (!vma)
> +		goto out_no_vma;
> +
> +	memset(&info, 0, sizeof(info));
> +	info.file	= vma->vm_file;
> +	info.vm_start	= vm_start;
> +
> +	result = proc_map_files_instantiate(dir, dentry, task, &info);
> +
> +out_no_vma:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +out_no_mm:
> +	put_task_struct(task);
> +out_no_task:
> +	return result;
> +}

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:36     ` Vasiliy Kulikov
@ 2011-08-25 17:39       ` Cyrill Gorcunov
  2011-08-25 17:54         ` Vasiliy Kulikov
  0 siblings, 1 reply; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 17:39 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 09:36:42PM +0400, Vasiliy Kulikov wrote:
...
> > +
> > +	result = ERR_PTR(-EPERM);
> > +	if (!ptrace_may_access(task, PTRACE_MODE_READ));
> > +		goto out_no_mm;
> 
> Probably
> 
>     result = ERR_PTR(-ENOENT);
> 
> belongs here?
> 

Don't think so, it should point that permission failed
rather no such entry. Or this might break some tools?

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:39       ` Cyrill Gorcunov
@ 2011-08-25 17:54         ` Vasiliy Kulikov
  2011-08-25 18:13           ` Cyrill Gorcunov
  0 siblings, 1 reply; 48+ messages in thread
From: Vasiliy Kulikov @ 2011-08-25 17:54 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 21:39 +0400, Cyrill Gorcunov wrote:
> On Thu, Aug 25, 2011 at 09:36:42PM +0400, Vasiliy Kulikov wrote:
> ...
> > > +
> > > +	result = ERR_PTR(-EPERM);
> > > +	if (!ptrace_may_access(task, PTRACE_MODE_READ));
> > > +		goto out_no_mm;
> > 
> > Probably
> > 
> >     result = ERR_PTR(-ENOENT);
> > 
> > belongs here?
> > 
> 
> Don't think so, it should point that permission failed
> rather no such entry. Or this might break some tools?

I mean the place exactly where I've put it to indicate simple_strtoul(),
get_task_mm(), or find_exact_vma() error.  EPERM for ptrace_may_access()
is OK.

Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:54         ` Vasiliy Kulikov
@ 2011-08-25 18:13           ` Cyrill Gorcunov
  0 siblings, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 18:13 UTC (permalink / raw)
  To: Vasiliy Kulikov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 09:54:38PM +0400, Vasiliy Kulikov wrote:
...
> > > 
> > 
> > Don't think so, it should point that permission failed
> > rather no such entry. Or this might break some tools?
> 
> I mean the place exactly where I've put it to indicate simple_strtoul(),
> get_task_mm(), or find_exact_vma() error.  EPERM for ptrace_may_access()
> is OK.
> 

Ah, yeah! ENOENT is really would be correct option here, will fix in next
iteration, thanks a huge for spotting it!

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 17:07       ` Cyrill Gorcunov
@ 2011-08-25 20:54         ` Tejun Heo
  2011-08-25 21:12           ` Tejun Heo
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2011-08-25 20:54 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

Hello,

On Thu, Aug 25, 2011 at 09:07:05PM +0400, Cyrill Gorcunov wrote:
> This will force us to bloat proc_inode a bit more, since at moment we
> have a union such as
> 
> struct proc_inode {
> 	struct pid *pid;
> 	union {
> 		int fd;
> 		unsigned long vm_start;
> 	};
> 	...
> 
> also it will require to make a buffer for names twice bigger as well.
> If all this not a problem -- then sure we can do that ;)

I'm not too familiar with this area but I don't think bloating it a
bit would be such a big problem.  proc_inode is generated on the fly
as proc directories and files are accessed, so it's not like they'll
linearly hog down more memory with increasing number of entries (sysfs
had this sort of problem in the past).

Another thing is, I don't really see why we need vm_start, or fd for
that matter, in proc_inode at all.  proc_inode is created on the fly
only as dentry gets instantiated on demand, which means we always have
d_name on hand to tell what the file is supposed to point to.  In
fact, the code already uses name_to_int() to extract fd from d_name.
Hmmm... well yeah, it actually seems that proc_inode->fd is never used
and we can simply remove it.

So, we can do the same thing w/ these mapping files too.  We just need
name_to_map_range() or something.  Also, for most address ranges, the
name sould fit inside the inline name in dentry (32 bytes on 64bit),
so it really isn't gonna bloat anything.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 20:54         ` Tejun Heo
@ 2011-08-25 21:12           ` Tejun Heo
  2011-08-25 21:34             ` Cyrill Gorcunov
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2011-08-25 21:12 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

Hello, again.

On Thu, Aug 25, 2011 at 10:54:26PM +0200, Tejun Heo wrote:
> Another thing is, I don't really see why we need vm_start, or fd for
> that matter, in proc_inode at all.  proc_inode is created on the fly
> only as dentry gets instantiated on demand, which means we always have
> d_name on hand to tell what the file is supposed to point to.  In
> fact, the code already uses name_to_int() to extract fd from d_name.
> Hmmm... well yeah, it actually seems that proc_inode->fd is never used
> and we can simply remove it.

Unfortunately, not quite as easy as I expected.  The information still
seems redundant but it seems we'll need to change
proc_inode->get_link() to take dentry instead of inode before doing
away with proc_inode->fd, but, at any rate, I don't think this is a
big deal one way or the other.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 21:12           ` Tejun Heo
@ 2011-08-25 21:34             ` Cyrill Gorcunov
  2011-08-25 21:39               ` Tejun Heo
  0 siblings, 1 reply; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-25 21:34 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 11:12:13PM +0200, Tejun Heo wrote:
> Hello, again.
> 
> On Thu, Aug 25, 2011 at 10:54:26PM +0200, Tejun Heo wrote:
> > Another thing is, I don't really see why we need vm_start, or fd for
> > that matter, in proc_inode at all.  proc_inode is created on the fly
> > only as dentry gets instantiated on demand, which means we always have
> > d_name on hand to tell what the file is supposed to point to.  In
> > fact, the code already uses name_to_int() to extract fd from d_name.
> > Hmmm... well yeah, it actually seems that proc_inode->fd is never used
> > and we can simply remove it.
> 
> Unfortunately, not quite as easy as I expected.  The information still
> seems redundant but it seems we'll need to change
> proc_inode->get_link() to take dentry instead of inode before doing
> away with proc_inode->fd, but, at any rate, I don't think this is a
> big deal one way or the other.
> 

Hohum... picking up an additional reference to dentry might be dangerous
I think. How exactly you imagine we would do that? (without this problem
I guess we indeed may drop or rather not change proc-inode).

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 21:34             ` Cyrill Gorcunov
@ 2011-08-25 21:39               ` Tejun Heo
  2011-08-26  6:58                 ` Cyrill Gorcunov
  2011-08-26 11:29                 ` Cyrill Gorcunov
  0 siblings, 2 replies; 48+ messages in thread
From: Tejun Heo @ 2011-08-25 21:39 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

Hello,

On Fri, Aug 26, 2011 at 01:34:59AM +0400, Cyrill Gorcunov wrote:
> > Unfortunately, not quite as easy as I expected.  The information still
> > seems redundant but it seems we'll need to change
> > proc_inode->get_link() to take dentry instead of inode before doing
> > away with proc_inode->fd, but, at any rate, I don't think this is a
> > big deal one way or the other.
> 
> Hohum... picking up an additional reference to dentry might be dangerous
> I think. How exactly you imagine we would do that? (without this problem
> I guess we indeed may drop or rather not change proc-inode).

Why would you need an extra reference?  All these data structures are
created dynamically on access and dentry is always available while any
operation on the inode is in progress so it's guaranteed to be
available and there's no reason to diddle with reference count.
Anyways, we can deal with this optimization later, I think.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 21:39               ` Tejun Heo
@ 2011-08-26  6:58                 ` Cyrill Gorcunov
  2011-08-26 11:29                 ` Cyrill Gorcunov
  1 sibling, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-26  6:58 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 11:39:31PM +0200, Tejun Heo wrote:
> Hello,
> 
> On Fri, Aug 26, 2011 at 01:34:59AM +0400, Cyrill Gorcunov wrote:
> > > Unfortunately, not quite as easy as I expected.  The information still
> > > seems redundant but it seems we'll need to change
> > > proc_inode->get_link() to take dentry instead of inode before doing
> > > away with proc_inode->fd, but, at any rate, I don't think this is a
> > > big deal one way or the other.
> > 
> > Hohum... picking up an additional reference to dentry might be dangerous
> > I think. How exactly you imagine we would do that? (without this problem
> > I guess we indeed may drop or rather not change proc-inode).
> 
> Why would you need an extra reference?  All these data structures are
> created dynamically on access and dentry is always available while any
> operation on the inode is in progress so it's guaranteed to be
> available and there's no reason to diddle with reference count.
> Anyways, we can deal with this optimization later, I think.
> 

Hi Tejun, yeah, I somehow missed that you propose to lift up
proc_get_link a bit. Letme try such approach indeed. Thanks!

	Cyrill

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-25 21:39               ` Tejun Heo
  2011-08-26  6:58                 ` Cyrill Gorcunov
@ 2011-08-26 11:29                 ` Cyrill Gorcunov
  2011-08-26 12:28                     ` Kirill A. Shutemov
  1 sibling, 1 reply; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-26 11:29 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Vasiliy Kulikov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Andrew Morton, Pavel Emelyanov, linux-kernel, James Bottomley,
	LINUXFS-ML, containers, Zan Lynx, Andi Kleen

On Thu, Aug 25, 2011 at 11:39:31PM +0200, Tejun Heo wrote:
...
> 
> Why would you need an extra reference?  All these data structures are
> created dynamically on access and dentry is always available while any
> operation on the inode is in progress so it's guaranteed to be
> available and there's no reason to diddle with reference count.
> Anyways, we can deal with this optimization later, I think.
> 

Hi, thanks a huge for all feedback! Mind to give the below one a
review shot? Hope this time I've addressed all concerns. Thanks.
(please check map_files_d_revalidate() precisely). Complains are
welcome, as always.

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v4

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
---
 fs/proc/base.c          |  326 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/proc_fs.h |    2 
 2 files changed, 321 insertions(+), 7 deletions(-)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,7 +165,7 @@ static int get_task_root(struct task_str
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
@@ -182,7 +182,7 @@ static int proc_cwd_link(struct inode *i
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
@@ -1580,7 +1580,7 @@ static const struct file_operations proc
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct inode *inode, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &path);
 	if (error)
 		goto out;
 
@@ -1947,7 +1947,7 @@ static int proc_fd_info(struct inode *in
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct inode *inode, struct path *path)
 {
 	return proc_fd_info(inode, path, NULL);
 }
@@ -2170,6 +2170,319 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+	return vma;
+}
+
+static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
+{
+	int ret = -1;
+	char *endp;
+
+	if (unlikely(!name))
+		goto err;
+
+	*start = simple_strtoul(name, &endp, 16);
+	if (*endp != '-')
+		goto err;
+	*end = simple_strtoul(endp + 1, &endp, 16);
+	if (*endp != 0)
+		goto err;
+
+	ret = 0;
+
+err:
+	return ret;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vm_area_struct *vma = NULL;
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		vma = find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (vma) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
+	unsigned long	len;
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ));
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_setattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, used, i;
+		struct map_files_info *info;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+		if (!nr_files)
+			goto out;
+
+		info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
+		if (!info) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		used = 0;
+		for (vma = mm->mmap, vmai = 2; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			vmai++;
+			if (vmai <= filp->f_pos)
+				continue;
+
+			get_file(vma->vm_file);
+			info[used].file	= vma->vm_file;
+			info[used].len	= snprintf(info[used].name,
+						   sizeof(info[used].name),
+						   "%lx-%lx", vma->vm_start,
+						   vma->vm_end);
+			used++;
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name,
+					      info[i].len,
+					      proc_map_files_instantiate,
+					      task, info[i].file);
+			if (ret)
+				break;
+			filp->f_pos++;
+		}
+
+		for (i = 0; i < used; i++)
+			put_filp(info[i].file);
+
+		kfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +3098,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-	int (*proc_get_link)(struct inode *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct inode *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-26 11:29                 ` Cyrill Gorcunov
@ 2011-08-26 12:28                     ` Kirill A. Shutemov
  0 siblings, 0 replies; 48+ messages in thread
From: Kirill A. Shutemov @ 2011-08-26 12:28 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Pavel Emelyanov, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	James Bottomley, Vasiliy Kulikov,
	containers-qjLDD68F18O7TbgM5vRIOg, Andi Kleen, Nathan Lynch,
	Zan Lynx, Tejun Heo, LINUXFS-ML, Andrew Morton, Daniel Lezcano

On Fri, Aug 26, 2011 at 03:29:44PM +0400, Cyrill Gorcunov wrote:
> On Thu, Aug 25, 2011 at 11:39:31PM +0200, Tejun Heo wrote:
> ...
> > 
> > Why would you need an extra reference?  All these data structures are
> > created dynamically on access and dentry is always available while any
> > operation on the inode is in progress so it's guaranteed to be
> > available and there's no reason to diddle with reference count.
> > Anyways, we can deal with this optimization later, I think.
> > 
> 
> Hi, thanks a huge for all feedback! Mind to give the below one a
> review shot? Hope this time I've addressed all concerns. Thanks.
> (please check map_files_d_revalidate() precisely). Complains are
> welcome, as always.
> 
> 	Cyrill
> ---
> fs, proc: Introduce the /proc/<pid>/map_files/ directory v4
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> the target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
> 
> This helps checkpointing process in three ways:
> 
> 1. When dumping a task mappings we do know exact file that is mapped by particular
>    region. We do this by opening /proc/pid/map_files/address symlink the way we do
>    with file descriptors.
> 
> 2. This also helps in determining which anonymous shared mappings are shared with
>    each other by comparing the inodes of them.
> 
> 3. When restoring a set of process in case two of them has a mapping shared, we map
>    the memory by the 1st one and then open its /proc/pid/map_files/address file and
>    map it by the 2nd task.
> 
> v2: (spotted by Tejun Heo)
>  - /proc/<pid>/mfd changed to /proc/<pid>/map_files
>  - find_vma helper is used instead of linear search
>  - routines are re-grouped
>  - d_revalidate is set now
> 
> v3:
>  - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
>  - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
>  - because of filldir (which eventually might need to lock mmap_sem)
>    the proc_map_files_readdir() was reworked to call proc_fill_cache()
>    with unlocked mmap_sem
> 
> v4: (feedback by Tejun Heo and Vasiliy Kulikov)
>  - instead of saving data in proc_inode we rather make a dentry name
>    to keep both vm_start and vm_end accordingly
>  - d_revalidate now honor task credentials
> 
> Signed-off-by: Pavel Emelyanov <xemul-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
> Signed-off-by: Cyrill Gorcunov <gorcunov-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org>
> CC: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> CC: Vasiliy Kulikov <segoon-cxoSlKxDwOJWk0Htik3J/w@public.gmane.org>
> ---
>  fs/proc/base.c          |  326 +++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/proc_fs.h |    2 
>  2 files changed, 321 insertions(+), 7 deletions(-)
> 
> Index: linux-2.6.git/fs/proc/base.c
> ===================================================================
> --- linux-2.6.git.orig/fs/proc/base.c
> +++ linux-2.6.git/fs/proc/base.c
> @@ -165,7 +165,7 @@ static int get_task_root(struct task_str
>  	return result;
>  }
>  
> -static int proc_cwd_link(struct inode *inode, struct path *path)
> +static int proc_cwd_link(struct dentry *dentry, struct inode *inode, struct path *path)

Put proc_get_link() change in separate patch?

>  {
>  	struct task_struct *task = get_proc_task(inode);
>  	int result = -ENOENT;
> @@ -182,7 +182,7 @@ static int proc_cwd_link(struct inode *i
>  	return result;
>  }
>  
> -static int proc_root_link(struct inode *inode, struct path *path)
> +static int proc_root_link(struct dentry *dentry, struct inode *inode, struct path *path)
>  {
>  	struct task_struct *task = get_proc_task(inode);
>  	int result = -ENOENT;
> @@ -1580,7 +1580,7 @@ static const struct file_operations proc
>  	.release	= single_release,
>  };
>  
> -static int proc_exe_link(struct inode *inode, struct path *exe_path)
> +static int proc_exe_link(struct dentry *dentry, struct inode *inode, struct path *exe_path)
>  {
>  	struct task_struct *task;
>  	struct mm_struct *mm;
> @@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
>  	if (!proc_fd_access_allowed(inode))
>  		goto out;
>  
> -	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
> +	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &nd->path);
>  out:
>  	return ERR_PTR(error);
>  }
> @@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
>  	if (!proc_fd_access_allowed(inode))
>  		goto out;
>  
> -	error = PROC_I(inode)->op.proc_get_link(inode, &path);
> +	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &path);
>  	if (error)
>  		goto out;
>  
> @@ -1947,7 +1947,7 @@ static int proc_fd_info(struct inode *in
>  	return -ENOENT;
>  }
>  
> -static int proc_fd_link(struct inode *inode, struct path *path)
> +static int proc_fd_link(struct dentry *dentry, struct inode *inode, struct path *path)
>  {
>  	return proc_fd_info(inode, path, NULL);
>  }
> @@ -2170,6 +2170,319 @@ static const struct file_operations proc
>  	.llseek		= default_llseek,
>  };
>  
> +static struct vm_area_struct *
> +find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
> +{
> +	struct vm_area_struct *vma = find_vma(mm, vm_start);
> +	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
> +		vma = NULL;
> +	return vma;
> +}
> +
> +static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
> +{
> +	int ret = -1;
> +	char *endp;
> +
> +	if (unlikely(!name))
> +		goto err;
> +
> +	*start = simple_strtoul(name, &endp, 16);
> +	if (*endp != '-')
> +		goto err;
> +	*end = simple_strtoul(endp + 1, &endp, 16);
> +	if (*endp != 0)
> +		goto err;
> +
> +	ret = 0;
> +
> +err:
> +	return ret;
> +}
> +
> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct vm_area_struct *vma = NULL;
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	const struct cred *cred;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
> +		down_read(&mm->mmap_sem);
> +		vma = find_exact_vma(mm, vm_start, vm_end);
> +		up_read(&mm->mmap_sem);
> +	}
> +
> +	mmput(mm);
> +
> +	if (vma) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;
> +	}
> +out:
> +	d_drop(dentry);
> +	return 0;
> +}
> +
> +static const struct dentry_operations tid_map_files_dentry_operations = {
> +	.d_revalidate	= map_files_d_revalidate,
> +	.d_delete	= pid_delete_dentry,
> +};
> +
> +static int proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct path *path)
> +{
> +	unsigned long vm_start, vm_end;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	int rc = -ENOENT;
> +
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (map_name_to_addr(dentry->d_name.name,
> +			     &vm_start, &vm_end))
> +		goto out_mmput;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_exact_vma(mm, vm_start, vm_end);
> +	if (vma && vma->vm_file) {
> +		*path = vma->vm_file->f_path;
> +		path_get(path);
> +		rc = 0;
> +	}
> +	up_read(&mm->mmap_sem);
> +
> +out_mmput:
> +	mmput(mm);
> +out:
> +	return rc;
> +}
> +
> +struct map_files_info {
> +	struct file	*file;
> +	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
> +	unsigned long	len;
> +};
> +
> +static struct dentry *
> +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
> +			   struct task_struct *task, const void *ptr)
> +{
> +	const struct file *file = ptr;
> +	struct proc_inode *ei;
> +	struct inode *inode;
> +
> +	if (!file)
> +		return ERR_PTR(-ENOENT);
> +
> +	inode = proc_pid_make_inode(dir->i_sb, task);
> +	if (!inode)
> +		return ERR_PTR(-ENOENT);
> +
> +	ei			= PROC_I(inode);
> +	ei->op.proc_get_link	= proc_map_files_get_link;
> +
> +	inode->i_op	= &proc_pid_link_inode_operations;
> +	inode->i_size	= 64;
> +	inode->i_mode	= S_IFLNK;
> +
> +	if (file->f_mode & FMODE_READ)
> +		inode->i_mode |= S_IRUSR | S_IXUSR;
> +	if (file->f_mode & FMODE_WRITE)
> +		inode->i_mode |= S_IWUSR | S_IXUSR;
> +
> +	d_set_d_op(dentry, &tid_map_files_dentry_operations);
> +	d_add(dentry, inode);
> +
> +	return NULL;
> +}
> +
> +static struct dentry *proc_map_files_lookup(struct inode *dir,
> +		struct dentry *dentry, struct nameidata *nd)
> +{
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	struct vm_area_struct *vma;
> +	struct mm_struct *mm;
> +	struct dentry *result;
> +
> +	result = ERR_PTR(-ENOENT);
> +	task = get_proc_task(dir);
> +	if (!task)
> +		goto out_no_task;
> +
> +	result = ERR_PTR(-EPERM);
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ));
> +		goto out_no_mm;
> +
> +	result = ERR_PTR(-ENOENT);
> +	if (map_name_to_addr(dentry->d_name.name,
> +			     &vm_start, &vm_end))
> +		goto out_no_mm;
> +
> +	mm = get_task_mm(task);
> +	if (!mm)
> +		goto out_no_mm;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_exact_vma(mm, vm_start, vm_end);
> +	if (!vma)
> +		goto out_no_vma;
> +
> +	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
> +
> +out_no_vma:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +out_no_mm:
> +	put_task_struct(task);
> +out_no_task:
> +	return result;
> +}
> +
> +static const struct inode_operations proc_map_files_inode_operations = {
> +	.lookup		= proc_map_files_lookup,
> +	.setattr	= proc_setattr,
> +};
> +
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	unsigned int vmai;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		unsigned long nr_files, used, i;
> +		struct map_files_info *info;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +		if (!nr_files)
> +			goto out;

up_read(&mm->mmap_sem); ?

> +
> +		info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
> +		if (!info) {
> +			ret = -ENOMEM;
> +			goto out;

Ditto.

> +		}
> +
> +		used = 0;
> +		for (vma = mm->mmap, vmai = 2; vma; vma = vma->vm_next) {
> +			if (!vma->vm_file)
> +				continue;
> +			vmai++;
> +			if (vmai <= filp->f_pos)
> +				continue;
> +
> +			get_file(vma->vm_file);
> +			info[used].file	= vma->vm_file;
> +			info[used].len	= snprintf(info[used].name,
> +						   sizeof(info[used].name),
> +						   "%lx-%lx", vma->vm_start,
> +						   vma->vm_end);
> +			used++;
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used; i++) {
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      info[i].name,
> +					      info[i].len,
> +					      proc_map_files_instantiate,
> +					      task, info[i].file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +		}
> +
> +		for (i = 0; i < used; i++)
> +			put_filp(info[i].file);
> +
> +		kfree(info);
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
> +
> +static const struct file_operations proc_map_files_operations = {
> +	.read		= generic_read_dir,
> +	.readdir	= proc_map_files_readdir,
> +	.llseek		= default_llseek,
> +};
> +
>  /*
>   * /proc/pid/fd needs a special permission handler so that a process can still
>   * access /proc/self/fd after it has executed a setuid().
> @@ -2785,6 +3098,7 @@ static const struct inode_operations pro
>  static const struct pid_entry tgid_base_stuff[] = {
>  	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
>  	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
> +	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
>  	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
>  	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
>  #ifdef CONFIG_NET
> Index: linux-2.6.git/include/linux/proc_fs.h
> ===================================================================
> --- linux-2.6.git.orig/include/linux/proc_fs.h
> +++ linux-2.6.git/include/linux/proc_fs.h
> @@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
>  extern const struct proc_ns_operations ipcns_operations;
>  
>  union proc_op {
> -	int (*proc_get_link)(struct inode *, struct path *);
> +	int (*proc_get_link)(struct dentry *, struct inode *, struct path *);
>  	int (*proc_read)(struct task_struct *task, char *page);
>  	int (*proc_show)(struct seq_file *m,
>  		struct pid_namespace *ns, struct pid *pid,
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
@ 2011-08-26 12:28                     ` Kirill A. Shutemov
  0 siblings, 0 replies; 48+ messages in thread
From: Kirill A. Shutemov @ 2011-08-26 12:28 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Tejun Heo, Andrew Morton, Pavel Emelyanov, linux-kernel,
	James Bottomley, containers, Andi Kleen, Nathan Lynch,
	LINUXFS-ML, Zan Lynx, Daniel Lezcano, Vasiliy Kulikov

On Fri, Aug 26, 2011 at 03:29:44PM +0400, Cyrill Gorcunov wrote:
> On Thu, Aug 25, 2011 at 11:39:31PM +0200, Tejun Heo wrote:
> ...
> > 
> > Why would you need an extra reference?  All these data structures are
> > created dynamically on access and dentry is always available while any
> > operation on the inode is in progress so it's guaranteed to be
> > available and there's no reason to diddle with reference count.
> > Anyways, we can deal with this optimization later, I think.
> > 
> 
> Hi, thanks a huge for all feedback! Mind to give the below one a
> review shot? Hope this time I've addressed all concerns. Thanks.
> (please check map_files_d_revalidate() precisely). Complains are
> welcome, as always.
> 
> 	Cyrill
> ---
> fs, proc: Introduce the /proc/<pid>/map_files/ directory v4
> 
> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
> the target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.
> 
> For example the ls -l of some arbitrary /proc/<pid>/map_files/
> 
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
>  | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
> 
> This helps checkpointing process in three ways:
> 
> 1. When dumping a task mappings we do know exact file that is mapped by particular
>    region. We do this by opening /proc/pid/map_files/address symlink the way we do
>    with file descriptors.
> 
> 2. This also helps in determining which anonymous shared mappings are shared with
>    each other by comparing the inodes of them.
> 
> 3. When restoring a set of process in case two of them has a mapping shared, we map
>    the memory by the 1st one and then open its /proc/pid/map_files/address file and
>    map it by the 2nd task.
> 
> v2: (spotted by Tejun Heo)
>  - /proc/<pid>/mfd changed to /proc/<pid>/map_files
>  - find_vma helper is used instead of linear search
>  - routines are re-grouped
>  - d_revalidate is set now
> 
> v3:
>  - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
>  - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
>  - because of filldir (which eventually might need to lock mmap_sem)
>    the proc_map_files_readdir() was reworked to call proc_fill_cache()
>    with unlocked mmap_sem
> 
> v4: (feedback by Tejun Heo and Vasiliy Kulikov)
>  - instead of saving data in proc_inode we rather make a dentry name
>    to keep both vm_start and vm_end accordingly
>  - d_revalidate now honor task credentials
> 
> Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
> Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
> CC: Tejun Heo <tj@kernel.org>
> CC: Vasiliy Kulikov <segoon@openwall.com>
> ---
>  fs/proc/base.c          |  326 +++++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/proc_fs.h |    2 
>  2 files changed, 321 insertions(+), 7 deletions(-)
> 
> Index: linux-2.6.git/fs/proc/base.c
> ===================================================================
> --- linux-2.6.git.orig/fs/proc/base.c
> +++ linux-2.6.git/fs/proc/base.c
> @@ -165,7 +165,7 @@ static int get_task_root(struct task_str
>  	return result;
>  }
>  
> -static int proc_cwd_link(struct inode *inode, struct path *path)
> +static int proc_cwd_link(struct dentry *dentry, struct inode *inode, struct path *path)

Put proc_get_link() change in separate patch?

>  {
>  	struct task_struct *task = get_proc_task(inode);
>  	int result = -ENOENT;
> @@ -182,7 +182,7 @@ static int proc_cwd_link(struct inode *i
>  	return result;
>  }
>  
> -static int proc_root_link(struct inode *inode, struct path *path)
> +static int proc_root_link(struct dentry *dentry, struct inode *inode, struct path *path)
>  {
>  	struct task_struct *task = get_proc_task(inode);
>  	int result = -ENOENT;
> @@ -1580,7 +1580,7 @@ static const struct file_operations proc
>  	.release	= single_release,
>  };
>  
> -static int proc_exe_link(struct inode *inode, struct path *exe_path)
> +static int proc_exe_link(struct dentry *dentry, struct inode *inode, struct path *exe_path)
>  {
>  	struct task_struct *task;
>  	struct mm_struct *mm;
> @@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
>  	if (!proc_fd_access_allowed(inode))
>  		goto out;
>  
> -	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
> +	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &nd->path);
>  out:
>  	return ERR_PTR(error);
>  }
> @@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
>  	if (!proc_fd_access_allowed(inode))
>  		goto out;
>  
> -	error = PROC_I(inode)->op.proc_get_link(inode, &path);
> +	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &path);
>  	if (error)
>  		goto out;
>  
> @@ -1947,7 +1947,7 @@ static int proc_fd_info(struct inode *in
>  	return -ENOENT;
>  }
>  
> -static int proc_fd_link(struct inode *inode, struct path *path)
> +static int proc_fd_link(struct dentry *dentry, struct inode *inode, struct path *path)
>  {
>  	return proc_fd_info(inode, path, NULL);
>  }
> @@ -2170,6 +2170,319 @@ static const struct file_operations proc
>  	.llseek		= default_llseek,
>  };
>  
> +static struct vm_area_struct *
> +find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
> +{
> +	struct vm_area_struct *vma = find_vma(mm, vm_start);
> +	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
> +		vma = NULL;
> +	return vma;
> +}
> +
> +static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
> +{
> +	int ret = -1;
> +	char *endp;
> +
> +	if (unlikely(!name))
> +		goto err;
> +
> +	*start = simple_strtoul(name, &endp, 16);
> +	if (*endp != '-')
> +		goto err;
> +	*end = simple_strtoul(endp + 1, &endp, 16);
> +	if (*endp != 0)
> +		goto err;
> +
> +	ret = 0;
> +
> +err:
> +	return ret;
> +}
> +
> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
> +	struct vm_area_struct *vma = NULL;
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	const struct cred *cred;
> +	struct mm_struct *mm;
> +	struct inode *inode;
> +
> +	if (nd && nd->flags & LOOKUP_RCU)
> +		return -ECHILD;
> +
> +	inode = dentry->d_inode;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
> +		down_read(&mm->mmap_sem);
> +		vma = find_exact_vma(mm, vm_start, vm_end);
> +		up_read(&mm->mmap_sem);
> +	}
> +
> +	mmput(mm);
> +
> +	if (vma) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;
> +	}
> +out:
> +	d_drop(dentry);
> +	return 0;
> +}
> +
> +static const struct dentry_operations tid_map_files_dentry_operations = {
> +	.d_revalidate	= map_files_d_revalidate,
> +	.d_delete	= pid_delete_dentry,
> +};
> +
> +static int proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct path *path)
> +{
> +	unsigned long vm_start, vm_end;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	int rc = -ENOENT;
> +
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out;
> +
> +	mm = get_task_mm(task);
> +	put_task_struct(task);
> +	if (!mm)
> +		goto out;
> +
> +	if (map_name_to_addr(dentry->d_name.name,
> +			     &vm_start, &vm_end))
> +		goto out_mmput;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_exact_vma(mm, vm_start, vm_end);
> +	if (vma && vma->vm_file) {
> +		*path = vma->vm_file->f_path;
> +		path_get(path);
> +		rc = 0;
> +	}
> +	up_read(&mm->mmap_sem);
> +
> +out_mmput:
> +	mmput(mm);
> +out:
> +	return rc;
> +}
> +
> +struct map_files_info {
> +	struct file	*file;
> +	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
> +	unsigned long	len;
> +};
> +
> +static struct dentry *
> +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
> +			   struct task_struct *task, const void *ptr)
> +{
> +	const struct file *file = ptr;
> +	struct proc_inode *ei;
> +	struct inode *inode;
> +
> +	if (!file)
> +		return ERR_PTR(-ENOENT);
> +
> +	inode = proc_pid_make_inode(dir->i_sb, task);
> +	if (!inode)
> +		return ERR_PTR(-ENOENT);
> +
> +	ei			= PROC_I(inode);
> +	ei->op.proc_get_link	= proc_map_files_get_link;
> +
> +	inode->i_op	= &proc_pid_link_inode_operations;
> +	inode->i_size	= 64;
> +	inode->i_mode	= S_IFLNK;
> +
> +	if (file->f_mode & FMODE_READ)
> +		inode->i_mode |= S_IRUSR | S_IXUSR;
> +	if (file->f_mode & FMODE_WRITE)
> +		inode->i_mode |= S_IWUSR | S_IXUSR;
> +
> +	d_set_d_op(dentry, &tid_map_files_dentry_operations);
> +	d_add(dentry, inode);
> +
> +	return NULL;
> +}
> +
> +static struct dentry *proc_map_files_lookup(struct inode *dir,
> +		struct dentry *dentry, struct nameidata *nd)
> +{
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	struct vm_area_struct *vma;
> +	struct mm_struct *mm;
> +	struct dentry *result;
> +
> +	result = ERR_PTR(-ENOENT);
> +	task = get_proc_task(dir);
> +	if (!task)
> +		goto out_no_task;
> +
> +	result = ERR_PTR(-EPERM);
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ));
> +		goto out_no_mm;
> +
> +	result = ERR_PTR(-ENOENT);
> +	if (map_name_to_addr(dentry->d_name.name,
> +			     &vm_start, &vm_end))
> +		goto out_no_mm;
> +
> +	mm = get_task_mm(task);
> +	if (!mm)
> +		goto out_no_mm;
> +
> +	down_read(&mm->mmap_sem);
> +	vma = find_exact_vma(mm, vm_start, vm_end);
> +	if (!vma)
> +		goto out_no_vma;
> +
> +	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
> +
> +out_no_vma:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +out_no_mm:
> +	put_task_struct(task);
> +out_no_task:
> +	return result;
> +}
> +
> +static const struct inode_operations proc_map_files_inode_operations = {
> +	.lookup		= proc_map_files_lookup,
> +	.setattr	= proc_setattr,
> +};
> +
> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
> +	struct dentry *dentry = filp->f_path.dentry;
> +	struct inode *inode = dentry->d_inode;
> +	struct vm_area_struct *vma;
> +	struct task_struct *task;
> +	struct mm_struct *mm;
> +	unsigned int vmai;
> +	ino_t ino;
> +	int ret;
> +
> +	ret = -ENOENT;
> +	task = get_proc_task(inode);
> +	if (!task)
> +		goto out_no_task;
> +
> +	ret = -EPERM;
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ))
> +		goto out;
> +
> +	ret = 0;
> +	switch (filp->f_pos) {
> +	case 0:
> +		ino = inode->i_ino;
> +		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	case 1:
> +		ino = parent_ino(dentry);
> +		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
> +			goto out;
> +		filp->f_pos++;
> +	default:
> +	{
> +		unsigned long nr_files, used, i;
> +		struct map_files_info *info;
> +
> +		mm = get_task_mm(task);
> +		if (!mm)
> +			goto out;
> +		down_read(&mm->mmap_sem);
> +
> +		nr_files = 0;
> +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> +			if (vma->vm_file)
> +				nr_files++;
> +		}
> +		if (!nr_files)
> +			goto out;

up_read(&mm->mmap_sem); ?

> +
> +		info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
> +		if (!info) {
> +			ret = -ENOMEM;
> +			goto out;

Ditto.

> +		}
> +
> +		used = 0;
> +		for (vma = mm->mmap, vmai = 2; vma; vma = vma->vm_next) {
> +			if (!vma->vm_file)
> +				continue;
> +			vmai++;
> +			if (vmai <= filp->f_pos)
> +				continue;
> +
> +			get_file(vma->vm_file);
> +			info[used].file	= vma->vm_file;
> +			info[used].len	= snprintf(info[used].name,
> +						   sizeof(info[used].name),
> +						   "%lx-%lx", vma->vm_start,
> +						   vma->vm_end);
> +			used++;
> +		}
> +
> +		up_read(&mm->mmap_sem);
> +
> +		for (i = 0; i < used; i++) {
> +			ret = proc_fill_cache(filp, dirent, filldir,
> +					      info[i].name,
> +					      info[i].len,
> +					      proc_map_files_instantiate,
> +					      task, info[i].file);
> +			if (ret)
> +				break;
> +			filp->f_pos++;
> +		}
> +
> +		for (i = 0; i < used; i++)
> +			put_filp(info[i].file);
> +
> +		kfree(info);
> +		mmput(mm);
> +	}
> +	}
> +
> +out:
> +	put_task_struct(task);
> +out_no_task:
> +	return ret;
> +}
> +
> +static const struct file_operations proc_map_files_operations = {
> +	.read		= generic_read_dir,
> +	.readdir	= proc_map_files_readdir,
> +	.llseek		= default_llseek,
> +};
> +
>  /*
>   * /proc/pid/fd needs a special permission handler so that a process can still
>   * access /proc/self/fd after it has executed a setuid().
> @@ -2785,6 +3098,7 @@ static const struct inode_operations pro
>  static const struct pid_entry tgid_base_stuff[] = {
>  	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
>  	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
> +	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
>  	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
>  	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
>  #ifdef CONFIG_NET
> Index: linux-2.6.git/include/linux/proc_fs.h
> ===================================================================
> --- linux-2.6.git.orig/include/linux/proc_fs.h
> +++ linux-2.6.git/include/linux/proc_fs.h
> @@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
>  extern const struct proc_ns_operations ipcns_operations;
>  
>  union proc_op {
> -	int (*proc_get_link)(struct inode *, struct path *);
> +	int (*proc_get_link)(struct dentry *, struct inode *, struct path *);
>  	int (*proc_read)(struct task_struct *task, char *page);
>  	int (*proc_show)(struct seq_file *m,
>  		struct pid_namespace *ns, struct pid *pid,
> _______________________________________________
> Containers mailing list
> Containers@lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-26 12:28                     ` Kirill A. Shutemov
  (?)
@ 2011-08-26 12:39                     ` Cyrill Gorcunov
  -1 siblings, 0 replies; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-26 12:39 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Tejun Heo, Andrew Morton, Pavel Emelyanov, linux-kernel,
	James Bottomley, containers, Andi Kleen, Nathan Lynch,
	LINUXFS-ML, Zan Lynx, Daniel Lezcano, Vasiliy Kulikov

On Fri, Aug 26, 2011 at 03:28:51PM +0300, Kirill A. Shutemov wrote:
...
> >  
> > -static int proc_cwd_link(struct inode *inode, struct path *path)
> > +static int proc_cwd_link(struct dentry *dentry, struct inode *inode, struct path *path)
> 
> Put proc_get_link() change in separate patch?
> 

Will do, but let leave it to a final pass, ie when all get agreed on idea and
details, ok? Since I'll be re-sending the final version seraparely anyway.
...
> > +		down_read(&mm->mmap_sem);
> > +
> > +		nr_files = 0;
> > +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > +			if (vma->vm_file)
> > +				nr_files++;
> > +		}
> > +		if (!nr_files)
> > +			goto out;
> 
> up_read(&mm->mmap_sem); ?
> 
> > +
> > +		info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
> > +		if (!info) {
> > +			ret = -ENOMEM;
> > +			goto out;
> 
> Ditto.
> 

Yeah, thanks Kirill!

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-26 12:28                     ` Kirill A. Shutemov
  (?)
  (?)
@ 2011-08-26 13:16                     ` Cyrill Gorcunov
  2011-08-26 14:06                       ` Tejun Heo
  -1 siblings, 1 reply; 48+ messages in thread
From: Cyrill Gorcunov @ 2011-08-26 13:16 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Tejun Heo, Andrew Morton, Pavel Emelyanov, linux-kernel,
	James Bottomley, containers, Andi Kleen, Nathan Lynch,
	LINUXFS-ML, Zan Lynx, Daniel Lezcano, Vasiliy Kulikov

On Fri, Aug 26, 2011 at 03:28:51PM +0300, Kirill A. Shutemov wrote:
...
> > +
> > +		mm = get_task_mm(task);
> > +		if (!mm)
> > +			goto out;
> > +		down_read(&mm->mmap_sem);
> > +
> > +		nr_files = 0;
> > +		for (vma = mm->mmap; vma; vma = vma->vm_next) {
> > +			if (vma->vm_file)
> > +				nr_files++;
> > +		}
> > +		if (!nr_files)
> > +			goto out;
> 
> up_read(&mm->mmap_sem); ?
> 
> > +
> > +		info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
> > +		if (!info) {
> > +			ret = -ENOMEM;
> > +			goto out;
> 
> Ditto.
> 

So the final (I hope) version is the below. Please check it. If all
get agreed on it I'll split it up and resend later. Sorry for such
massive change in one patch.

	Cyrill
---
fs, proc: Introduce the /proc/<pid>/map_files/ directory v5

This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that point exactly
to the same inode as them vma's one.

For example the ls -l of some arbitrary /proc/<pid>/map_files/

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This helps checkpointing process in three ways:

1. When dumping a task mappings we do know exact file that is mapped by particular
   region. We do this by opening /proc/pid/map_files/address symlink the way we do
   with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with
   each other by comparing the inodes of them.

3. When restoring a set of process in case two of them has a mapping shared, we map
   the memory by the 1st one and then open its /proc/pid/map_files/address file and
   map it by the 2nd task.

v2: (spotted by Tejun Heo)
 - /proc/<pid>/mfd changed to /proc/<pid>/map_files
 - find_vma helper is used instead of linear search
 - routines are re-grouped
 - d_revalidate is set now

v3:
 - d_revalidate reworked, now it should drops no longer valid dentries (Tejun Heo)
 - ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
 - because of filldir (which eventually might need to lock mmap_sem)
   the proc_map_files_readdir() was reworked to call proc_fill_cache()
   with unlocked mmap_sem

v4: (feedback by Tejun Heo and Vasiliy Kulikov)
 - instead of saving data in proc_inode we rather make a dentry name
   to keep both vm_start and vm_end accordingly
 - d_revalidate now honor task credentials

v5: (feedback by Kirill A. Shutemov)
 - don't forget to release mmap_sem on error path

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
---
 fs/proc/base.c          |  323 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/proc_fs.h |    2 
 2 files changed, 318 insertions(+), 7 deletions(-)

Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,7 +165,7 @@ static int get_task_root(struct task_str
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
@@ -182,7 +182,7 @@ static int proc_cwd_link(struct inode *i
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
@@ -1580,7 +1580,7 @@ static const struct file_operations proc
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct inode *inode, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, inode, &path);
 	if (error)
 		goto out;
 
@@ -1947,7 +1947,7 @@ static int proc_fd_info(struct inode *in
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct inode *inode, struct path *path)
 {
 	return proc_fd_info(inode, path, NULL);
 }
@@ -2170,6 +2170,316 @@ static const struct file_operations proc
 	.llseek		= default_llseek,
 };
 
+static struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *vma = find_vma(mm, vm_start);
+	if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end))
+		vma = NULL;
+	return vma;
+}
+
+static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
+{
+	int ret = -1;
+	char *endp;
+
+	if (unlikely(!name))
+		goto err;
+
+	*start = simple_strtoul(name, &endp, 16);
+	if (*endp != '-')
+		goto err;
+	*end = simple_strtoul(endp + 1, &endp, 16);
+	if (*endp != 0)
+		goto err;
+
+	ret = 0;
+
+err:
+	return ret;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	struct vm_area_struct *vma = NULL;
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct mm_struct *mm;
+	struct inode *inode;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (!map_name_to_addr(dentry->d_name.name, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		vma = find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (vma) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		return 1;
+	}
+out:
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct inode *inode, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc = -ENOENT;
+
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
+	unsigned long	len;
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei			= PROC_I(inode);
+	ei->op.proc_get_link	= proc_map_files_get_link;
+
+	inode->i_op	= &proc_pid_link_inode_operations;
+	inode->i_size	= 64;
+	inode->i_mode	= S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR | S_IXUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR | S_IXUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct task_struct *task;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct dentry *result;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out_no_task;
+
+	result = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_READ));
+		goto out_no_mm;
+
+	result = ERR_PTR(-ENOENT);
+	if (map_name_to_addr(dentry->d_name.name,
+			     &vm_start, &vm_end))
+		goto out_no_mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_no_mm;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_no_mm:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.setattr	= proc_setattr,
+};
+
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int vmai;
+	ino_t ino;
+	int ret;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_no_task;
+
+	ret = -EPERM;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out;
+		filp->f_pos++;
+	default:
+	{
+		struct map_files_info *info = NULL;
+		unsigned long nr_files, used, i;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+		used = 0;
+
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (vma->vm_file)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);
+			if (!info)
+				ret = -ENOMEM;
+			for (vma = mm->mmap, vmai = 2; vma && info; vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				vmai++;
+				if (vmai <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info[used].file	= vma->vm_file;
+				info[used].len	= snprintf(info[used].name,
+							   sizeof(info[used].name),
+							   "%lx-%lx", vma->vm_start,
+							   vma->vm_end);
+				used++;
+			}
+		}
+
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < used; i++) {
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      info[i].name, info[i].len,
+					      proc_map_files_instantiate,
+					      task, info[i].file);
+			if (ret)
+				break;
+			filp->f_pos++;
+		}
+
+		for (i = 0; i < used; i++)
+			put_filp(info[i].file);
+
+		kfree(info);
+		mmput(mm);
+	}
+	}
+
+out:
+	put_task_struct(task);
+out_no_task:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
@@ -2785,6 +3095,7 @@ static const struct inode_operations pro
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
 extern const struct proc_ns_operations ipcns_operations;
 
 union proc_op {
-	int (*proc_get_link)(struct inode *, struct path *);
+	int (*proc_get_link)(struct dentry *, struct inode *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
 	int (*proc_show)(struct seq_file *m,
 		struct pid_namespace *ns, struct pid *pid,

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-26 13:16                     ` Cyrill Gorcunov
@ 2011-08-26 14:06                       ` Tejun Heo
  2011-08-26 14:23                         ` Kirill A. Shutemov
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2011-08-26 14:06 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Kirill A. Shutemov, Andrew Morton, Pavel Emelyanov, linux-kernel,
	James Bottomley, containers, Andi Kleen, Nathan Lynch,
	LINUXFS-ML, Zan Lynx, Daniel Lezcano, Vasiliy Kulikov

Hello, Cyrill.

On Fri, Aug 26, 2011 at 05:16:20PM +0400, Cyrill Gorcunov wrote:
> Index: linux-2.6.git/fs/proc/base.c
> ===================================================================
> --- linux-2.6.git.orig/fs/proc/base.c
> +++ linux-2.6.git/fs/proc/base.c
> @@ -165,7 +165,7 @@ static int get_task_root(struct task_str
>  	return result;
>  }
>  
> -static int proc_cwd_link(struct inode *inode, struct path *path)
> +static int proc_cwd_link(struct dentry *dentry, struct inode *inode, struct path *path)

I don't think it's necessary to pass around both @dentry and @inode.
@inode can always be dereferenced from @dentry.

> +static int map_name_to_addr(const unsigned char *name, unsigned long *start, unsigned long *end)
> +{
> +	int ret = -1;

Very minor but functions returning -1 for error bugs me a bit.  Maybe
-EINVAL is better?  Not a big deal either way.

> +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
> +{
...
> +	if (vma) {
> +		if (task_dumpable(task)) {
> +			rcu_read_lock();
> +			cred = __task_cred(task);
> +			inode->i_uid = cred->euid;
> +			inode->i_gid = cred->egid;
> +			rcu_read_unlock();
> +		} else {
> +			inode->i_uid = 0;
> +			inode->i_gid = 0;
> +		}
> +		security_task_to_inode(task, inode);
> +		return 1;

Another minor point.  Maybe it would be nice to factor this into a
function and share it w/ the fd one (and maybe sprinkle some comments
while at it too :)

> +struct map_files_info {
> +	struct file	*file;
> +	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
> +	unsigned long	len;
> +};

That's slightly above 50 bytes.

> +static struct dentry *proc_map_files_lookup(struct inode *dir,
> +		struct dentry *dentry, struct nameidata *nd)
> +{
> +	unsigned long vm_start, vm_end;
> +	struct task_struct *task;
> +	struct vm_area_struct *vma;
> +	struct mm_struct *mm;
> +	struct dentry *result;
> +
> +	result = ERR_PTR(-ENOENT);
> +	task = get_proc_task(dir);
> +	if (!task)
> +		goto out_no_task;
> +
> +	result = ERR_PTR(-EPERM);
> +	if (!ptrace_may_access(task, PTRACE_MODE_READ));
> +		goto out_no_mm;
> +
> +	result = ERR_PTR(-ENOENT);
> +	if (map_name_to_addr(dentry->d_name.name,
> +			     &vm_start, &vm_end))
> +		goto out_no_mm;

Another nitpick: Maybe factor out the above three steps?

> +static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
> +{
...
> +		if (nr_files) {
> +			info = kmalloc(nr_files * sizeof(*info), GFP_KERNEL);

I think we can just put this on stack.

All my comments are pretty cosmetic, so it generally looks good to me.
Andrew, what do you think?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-26 14:06                       ` Tejun Heo
@ 2011-08-26 14:23                         ` Kirill A. Shutemov
  2011-08-26 14:27                           ` Tejun Heo
  0 siblings, 1 reply; 48+ messages in thread
From: Kirill A. Shutemov @ 2011-08-26 14:23 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Cyrill Gorcunov, Andrew Morton, Pavel Emelyanov, linux-kernel,
	James Bottomley, containers, Andi Kleen, Nathan Lynch,
	LINUXFS-ML, Zan Lynx, Daniel Lezcano, Vasiliy Kulikov

On Fri, Aug 26, 2011 at 04:06:20PM +0200, Tejun Heo wrote:
> > +struct map_files_info {
> > +	struct file	*file;
> > +	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
> > +	unsigned long	len;
> > +};
> 
> That's slightly above 50 bytes.

We can use 4 * sizeof(unsigned long) + 2 for name to save a bit on 32 bit
architectures.

BTW, show_map_vma() uses "%08lx-%08lx" to print ->start, ->end. I think
it's good idea to use this here as well to be consistent with
/proc/$PID/{s,}maps

-- 
 Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-26 14:23                         ` Kirill A. Shutemov
@ 2011-08-26 14:27                           ` Tejun Heo
  0 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2011-08-26 14:27 UTC (permalink / raw)
  To: Kirill A. Shutemov
  Cc: Cyrill Gorcunov, Andrew Morton, Pavel Emelyanov, linux-kernel,
	James Bottomley, containers, Andi Kleen, Nathan Lynch,
	LINUXFS-ML, Zan Lynx, Daniel Lezcano, Vasiliy Kulikov

Hello,

On Fri, Aug 26, 2011 at 05:23:34PM +0300, Kirill A. Shutemov wrote:
> On Fri, Aug 26, 2011 at 04:06:20PM +0200, Tejun Heo wrote:
> > > +struct map_files_info {
> > > +	struct file	*file;
> > > +	unsigned char	name[16+16+2]; /* max: %016lx-%016lx\0 */
> > > +	unsigned long	len;
> > > +};
> > 
> > That's slightly above 50 bytes.
> 
> We can use 4 * sizeof(unsigned long) + 2 for name to save a bit on 32 bit
> architectures.

Ooh, I wasn't trying to say it was too big.  I was trying to say it
would be okay to have that on stack.  :)

> BTW, show_map_vma() uses "%08lx-%08lx" to print ->start, ->end. I think
> it's good idea to use this here as well to be consistent with
> /proc/$PID/{s,}maps

I'm not too sure about this one.  First of all, I think %08lx is more
of an accident than intention (just forgot to update it while
transitioning to 64bit) and when ls'd the output can be a bit messy
without leading 0's, but again my opinion here isn't very strong.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-08-24  8:53 [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2 Cyrill Gorcunov
                   ` (3 preceding siblings ...)
  2011-08-24 15:05   ` Zan Lynx
@ 2011-09-13 14:14 ` Pavel Machek
  2011-09-13 14:20   ` Pavel Emelyanov
  4 siblings, 1 reply; 48+ messages in thread
From: Pavel Machek @ 2011-09-13 14:14 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Nathan Lynch, Oren Laadan, Daniel Lezcano, Tejun Heo,
	Andrew Morton, Glauber Costa, containers, linux-kernel,
	Pavel Emelyanov, Serge Hallyn, LINUXFS-ML, James Bottomley

Hi!

> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
> one for each mapping with file, the name of a symlink is vma->vm_start, the
> target is the file. Opening a symlink results in a file that point exactly
> to the same inode as them vma's one.

Is it good idea security-wise? It looks like symlink but does not
behave like one. (And yes, I know we already have similar problems in
/proc..)

ptrace-may-trace is not good enough protection; I can do  this on my
own thread to get around read-only protection on fd. (File can be
protected from me by directory permissions.)


-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2
  2011-09-13 14:14 ` Pavel Machek
@ 2011-09-13 14:20   ` Pavel Emelyanov
  0 siblings, 0 replies; 48+ messages in thread
From: Pavel Emelyanov @ 2011-09-13 14:20 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Cyrill Gorcunov, Nathan Lynch, Oren Laadan, Daniel Lezcano,
	Tejun Heo, Andrew Morton, Glauber Costa, containers,
	linux-kernel, Serge Hallyn, LINUXFS-ML, James Bottomley

On 09/13/2011 06:14 PM, Pavel Machek wrote:
> Hi!
> 
>> This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
>> one for each mapping with file, the name of a symlink is vma->vm_start, the
>> target is the file. Opening a symlink results in a file that point exactly
>> to the same inode as them vma's one.
> 
> Is it good idea security-wise? It looks like symlink but does not
> behave like one. (And yes, I know we already have similar problems in
> /proc..)

What exactly doesn't behave like symlink, can you elaborate, please?

> ptrace-may-trace is not good enough protection; I can do  this on my
> own thread to get around read-only protection on fd. (File can be
> protected from me by directory permissions.)

I think this issue worth separate discussion and if it turns out there is
a problem with that we can fix it together with /proc/pid/fd and other stuff.

Thanks,
Pavel

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2011-09-13 14:20 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-24  8:53 [RFC] fs, proc: Introduce the /proc/<pid>/map_files/ directory v2 Cyrill Gorcunov
2011-08-24  9:21 ` Pekka Enberg
2011-08-24  9:33   ` Pavel Emelyanov
2011-08-24  9:34 ` Tejun Heo
2011-08-24  9:37   ` Cyrill Gorcunov
2011-08-24  9:41     ` Cyrill Gorcunov
2011-08-24  9:41     ` Cyrill Gorcunov
2011-08-24  9:41       ` Cyrill Gorcunov
2011-08-24 11:18 ` Vasiliy Kulikov
2011-08-24 11:31   ` Cyrill Gorcunov
2011-08-25  8:29   ` Cyrill Gorcunov
2011-08-25 17:01     ` Tejun Heo
2011-08-25 17:05       ` Pavel Emelyanov
2011-08-25 17:21         ` Cyrill Gorcunov
2011-08-25 17:25           ` Pavel Emelyanov
2011-08-25 17:27             ` Tejun Heo
2011-08-25 17:34               ` Cyrill Gorcunov
2011-08-25 17:07       ` Cyrill Gorcunov
2011-08-25 20:54         ` Tejun Heo
2011-08-25 21:12           ` Tejun Heo
2011-08-25 21:34             ` Cyrill Gorcunov
2011-08-25 21:39               ` Tejun Heo
2011-08-26  6:58                 ` Cyrill Gorcunov
2011-08-26 11:29                 ` Cyrill Gorcunov
2011-08-26 12:28                   ` Kirill A. Shutemov
2011-08-26 12:28                     ` Kirill A. Shutemov
2011-08-26 12:39                     ` Cyrill Gorcunov
2011-08-26 13:16                     ` Cyrill Gorcunov
2011-08-26 14:06                       ` Tejun Heo
2011-08-26 14:23                         ` Kirill A. Shutemov
2011-08-26 14:27                           ` Tejun Heo
2011-08-25 17:11       ` Cyrill Gorcunov
2011-08-25 17:36     ` Vasiliy Kulikov
2011-08-25 17:39       ` Cyrill Gorcunov
2011-08-25 17:54         ` Vasiliy Kulikov
2011-08-25 18:13           ` Cyrill Gorcunov
2011-08-24 15:05 ` Zan Lynx
2011-08-24 15:05   ` Zan Lynx
2011-08-24 15:19   ` Pavel Emelyanov
2011-08-24 17:36     ` Andi Kleen
2011-08-24 17:36       ` Andi Kleen
2011-08-25  6:42       ` Pavel Emelyanov
2011-08-25 14:04         ` Andi Kleen
2011-08-25 14:30           ` Cyrill Gorcunov
2011-08-25 14:47           ` Pavel Emelyanov
2011-08-24 15:22   ` Cyrill Gorcunov
2011-09-13 14:14 ` Pavel Machek
2011-09-13 14:20   ` Pavel Emelyanov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.