All of lore.kernel.org
 help / color / mirror / Atom feed
* [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
@ 2016-08-09 16:05 robert.foss
  2016-08-09 16:29 ` Mateusz Guzik
                   ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: robert.foss @ 2016-08-09 16:05 UTC (permalink / raw)
  To: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, mguzik, adobriyan, jdanis, calvinowens, jann,
	robert.foss, mhocko, koct9i, vbabka, n-horiguchi,
	kirill.shutemov, ldufour, hannes, linux-kernel, Ben Zhang,
	Bryan Freed, Filipe Brandenburger

From: Sonny Rao <sonnyrao@chromium.org>

This is based on earlier work by Thiago Goncales. It implements a new
per process proc file which summarizes the contents of the smaps file
but doesn't display any addresses.  It gives more detailed information
than statm like the PSS (proportional set size).  It differs from the
original implementation in that it doesn't use the full blown set of
seq operations, uses a different termination condition, and doesn't
display "Locked" as that was broken on the original implementation.

This new proc file provides information faster than parsing the potentially
huge smaps file.

Signed-off-by: Sonny Rao <sonnyrao@chromium.org>

Tested-by: Robert Foss <robert.foss@collabora.com>
Signed-off-by: Robert Foss <robert.foss@collabora.com>

---
 fs/proc/base.c     |   1 +
 fs/proc/internal.h |   4 ++
 fs/proc/task_mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 131 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index a11eb71..de3acdf 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2855,6 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("totmaps",    S_IRUGO, proc_totmaps_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa27810..6f3540f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -58,6 +58,9 @@ union proc_op {
 		struct task_struct *task);
 };
 
+
+extern const struct file_operations proc_totmaps_operations;
+
 struct proc_inode {
 	struct pid *pid;
 	int fd;
@@ -281,6 +284,7 @@ struct proc_maps_private {
 	struct mm_struct *mm;
 #ifdef CONFIG_MMU
 	struct vm_area_struct *tail_vma;
+	struct mem_size_stats *mss;
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *task_mempolicy;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4648c7f..b61873e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -802,6 +802,81 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 	return 0;
 }
 
+static void add_smaps_sum(struct mem_size_stats *mss,
+		struct mem_size_stats *mss_sum)
+{
+	mss_sum->resident += mss->resident;
+	mss_sum->pss += mss->pss;
+	mss_sum->shared_clean += mss->shared_clean;
+	mss_sum->shared_dirty += mss->shared_dirty;
+	mss_sum->private_clean += mss->private_clean;
+	mss_sum->private_dirty += mss->private_dirty;
+	mss_sum->referenced += mss->referenced;
+	mss_sum->anonymous += mss->anonymous;
+	mss_sum->anonymous_thp += mss->anonymous_thp;
+	mss_sum->swap += mss->swap;
+}
+
+static int totmaps_proc_show(struct seq_file *m, void *data)
+{
+	struct proc_maps_private *priv = m->private;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct mem_size_stats *mss_sum = priv->mss;
+
+	/* reference to priv->task already taken */
+	/* but need to get the mm here because */
+	/* task could be in the process of exiting */
+	mm = get_task_mm(priv->task);
+	if (!mm || IS_ERR(mm))
+		return -EINVAL;
+
+	down_read(&mm->mmap_sem);
+	hold_task_mempolicy(priv);
+
+	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
+		struct mem_size_stats mss;
+		struct mm_walk smaps_walk = {
+			.pmd_entry = smaps_pte_range,
+			.mm = vma->vm_mm,
+			.private = &mss,
+		};
+
+		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
+			memset(&mss, 0, sizeof(mss));
+			walk_page_vma(vma, &smaps_walk);
+			add_smaps_sum(&mss, mss_sum);
+		}
+	}
+	seq_printf(m,
+		   "Rss:            %8lu kB\n"
+		   "Pss:            %8lu kB\n"
+		   "Shared_Clean:   %8lu kB\n"
+		   "Shared_Dirty:   %8lu kB\n"
+		   "Private_Clean:  %8lu kB\n"
+		   "Private_Dirty:  %8lu kB\n"
+		   "Referenced:     %8lu kB\n"
+		   "Anonymous:      %8lu kB\n"
+		   "AnonHugePages:  %8lu kB\n"
+		   "Swap:           %8lu kB\n",
+		   mss_sum->resident >> 10,
+		   (unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
+		   mss_sum->shared_clean  >> 10,
+		   mss_sum->shared_dirty  >> 10,
+		   mss_sum->private_clean >> 10,
+		   mss_sum->private_dirty >> 10,
+		   mss_sum->referenced >> 10,
+		   mss_sum->anonymous >> 10,
+		   mss_sum->anonymous_thp >> 10,
+		   mss_sum->swap >> 10);
+
+	release_task_mempolicy(priv);
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+
+	return 0;
+}
+
 static int show_pid_smap(struct seq_file *m, void *v)
 {
 	return show_smap(m, v, 1);
@@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode, struct file *file)
 	return do_maps_open(inode, file, &proc_tid_smaps_op);
 }
 
+static int totmaps_open(struct inode *inode, struct file *file)
+{
+	struct proc_maps_private *priv;
+	int ret = -ENOMEM;
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv) {
+		priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
+		if (!priv->mss)
+			return -ENOMEM;
+
+		/* we need to grab references to the task_struct */
+		/* at open time, because there's a potential information */
+		/* leak where the totmaps file is opened and held open */
+		/* while the underlying pid to task mapping changes */
+		/* underneath it */
+		priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
+		if (!priv->task) {
+			kfree(priv->mss);
+			kfree(priv);
+			return -ESRCH;
+		}
+
+		ret = single_open(file, totmaps_proc_show, priv);
+		if (ret) {
+			put_task_struct(priv->task);
+			kfree(priv->mss);
+			kfree(priv);
+		}
+	}
+	return ret;
+}
+
+static int totmaps_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	struct proc_maps_private *priv = m->private;
+
+	put_task_struct(priv->task);
+	kfree(priv->mss);
+	kfree(priv);
+	m->private = NULL;
+	return single_release(inode, file);
+}
+
 const struct file_operations proc_pid_smaps_operations = {
 	.open		= pid_smaps_open,
 	.read		= seq_read,
@@ -850,6 +969,13 @@ const struct file_operations proc_tid_smaps_operations = {
 	.release	= proc_map_release,
 };
 
+const struct file_operations proc_totmaps_operations = {
+	.open		= totmaps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= totmaps_release,
+};
+
 enum clear_refs_types {
 	CLEAR_REFS_ALL = 1,
 	CLEAR_REFS_ANON,
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:05 [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps robert.foss
@ 2016-08-09 16:29 ` Mateusz Guzik
  2016-08-09 16:56   ` Sonny Rao
  2016-08-09 20:17   ` Robert Foss
  2016-08-09 16:58 ` Alexey Dobriyan
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 24+ messages in thread
From: Mateusz Guzik @ 2016-08-09 16:29 UTC (permalink / raw)
  To: robert.foss
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, adobriyan, jdanis, calvinowens, jann, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger

On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> From: Sonny Rao <sonnyrao@chromium.org>
> 
> This is based on earlier work by Thiago Goncales. It implements a new
> per process proc file which summarizes the contents of the smaps file
> but doesn't display any addresses.  It gives more detailed information
> than statm like the PSS (proprotional set size).  It differs from the
> original implementation in that it doesn't use the full blown set of
> seq operations, uses a different termination condition, and doesn't
> displayed "Locked" as that was broken on the original implemenation.
> 
> This new proc file provides information faster than parsing the potentially
> huge smaps file.

I have no idea about usefulness of this.

The patch is definitely buggy with respect to how it implements actual
access to mm.

> +static int totmaps_proc_show(struct seq_file *m, void *data)
> +{
> +	struct proc_maps_private *priv = m->private;
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +	struct mem_size_stats *mss_sum = priv->mss;
> +
> +	/* reference to priv->task already taken */
> +	/* but need to get the mm here because */
> +	/* task could be in the process of exiting */
> +	mm = get_task_mm(priv->task);
> +	if (!mm || IS_ERR(mm))
> +		return -EINVAL;
> +

That's not how it's done in smaps.

> +static int totmaps_open(struct inode *inode, struct file *file)
> +{
> +	struct proc_maps_private *priv;
> +	int ret = -ENOMEM;
> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +	if (priv) {
> +		priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
> +		if (!priv->mss)
> +			return -ENOMEM;

Cases below explicitly kfree(priv). I can't remember whether the close
routine gets called if this one fails. Either way, something is wrong
here.

> +
> +		/* we need to grab references to the task_struct */
> +		/* at open time, because there's a potential information */
> +		/* leak where the totmaps file is opened and held open */
> +		/* while the underlying pid to task mapping changes */
> +		/* underneath it */
> +		priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);

This performs no permission checks that I would see. If you take a look
at smaps you will see the user ends up in proc_maps_open which performs
proc_mem_open(inode, PTRACE_MODE_READ) and gets a mm from there.


> +		if (!priv->task) {
> +			kfree(priv->mss);
> +			kfree(priv);
> +			return -ESRCH;
> +		}
> +
> +		ret = single_open(file, totmaps_proc_show, priv);
> +		if (ret) {
> +			put_task_struct(priv->task);
> +			kfree(priv->mss);
> +			kfree(priv);
> +		}
> +	}
> +	return ret;
> +}
> +

-- 
Mateusz Guzik

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:29 ` Mateusz Guzik
@ 2016-08-09 16:56   ` Sonny Rao
  2016-08-09 20:17   ` Robert Foss
  1 sibling, 0 replies; 24+ messages in thread
From: Sonny Rao @ 2016-08-09 16:56 UTC (permalink / raw)
  To: Mateusz Guzik
  Cc: robert.foss, Andrew Morton, Kees Cook, viro, gorcunov,
	John Stultz, plaguedbypenguins, adobriyan, jdanis, calvinowens,
	jann, mhocko, koct9i, vbabka, n-horiguchi, kirill.shutemov,
	ldufour, Johannes Weiner, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger

On Tue, Aug 9, 2016 at 9:29 AM, Mateusz Guzik <mguzik@redhat.com> wrote:
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>> From: Sonny Rao <sonnyrao@chromium.org>
>>
>> This is based on earlier work by Thiago Goncales. It implements a new
>> per process proc file which summarizes the contents of the smaps file
>> but doesn't display any addresses.  It gives more detailed information
>> than statm like the PSS (proprotional set size).  It differs from the
>> original implementation in that it doesn't use the full blown set of
>> seq operations, uses a different termination condition, and doesn't
>> displayed "Locked" as that was broken on the original implemenation.
>>
>> This new proc file provides information faster than parsing the potentially
>> huge smaps file.
>
> I have no idea about usefulness of this.

I can comment about this.  The use case is to speed up monitoring of
memory consumption in environments where RSS isn't precise.

For example Chrome tends to have many processes which have hundreds of VMAs
with a substantial amount of shared memory, and the error of using
RSS rather than PSS tends to be very large when looking at overall
memory consumption.  PSS isn't kept as a single number that's exported
like RSS, so to calculate PSS means having to parse a very large smaps
file.

This process is slow and has to be repeated for many processes, and we
found that just the act of doing the parsing was taking up a
significant amount of CPU time, so this patch is an attempt to make
that process cheaper.

>
> The patch is definitely buggy with respect to how it implements actual
> access to mm.
>
>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> +{
>> +     struct proc_maps_private *priv = m->private;
>> +     struct mm_struct *mm;
>> +     struct vm_area_struct *vma;
>> +     struct mem_size_stats *mss_sum = priv->mss;
>> +
>> +     /* reference to priv->task already taken */
>> +     /* but need to get the mm here because */
>> +     /* task could be in the process of exiting */
>> +     mm = get_task_mm(priv->task);
>> +     if (!mm || IS_ERR(mm))
>> +             return -EINVAL;
>> +
>
> That's not how it's done in smaps.
>
>> +static int totmaps_open(struct inode *inode, struct file *file)
>> +{
>> +     struct proc_maps_private *priv;
>> +     int ret = -ENOMEM;
>> +     priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> +     if (priv) {
>> +             priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
>> +             if (!priv->mss)
>> +                     return -ENOMEM;
>
> Cases below explicitly kfree(priv). I can't remember whether the close
> routine gets called if this one fails. Either way, something is wrong
> here.
>
>> +
>> +             /* we need to grab references to the task_struct */
>> +             /* at open time, because there's a potential information */
>> +             /* leak where the totmaps file is opened and held open */
>> +             /* while the underlying pid to task mapping changes */
>> +             /* underneath it */
>> +             priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
>
> This performs no permission checks that I would see. If you take a look
> at smaps you will see the user ends up in proc_maps_open which performs
> proc_mem_open(inode, PTRACE_MODE_READ) and gets a mm from there.
>
>
>> +             if (!priv->task) {
>> +                     kfree(priv->mss);
>> +                     kfree(priv);
>> +                     return -ESRCH;
>> +             }
>> +
>> +             ret = single_open(file, totmaps_proc_show, priv);
>> +             if (ret) {
>> +                     put_task_struct(priv->task);
>> +                     kfree(priv->mss);
>> +                     kfree(priv);
>> +             }
>> +     }
>> +     return ret;
>> +}
>> +
>
> --
> Mateusz Guzik

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:05 [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps robert.foss
  2016-08-09 16:29 ` Mateusz Guzik
@ 2016-08-09 16:58 ` Alexey Dobriyan
  2016-08-09 18:28   ` Sonny Rao
  2016-08-09 19:16 ` Konstantin Khlebnikov
  2016-08-09 19:24 ` Jann Horn
  3 siblings, 1 reply; 24+ messages in thread
From: Alexey Dobriyan @ 2016-08-09 16:58 UTC (permalink / raw)
  To: robert.foss
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, mguzik, jdanis, calvinowens, jann, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger

On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> From: Sonny Rao <sonnyrao@chromium.org>
> 
> This is based on earlier work by Thiago Goncales. It implements a new
> per process proc file which summarizes the contents of the smaps file
> but doesn't display any addresses.  It gives more detailed information
> than statm like the PSS (proprotional set size).  It differs from the
> original implementation in that it doesn't use the full blown set of
> seq operations, uses a different termination condition, and doesn't
> displayed "Locked" as that was broken on the original implemenation.
> 
> This new proc file provides information faster than parsing the potentially
> huge smaps file.

You can "parse" /proc/*/pagemap . RSS, swap are there.
So which ones do you really need?
Why the separate anon hugepages and anon regular pages?

> +	seq_printf(m,
> +		   "Rss:            %8lu kB\n"
> +		   "Pss:            %8lu kB\n"
> +		   "Shared_Clean:   %8lu kB\n"
> +		   "Shared_Dirty:   %8lu kB\n"
> +		   "Private_Clean:  %8lu kB\n"
> +		   "Private_Dirty:  %8lu kB\n"
> +		   "Referenced:     %8lu kB\n"
> +		   "Anonymous:      %8lu kB\n"
> +		   "AnonHugePages:  %8lu kB\n"
> +		   "Swap:           %8lu kB\n",
> +		   mss_sum->resident >> 10,
> +		   (unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
> +		   mss_sum->shared_clean  >> 10,
> +		   mss_sum->shared_dirty  >> 10,
> +		   mss_sum->private_clean >> 10,
> +		   mss_sum->private_dirty >> 10,
> +		   mss_sum->referenced >> 10,
> +		   mss_sum->anonymous >> 10,
> +		   mss_sum->anonymous_thp >> 10,
> +		   mss_sum->swap >> 10);

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:58 ` Alexey Dobriyan
@ 2016-08-09 18:28   ` Sonny Rao
  0 siblings, 0 replies; 24+ messages in thread
From: Sonny Rao @ 2016-08-09 18:28 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Robert Foss, Andrew Morton, Kees Cook, viro, gorcunov,
	John Stultz, Robin Humble, Mateusz Guzik, Janis Danisevskis,
	calvinowens, jann, mhocko, Konstantin Khlebnikov, vbabka,
	n-horiguchi, kirill.shutemov, ldufour, Johannes Weiner,
	linux-kernel, Ben Zhang, Filipe Brandenburger

On Tue, Aug 9, 2016 at 9:58 AM, Alexey Dobriyan <adobriyan@gmail.com> wrote:
>
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> > From: Sonny Rao <sonnyrao@chromium.org>
> >
> > This is based on earlier work by Thiago Goncales. It implements a new
> > per process proc file which summarizes the contents of the smaps file
> > but doesn't display any addresses.  It gives more detailed information
> > than statm like the PSS (proprotional set size).  It differs from the
> > original implementation in that it doesn't use the full blown set of
> > seq operations, uses a different termination condition, and doesn't
> > displayed "Locked" as that was broken on the original implemenation.
> >
> > This new proc file provides information faster than parsing the potentially
> > huge smaps file.
>
> You can "parse" /proc/*/pagemap . RSS, swap are there.


/proc/*/pagemap is generally restricted and I don't believe it would
quickly give PSS.

>
> So which ones do you really need?

PSS and Swap are the most important.  RSS isn't precise enough because
it counts shared pages fully, and there tends to be a lot of sharing.

> Why the separate anon hugepages and anon regular pages?

I'm not sure if it's necessary, but that's how it's broken out in smaps.

>
> > +     seq_printf(m,
> > +                "Rss:            %8lu kB\n"
> > +                "Pss:            %8lu kB\n"
> > +                "Shared_Clean:   %8lu kB\n"
> > +                "Shared_Dirty:   %8lu kB\n"
> > +                "Private_Clean:  %8lu kB\n"
> > +                "Private_Dirty:  %8lu kB\n"
> > +                "Referenced:     %8lu kB\n"
> > +                "Anonymous:      %8lu kB\n"
> > +                "AnonHugePages:  %8lu kB\n"
> > +                "Swap:           %8lu kB\n",
> > +                mss_sum->resident >> 10,
> > +                (unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
> > +                mss_sum->shared_clean  >> 10,
> > +                mss_sum->shared_dirty  >> 10,
> > +                mss_sum->private_clean >> 10,
> > +                mss_sum->private_dirty >> 10,
> > +                mss_sum->referenced >> 10,
> > +                mss_sum->anonymous >> 10,
> > +                mss_sum->anonymous_thp >> 10,
> > +                mss_sum->swap >> 10);

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:05 [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps robert.foss
  2016-08-09 16:29 ` Mateusz Guzik
  2016-08-09 16:58 ` Alexey Dobriyan
@ 2016-08-09 19:16 ` Konstantin Khlebnikov
  2016-08-10  0:30   ` Sonny Rao
  2016-08-09 19:24 ` Jann Horn
  3 siblings, 1 reply; 24+ messages in thread
From: Konstantin Khlebnikov @ 2016-08-09 19:16 UTC (permalink / raw)
  To: robert.foss
  Cc: Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov, John Stultz,
	plaguedbypenguins, sonnyrao, mguzik, Alexey Dobriyan, jdanis,
	calvinowens, Jann Horn, Michal Hocko, Vlastimil Babka,
	Naoya Horiguchi, Kirill A. Shutemov, ldufour, Johannes Weiner,
	Linux Kernel Mailing List, Ben Zhang, Bryan Freed,
	Filipe Brandenburger

On Tue, Aug 9, 2016 at 7:05 PM,  <robert.foss@collabora.com> wrote:
> From: Sonny Rao <sonnyrao@chromium.org>
>
> This is based on earlier work by Thiago Goncales. It implements a new
> per process proc file which summarizes the contents of the smaps file
> but doesn't display any addresses.  It gives more detailed information
> than statm like the PSS (proprotional set size).  It differs from the
> original implementation in that it doesn't use the full blown set of
> seq operations, uses a different termination condition, and doesn't
> displayed "Locked" as that was broken on the original implemenation.
>
> This new proc file provides information faster than parsing the potentially
> huge smaps file.

What statistics do you really need?

I think, performance and flexibility issues could be really solved only by new
syscall for querying memory statistics for address range in any process:
process_vm_stat() or some kind of pumped fincore() for /proc/$pid/mem

>
> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
>
> Tested-by: Robert Foss <robert.foss@collabora.com>
> Signed-off-by: Robert Foss <robert.foss@collabora.com>
>
> ---
>  fs/proc/base.c     |   1 +
>  fs/proc/internal.h |   4 ++
>  fs/proc/task_mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 131 insertions(+)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index a11eb71..de3acdf 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2855,6 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
>         REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
>         REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
>         REG("pagemap",    S_IRUSR, proc_pagemap_operations),
> +       REG("totmaps",    S_IRUGO, proc_totmaps_operations),
>  #endif
>  #ifdef CONFIG_SECURITY
>         DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index aa27810..6f3540f 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -58,6 +58,9 @@ union proc_op {
>                 struct task_struct *task);
>  };
>
> +
> +extern const struct file_operations proc_totmaps_operations;
> +
>  struct proc_inode {
>         struct pid *pid;
>         int fd;
> @@ -281,6 +284,7 @@ struct proc_maps_private {
>         struct mm_struct *mm;
>  #ifdef CONFIG_MMU
>         struct vm_area_struct *tail_vma;
> +       struct mem_size_stats *mss;
>  #endif
>  #ifdef CONFIG_NUMA
>         struct mempolicy *task_mempolicy;
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 4648c7f..b61873e 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -802,6 +802,81 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
>         return 0;
>  }
>
> +static void add_smaps_sum(struct mem_size_stats *mss,
> +               struct mem_size_stats *mss_sum)
> +{
> +       mss_sum->resident += mss->resident;
> +       mss_sum->pss += mss->pss;
> +       mss_sum->shared_clean += mss->shared_clean;
> +       mss_sum->shared_dirty += mss->shared_dirty;
> +       mss_sum->private_clean += mss->private_clean;
> +       mss_sum->private_dirty += mss->private_dirty;
> +       mss_sum->referenced += mss->referenced;
> +       mss_sum->anonymous += mss->anonymous;
> +       mss_sum->anonymous_thp += mss->anonymous_thp;
> +       mss_sum->swap += mss->swap;
> +}
> +
> +static int totmaps_proc_show(struct seq_file *m, void *data)
> +{
> +       struct proc_maps_private *priv = m->private;
> +       struct mm_struct *mm;
> +       struct vm_area_struct *vma;
> +       struct mem_size_stats *mss_sum = priv->mss;
> +
> +       /* reference to priv->task already taken */
> +       /* but need to get the mm here because */
> +       /* task could be in the process of exiting */
> +       mm = get_task_mm(priv->task);
> +       if (!mm || IS_ERR(mm))
> +               return -EINVAL;
> +
> +       down_read(&mm->mmap_sem);
> +       hold_task_mempolicy(priv);
> +
> +       for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
> +               struct mem_size_stats mss;
> +               struct mm_walk smaps_walk = {
> +                       .pmd_entry = smaps_pte_range,
> +                       .mm = vma->vm_mm,
> +                       .private = &mss,
> +               };
> +
> +               if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
> +                       memset(&mss, 0, sizeof(mss));
> +                       walk_page_vma(vma, &smaps_walk);
> +                       add_smaps_sum(&mss, mss_sum);
> +               }
> +       }
> +       seq_printf(m,
> +                  "Rss:            %8lu kB\n"
> +                  "Pss:            %8lu kB\n"
> +                  "Shared_Clean:   %8lu kB\n"
> +                  "Shared_Dirty:   %8lu kB\n"
> +                  "Private_Clean:  %8lu kB\n"
> +                  "Private_Dirty:  %8lu kB\n"
> +                  "Referenced:     %8lu kB\n"
> +                  "Anonymous:      %8lu kB\n"
> +                  "AnonHugePages:  %8lu kB\n"
> +                  "Swap:           %8lu kB\n",
> +                  mss_sum->resident >> 10,
> +                  (unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
> +                  mss_sum->shared_clean  >> 10,
> +                  mss_sum->shared_dirty  >> 10,
> +                  mss_sum->private_clean >> 10,
> +                  mss_sum->private_dirty >> 10,
> +                  mss_sum->referenced >> 10,
> +                  mss_sum->anonymous >> 10,
> +                  mss_sum->anonymous_thp >> 10,
> +                  mss_sum->swap >> 10);
> +
> +       release_task_mempolicy(priv);
> +       up_read(&mm->mmap_sem);
> +       mmput(mm);
> +
> +       return 0;
> +}
> +
>  static int show_pid_smap(struct seq_file *m, void *v)
>  {
>         return show_smap(m, v, 1);
> @@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode, struct file *file)
>         return do_maps_open(inode, file, &proc_tid_smaps_op);
>  }
>
> +static int totmaps_open(struct inode *inode, struct file *file)
> +{
> +       struct proc_maps_private *priv;
> +       int ret = -ENOMEM;
> +       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +       if (priv) {
> +               priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
> +               if (!priv->mss)
> +                       return -ENOMEM;
> +
> +               /* we need to grab references to the task_struct */
> +               /* at open time, because there's a potential information */
> +               /* leak where the totmaps file is opened and held open */
> +               /* while the underlying pid to task mapping changes */
> +               /* underneath it */
> +               priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
> +               if (!priv->task) {
> +                       kfree(priv->mss);
> +                       kfree(priv);
> +                       return -ESRCH;
> +               }
> +
> +               ret = single_open(file, totmaps_proc_show, priv);
> +               if (ret) {
> +                       put_task_struct(priv->task);
> +                       kfree(priv->mss);
> +                       kfree(priv);
> +               }
> +       }
> +       return ret;
> +}
> +
> +static int totmaps_release(struct inode *inode, struct file *file)
> +{
> +       struct seq_file *m = file->private_data;
> +       struct proc_maps_private *priv = m->private;
> +
> +       put_task_struct(priv->task);
> +       kfree(priv->mss);
> +       kfree(priv);
> +       m->private = NULL;
> +       return single_release(inode, file);
> +}
> +
>  const struct file_operations proc_pid_smaps_operations = {
>         .open           = pid_smaps_open,
>         .read           = seq_read,
> @@ -850,6 +969,13 @@ const struct file_operations proc_tid_smaps_operations = {
>         .release        = proc_map_release,
>  };
>
> +const struct file_operations proc_totmaps_operations = {
> +       .open           = totmaps_open,
> +       .read           = seq_read,
> +       .llseek         = seq_lseek,
> +       .release        = totmaps_release,
> +};
> +
>  enum clear_refs_types {
>         CLEAR_REFS_ALL = 1,
>         CLEAR_REFS_ANON,
> --
> 2.7.4
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:05 [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps robert.foss
                   ` (2 preceding siblings ...)
  2016-08-09 19:16 ` Konstantin Khlebnikov
@ 2016-08-09 19:24 ` Jann Horn
  2016-08-09 21:01   ` Robert Foss
  3 siblings, 1 reply; 24+ messages in thread
From: Jann Horn @ 2016-08-09 19:24 UTC (permalink / raw)
  To: robert.foss, Sonny Rao
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, mguzik, adobriyan, jdanis, calvinowens, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger

[-- Attachment #1: Type: text/plain, Size: 5264 bytes --]

On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> From: Sonny Rao <sonnyrao@chromium.org>
> 
> This is based on earlier work by Thiago Goncales. It implements a new
> per process proc file which summarizes the contents of the smaps file
> but doesn't display any addresses.  It gives more detailed information
> than statm like the PSS (proprotional set size).  It differs from the
> original implementation in that it doesn't use the full blown set of
> seq operations, uses a different termination condition, and doesn't
> displayed "Locked" as that was broken on the original implemenation.
> 
> This new proc file provides information faster than parsing the potentially
> huge smaps file.
> 
> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
> 
> Tested-by: Robert Foss <robert.foss@collabora.com>
> Signed-off-by: Robert Foss <robert.foss@collabora.com>


> +static int totmaps_proc_show(struct seq_file *m, void *data)
> +{
> +	struct proc_maps_private *priv = m->private;
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +	struct mem_size_stats *mss_sum = priv->mss;
> +
> +	/* reference to priv->task already taken */
> +	/* but need to get the mm here because */
> +	/* task could be in the process of exiting */

Can you please elaborate on this? My understanding here is that you
intend for the caller to be able to repeatedly read the same totmaps
file with pread() and still see updated information after the target
process has called execve() and be able to detect process death
(instead of simply seeing stale values). Is that accurate?

I would prefer it if you could grab a reference to the mm_struct
directly at open time.


> +	mm = get_task_mm(priv->task);
> +	if (!mm || IS_ERR(mm))
> +		return -EINVAL;

get_task_mm() doesn't return error codes, and all other callers just
check whether the return value is NULL.


> +	down_read(&mm->mmap_sem);
> +	hold_task_mempolicy(priv);
> +
> +	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
> +		struct mem_size_stats mss;
> +		struct mm_walk smaps_walk = {
> +			.pmd_entry = smaps_pte_range,
> +			.mm = vma->vm_mm,
> +			.private = &mss,
> +		};
> +
> +		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
> +			memset(&mss, 0, sizeof(mss));
> +			walk_page_vma(vma, &smaps_walk);
> +			add_smaps_sum(&mss, mss_sum);
> +		}
> +	}

Errrr... what? You accumulate values from mem_size_stats items into a
struct mss_sum that is associated with the struct file? So when you
read the file the second time, you get the old values plus the new ones?
And when you read the file in parallel, you get inconsistent values?

For most files in procfs, the behavior is that you can just call
pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
you the current values every time, without mutating state. I strongly
recommend that you get rid of priv->mss and just accumulate the state
in a local variable (maybe one on the stack).


> @@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode, struct file *file)
>  	return do_maps_open(inode, file, &proc_tid_smaps_op);
>  }
>  
> +static int totmaps_open(struct inode *inode, struct file *file)
> +{
> +	struct proc_maps_private *priv;
> +	int ret = -ENOMEM;
> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +	if (priv) {
> +		priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
> +		if (!priv->mss)
> +			return -ENOMEM;

Memory leak: If the first allocation works and the second one doesn't, this
doesn't free the first allocation.

Please change this to use the typical goto pattern for error handling.

> +
> +		/* we need to grab references to the task_struct */
> +		/* at open time, because there's a potential information */
> +		/* leak where the totmaps file is opened and held open */
> +		/* while the underlying pid to task mapping changes */
> +		/* underneath it */

Nit: That's not how comments are done in the kernel. Maybe change this to
a normal block comment instead of one block comment per line?

> +		priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);

`get_pid_task(proc_pid(inode), PIDTYPE_PID)` is exactly the definition
of get_proc_task(inode), maybe use that instead?

> +		if (!priv->task) {
> +			kfree(priv->mss);
> +			kfree(priv);
> +			return -ESRCH;
> +		}
> +
> +		ret = single_open(file, totmaps_proc_show, priv);
> +		if (ret) {
> +			put_task_struct(priv->task);
> +			kfree(priv->mss);
> +			kfree(priv);
> +		}
> +	}
> +	return ret;
> +}

Please change this method to use the typical goto pattern for error
handling. IMO repeating the undo steps in all error cases makes
mistakes (like the one above) more likely and increases the amount
of redundant code.

Also: The smaps file is only accessible to callers with
PTRACE_MODE_READ privileges on the target task. Your thing doesn't
do any access checks, neither in the open handler nor in the read
handler. Can you give an analysis of why it's okay to expose this
data? As far as I can tell, without spending a lot of time thinking
about it, this kind of data looks like it might potentially be
useful for side-channel information leaks or so.

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 16:29 ` Mateusz Guzik
  2016-08-09 16:56   ` Sonny Rao
@ 2016-08-09 20:17   ` Robert Foss
  2016-08-10 15:39     ` Robert Foss
  1 sibling, 1 reply; 24+ messages in thread
From: Robert Foss @ 2016-08-09 20:17 UTC (permalink / raw)
  To: Mateusz Guzik
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, adobriyan, jdanis, calvinowens, jann, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger



On 2016-08-09 12:29 PM, Mateusz Guzik wrote:
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>> From: Sonny Rao <sonnyrao@chromium.org>
>>
>> This is based on earlier work by Thiago Goncales. It implements a new
>> per process proc file which summarizes the contents of the smaps file
>> but doesn't display any addresses.  It gives more detailed information
>> than statm like the PSS (proportional set size).  It differs from the
>> original implementation in that it doesn't use the full blown set of
>> seq operations, uses a different termination condition, and doesn't
>> display "Locked" as that was broken in the original implementation.
>>
>> This new proc file provides information faster than parsing the potentially
>> huge smaps file.
>
> I have no idea about usefulness of this.
>
> The patch is definitely buggy with respect to how it implements actual
> access to mm.
>
>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> +{
>> +	struct proc_maps_private *priv = m->private;
>> +	struct mm_struct *mm;
>> +	struct vm_area_struct *vma;
>> +	struct mem_size_stats *mss_sum = priv->mss;
>> +
>> +	/* reference to priv->task already taken */
>> +	/* but need to get the mm here because */
>> +	/* task could be in the process of exiting */
>> +	mm = get_task_mm(priv->task);
>> +	if (!mm || IS_ERR(mm))
>> +		return -EINVAL;
>> +
>
> That's not how it's done in smaps.

Alright, I'll have to look into the difference between this approach and 
the smaps one.

>
>> +static int totmaps_open(struct inode *inode, struct file *file)
>> +{
>> +	struct proc_maps_private *priv;
>> +	int ret = -ENOMEM;
>> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> +	if (priv) {
>> +		priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
>> +		if (!priv->mss)
>> +			return -ENOMEM;
>
> Cases below explicitly kfree(priv). I can't remember whether the close
> routine gets called if this one fails. Either way, something is wrong
> here.

It looks fishy to me too, I'll have it reworked in v2.

>
>> +
>> +		/* we need to grab references to the task_struct */
>> +		/* at open time, because there's a potential information */
>> +		/* leak where the totmaps file is opened and held open */
>> +		/* while the underlying pid to task mapping changes */
>> +		/* underneath it */
>> +		priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
>
> This performs no permission checks that I would see. If you take a look
> at smaps you will see the user ends up in proc_maps_open which performs
> proc_mem_open(inode, PTRACE_MODE_READ) and gets a mm from there.

The proc_maps_open() function does seem to be doing everything I need.
I'll have a look at switching to using it.

Thanks for the heads up!


Rob.

>
>
>> +		if (!priv->task) {
>> +			kfree(priv->mss);
>> +			kfree(priv);
>> +			return -ESRCH;
>> +		}
>> +
>> +		ret = single_open(file, totmaps_proc_show, priv);
>> +		if (ret) {
>> +			put_task_struct(priv->task);
>> +			kfree(priv->mss);
>> +			kfree(priv);
>> +		}
>> +	}
>> +	return ret;
>> +}
>> +
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 19:24 ` Jann Horn
@ 2016-08-09 21:01   ` Robert Foss
  2016-08-09 22:30     ` Jann Horn
  2016-08-10 17:23     ` Sonny Rao
  0 siblings, 2 replies; 24+ messages in thread
From: Robert Foss @ 2016-08-09 21:01 UTC (permalink / raw)
  To: Jann Horn, Sonny Rao
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	mguzik, adobriyan, jdanis, calvinowens, mhocko, koct9i, vbabka,
	n-horiguchi, kirill.shutemov, ldufour, hannes, linux-kernel,
	Ben Zhang, Bryan Freed, Filipe Brandenburger



On 2016-08-09 03:24 PM, Jann Horn wrote:
> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>> From: Sonny Rao <sonnyrao@chromium.org>
>>
>> This is based on earlier work by Thiago Goncales. It implements a new
>> per process proc file which summarizes the contents of the smaps file
>> but doesn't display any addresses.  It gives more detailed information
>> than statm like the PSS (proportional set size).  It differs from the
>> original implementation in that it doesn't use the full blown set of
>> seq operations, uses a different termination condition, and doesn't
>> display "Locked" as that was broken in the original implementation.
>>
>> This new proc file provides information faster than parsing the potentially
>> huge smaps file.
>>
>> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
>>
>> Tested-by: Robert Foss <robert.foss@collabora.com>
>> Signed-off-by: Robert Foss <robert.foss@collabora.com>
>
>
>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> +{
>> +	struct proc_maps_private *priv = m->private;
>> +	struct mm_struct *mm;
>> +	struct vm_area_struct *vma;
>> +	struct mem_size_stats *mss_sum = priv->mss;
>> +
>> +	/* reference to priv->task already taken */
>> +	/* but need to get the mm here because */
>> +	/* task could be in the process of exiting */
>
> Can you please elaborate on this? My understanding here is that you
> intend for the caller to be able to repeatedly read the same totmaps
> file with pread() and still see updated information after the target
> process has called execve() and be able to detect process death
> (instead of simply seeing stale values). Is that accurate?
>
> I would prefer it if you could grab a reference to the mm_struct
> directly at open time.

Sonny, do you know more about the above comment?

>
>
>> +	mm = get_task_mm(priv->task);
>> +	if (!mm || IS_ERR(mm))
>> +		return -EINVAL;
>
> get_task_mm() doesn't return error codes, and all other callers just
> check whether the return value is NULL.
>

I'll have that fixed in v2, thanks for spotting it!

>
>> +	down_read(&mm->mmap_sem);
>> +	hold_task_mempolicy(priv);
>> +
>> +	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>> +		struct mem_size_stats mss;
>> +		struct mm_walk smaps_walk = {
>> +			.pmd_entry = smaps_pte_range,
>> +			.mm = vma->vm_mm,
>> +			.private = &mss,
>> +		};
>> +
>> +		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>> +			memset(&mss, 0, sizeof(mss));
>> +			walk_page_vma(vma, &smaps_walk);
>> +			add_smaps_sum(&mss, mss_sum);
>> +		}
>> +	}
>
> Errrr... what? You accumulate values from mem_size_stats items into a
> struct mss_sum that is associated with the struct file? So when you
> read the file the second time, you get the old values plus the new ones?
> And when you read the file in parallel, you get inconsistent values?
>
> For most files in procfs, the behavior is that you can just call
> pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
> you the current values every time, without mutating state. I strongly
> recommend that you get rid of priv->mss and just accumulate the state
> in a local variable (maybe one on the stack).

So a simple "static struct mem_size_stats" in totmaps_proc_show() would 
be a better solution?

>
>
>> @@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode, struct file *file)
>>  	return do_maps_open(inode, file, &proc_tid_smaps_op);
>>  }
>>
>> +static int totmaps_open(struct inode *inode, struct file *file)
>> +{
>> +	struct proc_maps_private *priv;
>> +	int ret = -ENOMEM;
>> +	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>> +	if (priv) {
>> +		priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
>> +		if (!priv->mss)
>> +			return -ENOMEM;
>
> Memory leak: If the first allocation works and the second one doesn't, this
> doesn't free the first allocation.
>
> Please change this to use the typical goto pattern for error handling.

Fix will be implemented in v2.

>
>> +
>> +		/* we need to grab references to the task_struct */
>> +		/* at open time, because there's a potential information */
>> +		/* leak where the totmaps file is opened and held open */
>> +		/* while the underlying pid to task mapping changes */
>> +		/* underneath it */
>
> Nit: That's not how comments are done in the kernel. Maybe change this to
> a normal block comment instead of one block comment per line?

I'm not sure how that one slipped by, but I'll change it in v2.

>
>> +		priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
>
> `get_pid_task(proc_pid(inode), PIDTYPE_PID)` is exactly the definition
> of get_proc_task(inode), maybe use that instead?
>

Will do. v2 will fix this.

>> +		if (!priv->task) {
>> +			kfree(priv->mss);
>> +			kfree(priv);
>> +			return -ESRCH;
>> +		}
>> +
>> +		ret = single_open(file, totmaps_proc_show, priv);
>> +		if (ret) {
>> +			put_task_struct(priv->task);
>> +			kfree(priv->mss);
>> +			kfree(priv);
>> +		}
>> +	}
>> +	return ret;
>> +}
>
> Please change this method to use the typical goto pattern for error
> handling. IMO repeating the undo steps in all error cases makes
> mistakes (like the one above) more likely and increases the amount
> of redundant code.

Agreed. Change queued for v2.

>
> Also: The smaps file is only accessible to callers with
> PTRACE_MODE_READ privileges on the target task. Your thing doesn't
> do any access checks, neither in the open handler nor in the read
> handler. Can you give an analysis of why it's okay to expose this
> data? As far as I can tell, without spending a lot of time thinking
> about it, this kind of data looks like it might potentially be
> useful for side-channel information leaks or so.
>

I think it should require the same permissions as smaps, so changing the 
code to require PTRACE_MODE_READ privileges is most likely a good idea. 
I'll have a look at it for v2.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 21:01   ` Robert Foss
@ 2016-08-09 22:30     ` Jann Horn
  2016-08-10 14:16       ` Robert Foss
  2016-08-10 17:23     ` Sonny Rao
  1 sibling, 1 reply; 24+ messages in thread
From: Jann Horn @ 2016-08-09 22:30 UTC (permalink / raw)
  To: Robert Foss
  Cc: Sonny Rao, akpm, keescook, viro, gorcunov, john.stultz,
	plaguedbypenguins, mguzik, adobriyan, jdanis, calvinowens,
	mhocko, koct9i, vbabka, n-horiguchi, kirill.shutemov, ldufour,
	hannes, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger

[-- Attachment #1: Type: text/plain, Size: 1562 bytes --]

On Tue, Aug 09, 2016 at 05:01:44PM -0400, Robert Foss wrote:
> On 2016-08-09 03:24 PM, Jann Horn wrote:
> >On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> >>+	down_read(&mm->mmap_sem);
> >>+	hold_task_mempolicy(priv);
> >>+
> >>+	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
> >>+		struct mem_size_stats mss;
> >>+		struct mm_walk smaps_walk = {
> >>+			.pmd_entry = smaps_pte_range,
> >>+			.mm = vma->vm_mm,
> >>+			.private = &mss,
> >>+		};
> >>+
> >>+		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
> >>+			memset(&mss, 0, sizeof(mss));
> >>+			walk_page_vma(vma, &smaps_walk);
> >>+			add_smaps_sum(&mss, mss_sum);
> >>+		}
> >>+	}
> >
> >Errrr... what? You accumulate values from mem_size_stats items into a
> >struct mss_sum that is associated with the struct file? So when you
> >read the file the second time, you get the old values plus the new ones?
> >And when you read the file in parallel, you get inconsistent values?
> >
> >For most files in procfs, the behavior is that you can just call
> >pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
> >you the current values every time, without mutating state. I strongly
> >recommend that you get rid of priv->mss and just accumulate the state
> >in a local variable (maybe one on the stack).
> 
> So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
> better solution?

Er, why "static"? Are you trying to create shared state between different
readers for some reason?

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 19:16 ` Konstantin Khlebnikov
@ 2016-08-10  0:30   ` Sonny Rao
  0 siblings, 0 replies; 24+ messages in thread
From: Sonny Rao @ 2016-08-10  0:30 UTC (permalink / raw)
  To: Konstantin Khlebnikov
  Cc: Robert Foss, Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov,
	John Stultz, Robin Humble, Mateusz Guzik, Alexey Dobriyan,
	Janis Danisevskis, calvinowens, Jann Horn, Michal Hocko,
	Vlastimil Babka, Naoya Horiguchi, Kirill A. Shutemov, ldufour,
	Johannes Weiner, Linux Kernel Mailing List, Ben Zhang,
	Bryan Freed, Filipe Brandenburger

On Tue, Aug 9, 2016 at 12:16 PM, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
>
> On Tue, Aug 9, 2016 at 7:05 PM,  <robert.foss@collabora.com> wrote:
> > From: Sonny Rao <sonnyrao@chromium.org>
> >
> > This is based on earlier work by Thiago Goncales. It implements a new
> > per process proc file which summarizes the contents of the smaps file
> > but doesn't display any addresses.  It gives more detailed information
> > than statm like the PSS (proportional set size).  It differs from the
> > original implementation in that it doesn't use the full blown set of
> > seq operations, uses a different termination condition, and doesn't
> > display "Locked" as that was broken in the original implementation.
> >
> > This new proc file provides information faster than parsing the potentially
> > huge smaps file.
>
> What statistics do you really need?

PSS (Proportional Set Size) and related accounting of shared pages
(swap could be shared) is where the existing summaries of memory usage
are cumbersome.

>
>
> I think, performance and flexibility issues could be really solved only by new
> syscall for querying memory statistics for address range in any process:
> process_vm_stat() or some kind of pumped fincore() for /proc/$pid/mem


That would be a good long term solution if people want similarly
complicated statistics without having to iterate through current
interfaces.
I mentioned monitoring before but I'll add that Proportional Set size,
Unique Set Size, Swap are per process are also useful because they
help us make better decisions about what processes need to be
throttled or gracefully killed.

>
> >
> > Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
> >
> > Tested-by: Robert Foss <robert.foss@collabora.com>
> > Signed-off-by: Robert Foss <robert.foss@collabora.com>
> >
> > ---
> >  fs/proc/base.c     |   1 +
> >  fs/proc/internal.h |   4 ++
> >  fs/proc/task_mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 131 insertions(+)
> >
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index a11eb71..de3acdf 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -2855,6 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
> >         REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
> >         REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
> >         REG("pagemap",    S_IRUSR, proc_pagemap_operations),
> > +       REG("totmaps",    S_IRUGO, proc_totmaps_operations),
> >  #endif
> >  #ifdef CONFIG_SECURITY
> >         DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
> > diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> > index aa27810..6f3540f 100644
> > --- a/fs/proc/internal.h
> > +++ b/fs/proc/internal.h
> > @@ -58,6 +58,9 @@ union proc_op {
> >                 struct task_struct *task);
> >  };
> >
> > +
> > +extern const struct file_operations proc_totmaps_operations;
> > +
> >  struct proc_inode {
> >         struct pid *pid;
> >         int fd;
> > @@ -281,6 +284,7 @@ struct proc_maps_private {
> >         struct mm_struct *mm;
> >  #ifdef CONFIG_MMU
> >         struct vm_area_struct *tail_vma;
> > +       struct mem_size_stats *mss;
> >  #endif
> >  #ifdef CONFIG_NUMA
> >         struct mempolicy *task_mempolicy;
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index 4648c7f..b61873e 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -802,6 +802,81 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
> >         return 0;
> >  }
> >
> > +static void add_smaps_sum(struct mem_size_stats *mss,
> > +               struct mem_size_stats *mss_sum)
> > +{
> > +       mss_sum->resident += mss->resident;
> > +       mss_sum->pss += mss->pss;
> > +       mss_sum->shared_clean += mss->shared_clean;
> > +       mss_sum->shared_dirty += mss->shared_dirty;
> > +       mss_sum->private_clean += mss->private_clean;
> > +       mss_sum->private_dirty += mss->private_dirty;
> > +       mss_sum->referenced += mss->referenced;
> > +       mss_sum->anonymous += mss->anonymous;
> > +       mss_sum->anonymous_thp += mss->anonymous_thp;
> > +       mss_sum->swap += mss->swap;
> > +}
> > +
> > +static int totmaps_proc_show(struct seq_file *m, void *data)
> > +{
> > +       struct proc_maps_private *priv = m->private;
> > +       struct mm_struct *mm;
> > +       struct vm_area_struct *vma;
> > +       struct mem_size_stats *mss_sum = priv->mss;
> > +
> > +       /* reference to priv->task already taken */
> > +       /* but need to get the mm here because */
> > +       /* task could be in the process of exiting */
> > +       mm = get_task_mm(priv->task);
> > +       if (!mm || IS_ERR(mm))
> > +               return -EINVAL;
> > +
> > +       down_read(&mm->mmap_sem);
> > +       hold_task_mempolicy(priv);
> > +
> > +       for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
> > +               struct mem_size_stats mss;
> > +               struct mm_walk smaps_walk = {
> > +                       .pmd_entry = smaps_pte_range,
> > +                       .mm = vma->vm_mm,
> > +                       .private = &mss,
> > +               };
> > +
> > +               if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
> > +                       memset(&mss, 0, sizeof(mss));
> > +                       walk_page_vma(vma, &smaps_walk);
> > +                       add_smaps_sum(&mss, mss_sum);
> > +               }
> > +       }
> > +       seq_printf(m,
> > +                  "Rss:            %8lu kB\n"
> > +                  "Pss:            %8lu kB\n"
> > +                  "Shared_Clean:   %8lu kB\n"
> > +                  "Shared_Dirty:   %8lu kB\n"
> > +                  "Private_Clean:  %8lu kB\n"
> > +                  "Private_Dirty:  %8lu kB\n"
> > +                  "Referenced:     %8lu kB\n"
> > +                  "Anonymous:      %8lu kB\n"
> > +                  "AnonHugePages:  %8lu kB\n"
> > +                  "Swap:           %8lu kB\n",
> > +                  mss_sum->resident >> 10,
> > +                  (unsigned long)(mss_sum->pss >> (10 + PSS_SHIFT)),
> > +                  mss_sum->shared_clean  >> 10,
> > +                  mss_sum->shared_dirty  >> 10,
> > +                  mss_sum->private_clean >> 10,
> > +                  mss_sum->private_dirty >> 10,
> > +                  mss_sum->referenced >> 10,
> > +                  mss_sum->anonymous >> 10,
> > +                  mss_sum->anonymous_thp >> 10,
> > +                  mss_sum->swap >> 10);
> > +
> > +       release_task_mempolicy(priv);
> > +       up_read(&mm->mmap_sem);
> > +       mmput(mm);
> > +
> > +       return 0;
> > +}
> > +
> >  static int show_pid_smap(struct seq_file *m, void *v)
> >  {
> >         return show_smap(m, v, 1);
> > @@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode, struct file *file)
> >         return do_maps_open(inode, file, &proc_tid_smaps_op);
> >  }
> >
> > +static int totmaps_open(struct inode *inode, struct file *file)
> > +{
> > +       struct proc_maps_private *priv;
> > +       int ret = -ENOMEM;
> > +       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> > +       if (priv) {
> > +               priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
> > +               if (!priv->mss)
> > +                       return -ENOMEM;
> > +
> > +               /* we need to grab references to the task_struct */
> > +               /* at open time, because there's a potential information */
> > +               /* leak where the totmaps file is opened and held open */
> > +               /* while the underlying pid to task mapping changes */
> > +               /* underneath it */
> > +               priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
> > +               if (!priv->task) {
> > +                       kfree(priv->mss);
> > +                       kfree(priv);
> > +                       return -ESRCH;
> > +               }
> > +
> > +               ret = single_open(file, totmaps_proc_show, priv);
> > +               if (ret) {
> > +                       put_task_struct(priv->task);
> > +                       kfree(priv->mss);
> > +                       kfree(priv);
> > +               }
> > +       }
> > +       return ret;
> > +}
> > +
> > +static int totmaps_release(struct inode *inode, struct file *file)
> > +{
> > +       struct seq_file *m = file->private_data;
> > +       struct proc_maps_private *priv = m->private;
> > +
> > +       put_task_struct(priv->task);
> > +       kfree(priv->mss);
> > +       kfree(priv);
> > +       m->private = NULL;
> > +       return single_release(inode, file);
> > +}
> > +
> >  const struct file_operations proc_pid_smaps_operations = {
> >         .open           = pid_smaps_open,
> >         .read           = seq_read,
> > @@ -850,6 +969,13 @@ const struct file_operations proc_tid_smaps_operations = {
> >         .release        = proc_map_release,
> >  };
> >
> > +const struct file_operations proc_totmaps_operations = {
> > +       .open           = totmaps_open,
> > +       .read           = seq_read,
> > +       .llseek         = seq_lseek,
> > +       .release        = totmaps_release,
> > +};
> > +
> >  enum clear_refs_types {
> >         CLEAR_REFS_ALL = 1,
> >         CLEAR_REFS_ANON,
> > --
> > 2.7.4
> >

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 22:30     ` Jann Horn
@ 2016-08-10 14:16       ` Robert Foss
  2016-08-10 15:02         ` Jann Horn
  0 siblings, 1 reply; 24+ messages in thread
From: Robert Foss @ 2016-08-10 14:16 UTC (permalink / raw)
  To: Jann Horn
  Cc: Sonny Rao, akpm, keescook, viro, gorcunov, john.stultz,
	plaguedbypenguins, mguzik, adobriyan, jdanis, calvinowens,
	mhocko, koct9i, vbabka, n-horiguchi, kirill.shutemov, ldufour,
	hannes, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger



On 2016-08-09 06:30 PM, Jann Horn wrote:
> On Tue, Aug 09, 2016 at 05:01:44PM -0400, Robert Foss wrote:
>> On 2016-08-09 03:24 PM, Jann Horn wrote:
>>> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>>>> +	down_read(&mm->mmap_sem);
>>>> +	hold_task_mempolicy(priv);
>>>> +
>>>> +	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>>>> +		struct mem_size_stats mss;
>>>> +		struct mm_walk smaps_walk = {
>>>> +			.pmd_entry = smaps_pte_range,
>>>> +			.mm = vma->vm_mm,
>>>> +			.private = &mss,
>>>> +		};
>>>> +
>>>> +		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>>>> +			memset(&mss, 0, sizeof(mss));
>>>> +			walk_page_vma(vma, &smaps_walk);
>>>> +			add_smaps_sum(&mss, mss_sum);
>>>> +		}
>>>> +	}
>>>
>>> Errrr... what? You accumulate values from mem_size_stats items into a
>>> struct mss_sum that is associated with the struct file? So when you
>>> read the file the second time, you get the old values plus the new ones?
>>> And when you read the file in parallel, you get inconsistent values?
>>>
>>> For most files in procfs, the behavior is that you can just call
>>> pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
>>> you the current values every time, without mutating state. I strongly
>>> recommend that you get rid of priv->mss and just accumulate the state
>>> in a local variable (maybe one on the stack).
>>
>> So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
>> better solution?
>
> Er, why "static"? Are you trying to create shared state between different
> readers for some reason?
>

I think I'm a bit confused now, how are you suggesting that I replace 
priv->mss?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 14:16       ` Robert Foss
@ 2016-08-10 15:02         ` Jann Horn
  2016-08-10 16:24           ` Robert Foss
  0 siblings, 1 reply; 24+ messages in thread
From: Jann Horn @ 2016-08-10 15:02 UTC (permalink / raw)
  To: Robert Foss
  Cc: Sonny Rao, akpm, keescook, viro, gorcunov, john.stultz,
	plaguedbypenguins, mguzik, adobriyan, jdanis, calvinowens,
	mhocko, koct9i, vbabka, n-horiguchi, kirill.shutemov, ldufour,
	hannes, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger

[-- Attachment #1: Type: text/plain, Size: 3154 bytes --]

On Wed, Aug 10, 2016 at 10:16:45AM -0400, Robert Foss wrote:
> 
> 
> On 2016-08-09 06:30 PM, Jann Horn wrote:
> >On Tue, Aug 09, 2016 at 05:01:44PM -0400, Robert Foss wrote:
> >>On 2016-08-09 03:24 PM, Jann Horn wrote:
> >>>On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> >>>>+	down_read(&mm->mmap_sem);
> >>>>+	hold_task_mempolicy(priv);
> >>>>+
> >>>>+	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
> >>>>+		struct mem_size_stats mss;
> >>>>+		struct mm_walk smaps_walk = {
> >>>>+			.pmd_entry = smaps_pte_range,
> >>>>+			.mm = vma->vm_mm,
> >>>>+			.private = &mss,
> >>>>+		};
> >>>>+
> >>>>+		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
> >>>>+			memset(&mss, 0, sizeof(mss));
> >>>>+			walk_page_vma(vma, &smaps_walk);
> >>>>+			add_smaps_sum(&mss, mss_sum);
> >>>>+		}
> >>>>+	}
> >>>
> >>>Errrr... what? You accumulate values from mem_size_stats items into a
> >>>struct mss_sum that is associated with the struct file? So when you
> >>>read the file the second time, you get the old values plus the new ones?
> >>>And when you read the file in parallel, you get inconsistent values?
> >>>
> >>>For most files in procfs, the behavior is that you can just call
> >>>pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
> >>>you the current values every time, without mutating state. I strongly
> >>>recommend that you get rid of priv->mss and just accumulate the state
> >>>in a local variable (maybe one on the stack).
> >>
> >>So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
> >>better solution?
> >
> >Er, why "static"? Are you trying to create shared state between different
> >readers for some reason?
> >
> 
> I think I'm a bit confused now, how are you suggesting that I replace
> priv->mss?

Like this:

static int totmaps_proc_show(struct seq_file *m, void *data)
{
        struct proc_maps_private *priv = m->private;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        struct mem_size_stats mss_sum;

        memset(&mss_sum, 0, sizeof(mss_sum));

        [...]

        for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
                struct mem_size_stats mss;
                struct mm_walk smaps_walk = {
                        .pmd_entry = smaps_pte_range,
                        .mm = vma->vm_mm,
                        .private = &mss,
                };

                if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
                        memset(&mss, 0, sizeof(mss));
                        walk_page_vma(vma, &smaps_walk);
                        add_smaps_sum(&mss, &mss_sum);
                }
        }
        seq_printf(m,
                   "Rss:            %8lu kB\n"
                   "Pss:            %8lu kB\n"
                   "Shared_Clean:   %8lu kB\n"
                   [...],
                   mss_sum.resident >> 10,
                   (unsigned long)(mss_sum.pss >> (10 + PSS_SHIFT)),
                   mss_sum.shared_clean  >> 10,
                   [...]);
        [...]
}

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 20:17   ` Robert Foss
@ 2016-08-10 15:39     ` Robert Foss
  2016-08-10 15:42       ` Mateusz Guzik
  0 siblings, 1 reply; 24+ messages in thread
From: Robert Foss @ 2016-08-10 15:39 UTC (permalink / raw)
  To: Mateusz Guzik
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, adobriyan, jdanis, calvinowens, jann, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger



On 2016-08-09 04:17 PM, Robert Foss wrote:
>>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>>> +{
>>> +    struct proc_maps_private *priv = m->private;
>>> +    struct mm_struct *mm;
>>> +    struct vm_area_struct *vma;
>>> +    struct mem_size_stats *mss_sum = priv->mss;
>>> +
>>> +    /* reference to priv->task already taken */
>>> +    /* but need to get the mm here because */
>>> +    /* task could be in the process of exiting */
>>> +    mm = get_task_mm(priv->task);
>>> +    if (!mm || IS_ERR(mm))
>>> +        return -EINVAL;
>>> +
>>
>> That's not how it's done in smaps.
>
> Alright, I'll have to look into the difference between this approach and
> the smaps one.


I had a look at show_smaps(), and it's not entirely clear to me what the 
advantage of doing it the show_smaps() way is.

mm = get_task_mm(priv->task) is needed to iterate through all of the 
mappings. Is there a preferable way of doing that?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 15:39     ` Robert Foss
@ 2016-08-10 15:42       ` Mateusz Guzik
  2016-08-10 15:50         ` Robert Foss
  0 siblings, 1 reply; 24+ messages in thread
From: Mateusz Guzik @ 2016-08-10 15:42 UTC (permalink / raw)
  To: Robert Foss
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, adobriyan, jdanis, calvinowens, jann, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger

On Wed, Aug 10, 2016 at 11:39:12AM -0400, Robert Foss wrote:
> 
> 
> On 2016-08-09 04:17 PM, Robert Foss wrote:
> > > > +static int totmaps_proc_show(struct seq_file *m, void *data)
> > > > +{
> > > > +    struct proc_maps_private *priv = m->private;
> > > > +    struct mm_struct *mm;
> > > > +    struct vm_area_struct *vma;
> > > > +    struct mem_size_stats *mss_sum = priv->mss;
> > > > +
> > > > +    /* reference to priv->task already taken */
> > > > +    /* but need to get the mm here because */
> > > > +    /* task could be in the process of exiting */
> > > > +    mm = get_task_mm(priv->task);
> > > > +    if (!mm || IS_ERR(mm))
> > > > +        return -EINVAL;
> > > > +
> > > 
> > > That's not how it's done in smaps.
> > 
> > Alright, I'll have to look into the difference between this approach and
> > the smaps one.
> 
> 
> I had a look at show_smaps(), and it's not entirely clear to me what the
> advantage of doing it show_smaps() way.
> 
> mm = get_task_mm(priv->task) is needed to iterate through all of the
> mappings. Is there a preferable way of doing that?

In the other part of the mail I stated smaps goes to proc_maps_open
which has:
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);

This gives you stable access to mm and all needed permission checks.

Then, in the read routine you can just:
if (!atomic_inc_not_zero(&mm->mm_users))
	goto thats_it;

See smaps routines or e.g. environ_read.
-- 
Mateusz Guzik

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 15:42       ` Mateusz Guzik
@ 2016-08-10 15:50         ` Robert Foss
  0 siblings, 0 replies; 24+ messages in thread
From: Robert Foss @ 2016-08-10 15:50 UTC (permalink / raw)
  To: Mateusz Guzik
  Cc: akpm, keescook, viro, gorcunov, john.stultz, plaguedbypenguins,
	sonnyrao, adobriyan, jdanis, calvinowens, jann, mhocko, koct9i,
	vbabka, n-horiguchi, kirill.shutemov, ldufour, hannes,
	linux-kernel, Ben Zhang, Bryan Freed, Filipe Brandenburger



On 2016-08-10 11:42 AM, Mateusz Guzik wrote:
> On Wed, Aug 10, 2016 at 11:39:12AM -0400, Robert Foss wrote:
>>
>>
>> On 2016-08-09 04:17 PM, Robert Foss wrote:
>>>>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>>>>> +{
>>>>> +    struct proc_maps_private *priv = m->private;
>>>>> +    struct mm_struct *mm;
>>>>> +    struct vm_area_struct *vma;
>>>>> +    struct mem_size_stats *mss_sum = priv->mss;
>>>>> +
>>>>> +    /* reference to priv->task already taken */
>>>>> +    /* but need to get the mm here because */
>>>>> +    /* task could be in the process of exiting */
>>>>> +    mm = get_task_mm(priv->task);
>>>>> +    if (!mm || IS_ERR(mm))
>>>>> +        return -EINVAL;
>>>>> +
>>>>
>>>> That's not how it's done in smaps.
>>>
>>> Alright, I'll have to look into the difference between this approach and
>>> the smaps one.
>>
>>
>> I had a look at show_smaps(), and it's not entirely clear to me what the
>> advantage of doing it show_smaps() way.
>>
>> mm = get_task_mm(priv->task) is needed to iterate through all of the
>> mappings. Is there a preferable way of doing that?
>
> In the other part of the mail I stated smaps goes to proc_maps_open
> which has:
> priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
>
> This gives you stable access to mm and all needed permission checks.
>
> Then, in the read routine you can just:
> if (!atomic_inc_not_zero(&mm->mm_users))
> 	goto thats_it;
>
> See smaps routines or e.g. environ_read.
>

Ah! I see what you mean now. Thanks for the clarification!


Rob.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 15:02         ` Jann Horn
@ 2016-08-10 16:24           ` Robert Foss
  0 siblings, 0 replies; 24+ messages in thread
From: Robert Foss @ 2016-08-10 16:24 UTC (permalink / raw)
  To: Jann Horn
  Cc: Sonny Rao, akpm, keescook, viro, gorcunov, john.stultz,
	plaguedbypenguins, mguzik, adobriyan, jdanis, calvinowens,
	mhocko, koct9i, vbabka, n-horiguchi, kirill.shutemov, ldufour,
	hannes, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger



On 2016-08-10 11:02 AM, Jann Horn wrote:
> On Wed, Aug 10, 2016 at 10:16:45AM -0400, Robert Foss wrote:
>>
>>
>> On 2016-08-09 06:30 PM, Jann Horn wrote:
>>> On Tue, Aug 09, 2016 at 05:01:44PM -0400, Robert Foss wrote:
>>>> On 2016-08-09 03:24 PM, Jann Horn wrote:
>>>>> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>>>>>> +	down_read(&mm->mmap_sem);
>>>>>> +	hold_task_mempolicy(priv);
>>>>>> +
>>>>>> +	for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>>>>>> +		struct mem_size_stats mss;
>>>>>> +		struct mm_walk smaps_walk = {
>>>>>> +			.pmd_entry = smaps_pte_range,
>>>>>> +			.mm = vma->vm_mm,
>>>>>> +			.private = &mss,
>>>>>> +		};
>>>>>> +
>>>>>> +		if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>>>>>> +			memset(&mss, 0, sizeof(mss));
>>>>>> +			walk_page_vma(vma, &smaps_walk);
>>>>>> +			add_smaps_sum(&mss, mss_sum);
>>>>>> +		}
>>>>>> +	}
>>>>>
>>>>> Errrr... what? You accumulate values from mem_size_stats items into a
>>>>> struct mss_sum that is associated with the struct file? So when you
>>>>> read the file the second time, you get the old values plus the new ones?
>>>>> And when you read the file in parallel, you get inconsistent values?
>>>>>
>>>>> For most files in procfs, the behavior is that you can just call
>>>>> pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
>>>>> you the current values every time, without mutating state. I strongly
>>>>> recommend that you get rid of priv->mss and just accumulate the state
>>>>> in a local variable (maybe one on the stack).
>>>>
>>>> So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
>>>> better solution?
>>>
>>> Er, why "static"? Are you trying to create shared state between different
>>> readers for some reason?
>>>
>>
>> I think I'm a bit confused now, how are you suggesting that I replace
>> priv->mss?
>
> Like this:
>
> static int totmaps_proc_show(struct seq_file *m, void *data)
> {
>         struct proc_maps_private *priv = m->private;
>         struct mm_struct *mm;
>         struct vm_area_struct *vma;
>         struct mem_size_stats mss_sum;
>
>         memset(&mss_sum, 0, sizeof(mss_sum));
>
>         [...]
>
>         for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>                 struct mem_size_stats mss;
>                 struct mm_walk smaps_walk = {
>                         .pmd_entry = smaps_pte_range,
>                         .mm = vma->vm_mm,
>                         .private = &mss,
>                 };
>
>                 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>                         memset(&mss, 0, sizeof(mss));
>                         walk_page_vma(vma, &smaps_walk);
>                         add_smaps_sum(&mss, &mss_sum);
>                 }
>         }
>         seq_printf(m,
>                    "Rss:            %8lu kB\n"
>                    "Pss:            %8lu kB\n"
>                    "Shared_Clean:   %8lu kB\n"
>                    [...],
>                    mss_sum.resident >> 10,
>                    (unsigned long)(mss_sum.pss >> (10 + PSS_SHIFT)),
>                    mss_sum.shared_clean  >> 10,
>                    [...]);
>         [...]
> }
>


Thanks Jann for being really clear about this stuff. It is much appreciated!

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-09 21:01   ` Robert Foss
  2016-08-09 22:30     ` Jann Horn
@ 2016-08-10 17:23     ` Sonny Rao
  2016-08-10 17:37       ` Jann Horn
  1 sibling, 1 reply; 24+ messages in thread
From: Sonny Rao @ 2016-08-10 17:23 UTC (permalink / raw)
  To: Robert Foss
  Cc: Jann Horn, Andrew Morton, Kees Cook, viro, gorcunov, John Stultz,
	plaguedbypenguins, Mateusz Guzik, adobriyan, jdanis, calvinowens,
	mhocko, koct9i, vbabka, n-horiguchi, kirill.shutemov, ldufour,
	Johannes Weiner, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger

On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss@collabora.com> wrote:
>
>
> On 2016-08-09 03:24 PM, Jann Horn wrote:
>>
>> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>>>
>>> From: Sonny Rao <sonnyrao@chromium.org>
>>>
>>> This is based on earlier work by Thiago Goncales. It implements a new
>>> per process proc file which summarizes the contents of the smaps file
>>> but doesn't display any addresses.  It gives more detailed information
>>> than statm like the PSS (proprotional set size).  It differs from the
>>> original implementation in that it doesn't use the full blown set of
>>> seq operations, uses a different termination condition, and doesn't
>>> displayed "Locked" as that was broken on the original implemenation.
>>>
>>> This new proc file provides information faster than parsing the
>>> potentially
>>> huge smaps file.
>>>
>>> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
>>>
>>> Tested-by: Robert Foss <robert.foss@collabora.com>
>>> Signed-off-by: Robert Foss <robert.foss@collabora.com>
>>
>>
>>
>>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>>> +{
>>> +       struct proc_maps_private *priv = m->private;
>>> +       struct mm_struct *mm;
>>> +       struct vm_area_struct *vma;
>>> +       struct mem_size_stats *mss_sum = priv->mss;
>>> +
>>> +       /* reference to priv->task already taken */
>>> +       /* but need to get the mm here because */
>>> +       /* task could be in the process of exiting */
>>
>>
>> Can you please elaborate on this? My understanding here is that you
>> intend for the caller to be able to repeatedly read the same totmaps
>> file with pread() and still see updated information after the target
>> process has called execve() and be able to detect process death
>> (instead of simply seeing stale values). Is that accurate?
>>
>> I would prefer it if you could grab a reference to the mm_struct
>> directly at open time.
>
>
> Sonny, do you know more about the above comment?

I think right now the file gets re-opened every time, but the mode
where the file is opened once and repeatedly read is interesting
because it avoids having to open the file again and again.

I guess you could end up with a weird situation where you don't read
the entire contents of the file in one call to read() and you might
get inconsistent data across the different statistics?

>
>>
>>
>>> +       mm = get_task_mm(priv->task);
>>> +       if (!mm || IS_ERR(mm))
>>> +               return -EINVAL;
>>
>>
>> get_task_mm() doesn't return error codes, and all other callers just
>> check whether the return value is NULL.
>>
>
> I'll have that fixed in v2, thanks for spotting it!
>
>
>>
>>> +       down_read(&mm->mmap_sem);
>>> +       hold_task_mempolicy(priv);
>>> +
>>> +       for (vma = mm->mmap; vma != priv->tail_vma; vma = vma->vm_next) {
>>> +               struct mem_size_stats mss;
>>> +               struct mm_walk smaps_walk = {
>>> +                       .pmd_entry = smaps_pte_range,
>>> +                       .mm = vma->vm_mm,
>>> +                       .private = &mss,
>>> +               };
>>> +
>>> +               if (vma->vm_mm && !is_vm_hugetlb_page(vma)) {
>>> +                       memset(&mss, 0, sizeof(mss));
>>> +                       walk_page_vma(vma, &smaps_walk);
>>> +                       add_smaps_sum(&mss, mss_sum);
>>> +               }
>>> +       }
>>
>>
>> Errrr... what? You accumulate values from mem_size_stats items into a
>> struct mss_sum that is associated with the struct file? So when you
>> read the file the second time, you get the old values plus the new ones?
>> And when you read the file in parallel, you get inconsistent values?
>>
>> For most files in procfs, the behavior is that you can just call
>> pread(fd, buf, sizeof(buf), 0) on the same fd again and again, giving
>> you the current values every time, without mutating state. I strongly
>> recommend that you get rid of priv->mss and just accumulate the state
>> in a local variable (maybe one on the stack).
>
>
> So a simple "static struct mem_size_stats" in totmaps_proc_show() would be a
> better solution?
>
>>
>>
>>> @@ -836,6 +911,50 @@ static int tid_smaps_open(struct inode *inode,
>>> struct file *file)
>>>         return do_maps_open(inode, file, &proc_tid_smaps_op);
>>>  }
>>>
>>> +static int totmaps_open(struct inode *inode, struct file *file)
>>> +{
>>> +       struct proc_maps_private *priv;
>>> +       int ret = -ENOMEM;
>>> +       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
>>> +       if (priv) {
>>> +               priv->mss = kzalloc(sizeof(*priv->mss), GFP_KERNEL);
>>> +               if (!priv->mss)
>>> +                       return -ENOMEM;
>>
>>
>> Memory leak: If the first allocation works and the second one doesn't,
>> this
>> doesn't free the first allocation.
>>
>> Please change this to use the typical goto pattern for error handling.
>
>
> Fix will be implemented in v2.
>
>>
>>> +
>>> +               /* we need to grab references to the task_struct */
>>> +               /* at open time, because there's a potential information
>>> */
>>> +               /* leak where the totmaps file is opened and held open */
>>> +               /* while the underlying pid to task mapping changes */
>>> +               /* underneath it */
>>
>>
>> Nit: That's not how comments are done in the kernel. Maybe change this to
>> a normal block comment instead of one block comment per line?
>
>
> I'm not sure how that one slipped by, but I'll change it in v2.
>
>>
>>> +               priv->task = get_pid_task(proc_pid(inode), PIDTYPE_PID);
>>
>>
>> `get_pid_task(proc_pid(inode), PIDTYPE_PID)` is exactly the definition
>> of get_proc_task(inode), maybe use that instead?
>>
>
> Will do. v2 will fix this.
>
>>> +               if (!priv->task) {
>>> +                       kfree(priv->mss);
>>> +                       kfree(priv);
>>> +                       return -ESRCH;
>>> +               }
>>> +
>>> +               ret = single_open(file, totmaps_proc_show, priv);
>>> +               if (ret) {
>>> +                       put_task_struct(priv->task);
>>> +                       kfree(priv->mss);
>>> +                       kfree(priv);
>>> +               }
>>> +       }
>>> +       return ret;
>>> +}
>>
>>
>> Please change this method to use the typical goto pattern for error
>> handling. IMO repeating the undo steps in all error cases makes
>> mistakes (like the one above) more likely and increases the amount
>> of redundant code.
>
>
> Agreed. Change queued for v2.
>
>>
>> Also: The smaps file is only accessible to callers with
>> PTRACE_MODE_READ privileges on the target task. Your thing doesn't
>> do any access checks, neither in the open handler nor in the read
>> handler. Can you give an analysis of why it's okay to expose this
>> data? As far as I can tell, without spending a lot of time thinking
>> about it, this kind of data looks like it might potentially be
>> useful for side-channel information leaks or so.
>>
>
> I think it should require the same permissions as smaps, so changing the
> code to require PTRACE_MODE_READ privileges is most likely a good idea. I'll
> have a look at it for v2.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 17:23     ` Sonny Rao
@ 2016-08-10 17:37       ` Jann Horn
  2016-08-10 17:45         ` Sonny Rao
  0 siblings, 1 reply; 24+ messages in thread
From: Jann Horn @ 2016-08-10 17:37 UTC (permalink / raw)
  To: Sonny Rao
  Cc: Robert Foss, Andrew Morton, Kees Cook, viro, gorcunov,
	John Stultz, plaguedbypenguins, Mateusz Guzik, adobriyan, jdanis,
	calvinowens, mhocko, koct9i, vbabka, n-horiguchi,
	kirill.shutemov, ldufour, Johannes Weiner, linux-kernel,
	Ben Zhang, Bryan Freed, Filipe Brandenburger

[-- Attachment #1: Type: text/plain, Size: 3080 bytes --]

On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
> On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss@collabora.com> wrote:
> >
> >
> > On 2016-08-09 03:24 PM, Jann Horn wrote:
> >>
> >> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> >>>
> >>> From: Sonny Rao <sonnyrao@chromium.org>
> >>>
> >>> This is based on earlier work by Thiago Goncales. It implements a new
> >>> per process proc file which summarizes the contents of the smaps file
> >>> but doesn't display any addresses.  It gives more detailed information
> >>> than statm like the PSS (proprotional set size).  It differs from the
> >>> original implementation in that it doesn't use the full blown set of
> >>> seq operations, uses a different termination condition, and doesn't
> >>> displayed "Locked" as that was broken on the original implemenation.
> >>>
> >>> This new proc file provides information faster than parsing the
> >>> potentially
> >>> huge smaps file.
> >>>
> >>> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
> >>>
> >>> Tested-by: Robert Foss <robert.foss@collabora.com>
> >>> Signed-off-by: Robert Foss <robert.foss@collabora.com>
> >>
> >>
> >>
> >>> +static int totmaps_proc_show(struct seq_file *m, void *data)
> >>> +{
> >>> +       struct proc_maps_private *priv = m->private;
> >>> +       struct mm_struct *mm;
> >>> +       struct vm_area_struct *vma;
> >>> +       struct mem_size_stats *mss_sum = priv->mss;
> >>> +
> >>> +       /* reference to priv->task already taken */
> >>> +       /* but need to get the mm here because */
> >>> +       /* task could be in the process of exiting */
> >>
> >>
> >> Can you please elaborate on this? My understanding here is that you
> >> intend for the caller to be able to repeatedly read the same totmaps
> >> file with pread() and still see updated information after the target
> >> process has called execve() and be able to detect process death
> >> (instead of simply seeing stale values). Is that accurate?
> >>
> >> I would prefer it if you could grab a reference to the mm_struct
> >> directly at open time.
> >
> >
> > Sonny, do you know more about the above comment?
> 
> I think right now the file gets re-opened every time, but the mode
> where the file is opened once and repeatedly read is interesting
> because it avoids having to open the file again and again.
> 
> I guess you could end up with a wierd situation where you don't read
> the entire contents of the file in open call to read() and you might
> get inconsistent data across the different statistics?

If the file is read in two chunks, totmaps_proc_show is only called
once. The patch specifies seq_read as read handler. Have a look at its
definition. As long as you don't read from the same seq file in
parallel or seek around in it, simple sequential reads will not
re-invoke the show() method for data that has already been formatted.
For partially consumed data, the kernel buffers the rest until someone
reads it or seeks to another offset.

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 17:37       ` Jann Horn
@ 2016-08-10 17:45         ` Sonny Rao
  2016-08-10 18:05           ` Jann Horn
  0 siblings, 1 reply; 24+ messages in thread
From: Sonny Rao @ 2016-08-10 17:45 UTC (permalink / raw)
  To: Jann Horn
  Cc: Robert Foss, Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov,
	John Stultz, Robin Humble, Mateusz Guzik, Alexey Dobriyan,
	Janis Danisevskis, calvinowens, Michal Hocko,
	Konstantin Khlebnikov, Vlastimil Babka, Naoya Horiguchi,
	Kirill A. Shutemov, ldufour, Johannes Weiner, linux-kernel,
	Ben Zhang, Bryan Freed, Filipe Brandenburger

On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn <jann@thejh.net> wrote:
> On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
>> On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss@collabora.com> wrote:
>> >
>> >
>> > On 2016-08-09 03:24 PM, Jann Horn wrote:
>> >>
>> >> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>> >>>
>> >>> From: Sonny Rao <sonnyrao@chromium.org>
>> >>>
>> >>> This is based on earlier work by Thiago Goncales. It implements a new
>> >>> per process proc file which summarizes the contents of the smaps file
>> >>> but doesn't display any addresses.  It gives more detailed information
>> >>> than statm like the PSS (proprotional set size).  It differs from the
>> >>> original implementation in that it doesn't use the full blown set of
>> >>> seq operations, uses a different termination condition, and doesn't
>> >>> displayed "Locked" as that was broken on the original implemenation.
>> >>>
>> >>> This new proc file provides information faster than parsing the
>> >>> potentially
>> >>> huge smaps file.
>> >>>
>> >>> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
>> >>>
>> >>> Tested-by: Robert Foss <robert.foss@collabora.com>
>> >>> Signed-off-by: Robert Foss <robert.foss@collabora.com>
>> >>
>> >>
>> >>
>> >>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>> >>> +{
>> >>> +       struct proc_maps_private *priv = m->private;
>> >>> +       struct mm_struct *mm;
>> >>> +       struct vm_area_struct *vma;
>> >>> +       struct mem_size_stats *mss_sum = priv->mss;
>> >>> +
>> >>> +       /* reference to priv->task already taken */
>> >>> +       /* but need to get the mm here because */
>> >>> +       /* task could be in the process of exiting */
>> >>
>> >>
>> >> Can you please elaborate on this? My understanding here is that you
>> >> intend for the caller to be able to repeatedly read the same totmaps
>> >> file with pread() and still see updated information after the target
>> >> process has called execve() and be able to detect process death
>> >> (instead of simply seeing stale values). Is that accurate?
>> >>
>> >> I would prefer it if you could grab a reference to the mm_struct
>> >> directly at open time.
>> >
>> >
>> > Sonny, do you know more about the above comment?
>>
>> I think right now the file gets re-opened every time, but the mode
>> where the file is opened once and repeatedly read is interesting
>> because it avoids having to open the file again and again.
>>
>> I guess you could end up with a wierd situation where you don't read
>> the entire contents of the file in open call to read() and you might
>> get inconsistent data across the different statistics?
>
> If the file is read in two chunks, totmaps_proc_show is only called
> once. The patch specifies seq_read as read handler. Have a look at its
> definition. As long as you don't read from the same seq file in
> parallel or seek around in it, simple sequential reads will not
> re-invoke the show() method for data that has already been formatted.
> For partially consumed data, the kernel buffers the rest until someone
> reads it or seeks to another offset.

Ok that's good.  If the consumer were using pread() though, would that
look like a seek?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 17:45         ` Sonny Rao
@ 2016-08-10 18:05           ` Jann Horn
  2016-08-12 16:28             ` Robert Foss
  0 siblings, 1 reply; 24+ messages in thread
From: Jann Horn @ 2016-08-10 18:05 UTC (permalink / raw)
  To: Sonny Rao
  Cc: Robert Foss, Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov,
	John Stultz, Robin Humble, Mateusz Guzik, Alexey Dobriyan,
	Janis Danisevskis, calvinowens, Michal Hocko,
	Konstantin Khlebnikov, Vlastimil Babka, Naoya Horiguchi,
	Kirill A. Shutemov, ldufour, Johannes Weiner, linux-kernel,
	Ben Zhang, Bryan Freed, Filipe Brandenburger

[-- Attachment #1: Type: text/plain, Size: 3889 bytes --]

On Wed, Aug 10, 2016 at 10:45:51AM -0700, Sonny Rao wrote:
> On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn <jann@thejh.net> wrote:
> > On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
> >> On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss@collabora.com> wrote:
> >> >
> >> >
> >> > On 2016-08-09 03:24 PM, Jann Horn wrote:
> >> >>
> >> >> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> >> >>>
> >> >>> From: Sonny Rao <sonnyrao@chromium.org>
> >> >>>
> >> >>> This is based on earlier work by Thiago Goncales. It implements a new
> >> >>> per process proc file which summarizes the contents of the smaps file
> >> >>> but doesn't display any addresses.  It gives more detailed information
> >> >>> than statm like the PSS (proprotional set size).  It differs from the
> >> >>> original implementation in that it doesn't use the full blown set of
> >> >>> seq operations, uses a different termination condition, and doesn't
> >> >>> displayed "Locked" as that was broken on the original implemenation.
> >> >>>
> >> >>> This new proc file provides information faster than parsing the
> >> >>> potentially
> >> >>> huge smaps file.
> >> >>>
> >> >>> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
> >> >>>
> >> >>> Tested-by: Robert Foss <robert.foss@collabora.com>
> >> >>> Signed-off-by: Robert Foss <robert.foss@collabora.com>
> >> >>
> >> >>
> >> >>
> >> >>> +static int totmaps_proc_show(struct seq_file *m, void *data)
> >> >>> +{
> >> >>> +       struct proc_maps_private *priv = m->private;
> >> >>> +       struct mm_struct *mm;
> >> >>> +       struct vm_area_struct *vma;
> >> >>> +       struct mem_size_stats *mss_sum = priv->mss;
> >> >>> +
> >> >>> +       /* reference to priv->task already taken */
> >> >>> +       /* but need to get the mm here because */
> >> >>> +       /* task could be in the process of exiting */
> >> >>
> >> >>
> >> >> Can you please elaborate on this? My understanding here is that you
> >> >> intend for the caller to be able to repeatedly read the same totmaps
> >> >> file with pread() and still see updated information after the target
> >> >> process has called execve() and be able to detect process death
> >> >> (instead of simply seeing stale values). Is that accurate?
> >> >>
> >> >> I would prefer it if you could grab a reference to the mm_struct
> >> >> directly at open time.
> >> >
> >> >
> >> > Sonny, do you know more about the above comment?
> >>
> >> I think right now the file gets re-opened every time, but the mode
> >> where the file is opened once and repeatedly read is interesting
> >> because it avoids having to open the file again and again.
> >>
> >> I guess you could end up with a wierd situation where you don't read
> >> the entire contents of the file in open call to read() and you might
> >> get inconsistent data across the different statistics?
> >
> > If the file is read in two chunks, totmaps_proc_show is only called
> > once. The patch specifies seq_read as read handler. Have a look at its
> > definition. As long as you don't read from the same seq file in
> > parallel or seek around in it, simple sequential reads will not
> > re-invoke the show() method for data that has already been formatted.
> > For partially consumed data, the kernel buffers the rest until someone
> > reads it or seeks to another offset.
> 
> Ok that's good.  If the consumer were using pread() though, would that
> look like a seek?

Only if the consumer uses pread() with an offset that is not the same as
the end offset of the previous read.

So if you tried to use the same file from multiple threads in parallel,
you might still have issues, but as long as you don't do that, it should
be fine.

I guess it might make sense to document this behavior somewhere - maybe
the proc.5 manpage?

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
  2016-08-10 18:05           ` Jann Horn
@ 2016-08-12 16:28             ` Robert Foss
  2016-08-13 12:39                 ` Jann Horn
  0 siblings, 1 reply; 24+ messages in thread
From: Robert Foss @ 2016-08-12 16:28 UTC (permalink / raw)
  To: Jann Horn, Sonny Rao
  Cc: Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov, John Stultz,
	Robin Humble, Mateusz Guzik, Alexey Dobriyan, Janis Danisevskis,
	calvinowens, Michal Hocko, Konstantin Khlebnikov,
	Vlastimil Babka, Naoya Horiguchi, Kirill A. Shutemov, ldufour,
	Johannes Weiner, linux-kernel, Ben Zhang, Bryan Freed,
	Filipe Brandenburger



On 2016-08-10 02:05 PM, Jann Horn wrote:
> On Wed, Aug 10, 2016 at 10:45:51AM -0700, Sonny Rao wrote:
>> On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn <jann@thejh.net> wrote:
>>> On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
>>>> On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss@collabora.com> wrote:
>>>>>
>>>>>
>>>>> On 2016-08-09 03:24 PM, Jann Horn wrote:
>>>>>>
>>>>>> On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
>>>>>>>
>>>>>>> From: Sonny Rao <sonnyrao@chromium.org>
>>>>>>>
>>>>>>> This is based on earlier work by Thiago Goncales. It implements a new
>>>>>>> per process proc file which summarizes the contents of the smaps file
>>>>>>> but doesn't display any addresses.  It gives more detailed information
>>>>>>> than statm like the PSS (proportional set size).  It differs from the
>>>>>>> original implementation in that it doesn't use the full blown set of
>>>>>>> seq operations, uses a different termination condition, and doesn't
>>>>>>> display "Locked" as that was broken on the original implementation.
>>>>>>>
>>>>>>> This new proc file provides information faster than parsing the
>>>>>>> potentially
>>>>>>> huge smaps file.
>>>>>>>
>>>>>>> Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
>>>>>>>
>>>>>>> Tested-by: Robert Foss <robert.foss@collabora.com>
>>>>>>> Signed-off-by: Robert Foss <robert.foss@collabora.com>
>>>>>>
>>>>>>
>>>>>>
>>>>>>> +static int totmaps_proc_show(struct seq_file *m, void *data)
>>>>>>> +{
>>>>>>> +       struct proc_maps_private *priv = m->private;
>>>>>>> +       struct mm_struct *mm;
>>>>>>> +       struct vm_area_struct *vma;
>>>>>>> +       struct mem_size_stats *mss_sum = priv->mss;
>>>>>>> +
>>>>>>> +       /* reference to priv->task already taken */
>>>>>>> +       /* but need to get the mm here because */
>>>>>>> +       /* task could be in the process of exiting */
>>>>>>
>>>>>>
>>>>>> Can you please elaborate on this? My understanding here is that you
>>>>>> intend for the caller to be able to repeatedly read the same totmaps
>>>>>> file with pread() and still see updated information after the target
>>>>>> process has called execve() and be able to detect process death
>>>>>> (instead of simply seeing stale values). Is that accurate?
>>>>>>
>>>>>> I would prefer it if you could grab a reference to the mm_struct
>>>>>> directly at open time.
>>>>>
>>>>>
>>>>> Sonny, do you know more about the above comment?
>>>>
>>>> I think right now the file gets re-opened every time, but the mode
>>>> where the file is opened once and repeatedly read is interesting
>>>> because it avoids having to open the file again and again.
>>>>
>>>> I guess you could end up with a weird situation where you don't read
>>>> the entire contents of the file in one call to read() and you might
>>>> get inconsistent data across the different statistics?
>>>
>>> If the file is read in two chunks, totmaps_proc_show is only called
>>> once. The patch specifies seq_read as read handler. Have a look at its
>>> definition. As long as you don't read from the same seq file in
>>> parallel or seek around in it, simple sequential reads will not
>>> re-invoke the show() method for data that has already been formatted.
>>> For partially consumed data, the kernel buffers the rest until someone
>>> reads it or seeks to another offset.
>>
>> Ok that's good.  If the consumer were using pread() though, would that
>> look like a seek?
>
> Only if the consumer uses pread() with an offset that is not the same as
> the end offset of the previous read.
>
> So if you tried to use the same file from multiple threads in parallel,
> you might still have issues, but as long as you don't do that, it should
> be fine.
>
> I guess it might make sense to document this behavior somewhere - maybe
> the proc.5 manpage?
>

I'll add a note about limitations for parallel read. The overall 
documentation for this feature should live in the proc.5 manpage as well?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
@ 2016-08-13 12:39                 ` Jann Horn
  0 siblings, 0 replies; 24+ messages in thread
From: Jann Horn @ 2016-08-13 12:39 UTC (permalink / raw)
  To: Robert Foss
  Cc: Sonny Rao, Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov,
	John Stultz, Robin Humble, Mateusz Guzik, Alexey Dobriyan,
	Janis Danisevskis, calvinowens, Michal Hocko,
	Konstantin Khlebnikov, Vlastimil Babka, Naoya Horiguchi,
	Kirill A. Shutemov, ldufour, Johannes Weiner, linux-kernel,
	Ben Zhang, Bryan Freed, Filipe Brandenburger, linux-api

[-- Attachment #1: Type: text/plain, Size: 4288 bytes --]

On Fri, Aug 12, 2016 at 12:28:11PM -0400, Robert Foss wrote:
> 
> 
> On 2016-08-10 02:05 PM, Jann Horn wrote:
> >On Wed, Aug 10, 2016 at 10:45:51AM -0700, Sonny Rao wrote:
> >>On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn <jann@thejh.net> wrote:
> >>>On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
> >>>>On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss@collabora.com> wrote:
> >>>>>
> >>>>>
> >>>>>On 2016-08-09 03:24 PM, Jann Horn wrote:
> >>>>>>
> >>>>>>On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss@collabora.com wrote:
> >>>>>>>
> >>>>>>>From: Sonny Rao <sonnyrao@chromium.org>
> >>>>>>>
> >>>>>>>This is based on earlier work by Thiago Goncales. It implements a new
> >>>>>>>per process proc file which summarizes the contents of the smaps file
> >>>>>>>but doesn't display any addresses.  It gives more detailed information
> >>>>>>>than statm like the PSS (proportional set size).  It differs from the
> >>>>>>>original implementation in that it doesn't use the full blown set of
> >>>>>>>seq operations, uses a different termination condition, and doesn't
> >>>>>>>display "Locked" as that was broken on the original implementation.
> >>>>>>>
> >>>>>>>This new proc file provides information faster than parsing the
> >>>>>>>potentially
> >>>>>>>huge smaps file.
> >>>>>>>
> >>>>>>>Signed-off-by: Sonny Rao <sonnyrao@chromium.org>
> >>>>>>>
> >>>>>>>Tested-by: Robert Foss <robert.foss@collabora.com>
> >>>>>>>Signed-off-by: Robert Foss <robert.foss@collabora.com>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>>+static int totmaps_proc_show(struct seq_file *m, void *data)
> >>>>>>>+{
> >>>>>>>+       struct proc_maps_private *priv = m->private;
> >>>>>>>+       struct mm_struct *mm;
> >>>>>>>+       struct vm_area_struct *vma;
> >>>>>>>+       struct mem_size_stats *mss_sum = priv->mss;
> >>>>>>>+
> >>>>>>>+       /* reference to priv->task already taken */
> >>>>>>>+       /* but need to get the mm here because */
> >>>>>>>+       /* task could be in the process of exiting */
> >>>>>>
> >>>>>>
> >>>>>>Can you please elaborate on this? My understanding here is that you
> >>>>>>intend for the caller to be able to repeatedly read the same totmaps
> >>>>>>file with pread() and still see updated information after the target
> >>>>>>process has called execve() and be able to detect process death
> >>>>>>(instead of simply seeing stale values). Is that accurate?
> >>>>>>
> >>>>>>I would prefer it if you could grab a reference to the mm_struct
> >>>>>>directly at open time.
> >>>>>
> >>>>>
> >>>>>Sonny, do you know more about the above comment?
> >>>>
> >>>>I think right now the file gets re-opened every time, but the mode
> >>>>where the file is opened once and repeatedly read is interesting
> >>>>because it avoids having to open the file again and again.
> >>>>
> >>>>I guess you could end up with a weird situation where you don't read
> >>>>the entire contents of the file in one call to read() and you might
> >>>>get inconsistent data across the different statistics?
> >>>
> >>>If the file is read in two chunks, totmaps_proc_show is only called
> >>>once. The patch specifies seq_read as read handler. Have a look at its
> >>>definition. As long as you don't read from the same seq file in
> >>>parallel or seek around in it, simple sequential reads will not
> >>>re-invoke the show() method for data that has already been formatted.
> >>>For partially consumed data, the kernel buffers the rest until someone
> >>>reads it or seeks to another offset.
> >>
> >>Ok that's good.  If the consumer were using pread() though, would that
> >>look like a seek?
> >
> >Only if the consumer uses pread() with an offset that is not the same as
> >the end offset of the previous read.
> >
> >So if you tried to use the same file from multiple threads in parallel,
> >you might still have issues, but as long as you don't do that, it should
> >be fine.
> >
> >I guess it might make sense to document this behavior somewhere - maybe
> >the proc.5 manpage?
> >
> 
> I'll add a note about limitations for parallel read. The overall
> documentation for this feature should live in the proc.5 manpage as well?

Yes, I think so.

+Cc linux-api@vger.kernel.org

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps
@ 2016-08-13 12:39                 ` Jann Horn
  0 siblings, 0 replies; 24+ messages in thread
From: Jann Horn @ 2016-08-13 12:39 UTC (permalink / raw)
  To: Robert Foss
  Cc: Sonny Rao, Andrew Morton, Kees Cook, Al Viro, Cyrill Gorcunov,
	John Stultz, Robin Humble, Mateusz Guzik, Alexey Dobriyan,
	Janis Danisevskis, calvinowens-b10kYP2dOMg, Michal Hocko,
	Konstantin Khlebnikov, Vlastimil Babka, Naoya Horiguchi,
	Kirill A. Shutemov, ldufour-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	Johannes Weiner, linux-kernel-u79uwXL29TY76Z2rM5mHXA, Ben Zhang,
	Bryan Freed, Filipe Brandenburger,
	linux-api-u79uwXL29TZUIDd8j+nm9g

[-- Attachment #1: Type: text/plain, Size: 4501 bytes --]

On Fri, Aug 12, 2016 at 12:28:11PM -0400, Robert Foss wrote:
> 
> 
> On 2016-08-10 02:05 PM, Jann Horn wrote:
> >On Wed, Aug 10, 2016 at 10:45:51AM -0700, Sonny Rao wrote:
> >>On Wed, Aug 10, 2016 at 10:37 AM, Jann Horn <jann-XZ1E9jl8jIdeoWH0uzbU5w@public.gmane.org> wrote:
> >>>On Wed, Aug 10, 2016 at 10:23:53AM -0700, Sonny Rao wrote:
> >>>>On Tue, Aug 9, 2016 at 2:01 PM, Robert Foss <robert.foss-ZGY8ohtN/8rSCDK34cm6iQ@public.gmane.orgm> wrote:
> >>>>>
> >>>>>
> >>>>>On 2016-08-09 03:24 PM, Jann Horn wrote:
> >>>>>>
> >>>>>>On Tue, Aug 09, 2016 at 12:05:43PM -0400, robert.foss-ZGY8ohtN/8qB+jHODAdFcQ@public.gmane.org wrote:
> >>>>>>>
> >>>>>>>From: Sonny Rao <sonnyrao-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> >>>>>>>
> >>>>>>>This is based on earlier work by Thiago Goncales. It implements a new
> >>>>>>>per process proc file which summarizes the contents of the smaps file
> >>>>>>>but doesn't display any addresses.  It gives more detailed information
> >>>>>>>than statm like the PSS (proportional set size).  It differs from the
> >>>>>>>original implementation in that it doesn't use the full blown set of
> >>>>>>>seq operations, uses a different termination condition, and doesn't
> >>>>>>>display "Locked" as that was broken on the original implementation.
> >>>>>>>
> >>>>>>>This new proc file provides information faster than parsing the
> >>>>>>>potentially
> >>>>>>>huge smaps file.
> >>>>>>>
> >>>>>>>Signed-off-by: Sonny Rao <sonnyrao-F7+t8E8rja9g9hUCZPvPmw@public.gmane.org>
> >>>>>>>
> >>>>>>>Tested-by: Robert Foss <robert.foss-ZGY8ohtN/8qB+jHODAdFcQ@public.gmane.org>
> >>>>>>>Signed-off-by: Robert Foss <robert.foss-ZGY8ohtN/8qB+jHODAdFcQ@public.gmane.org>
> >>>>>>
> >>>>>>
> >>>>>>
> >>>>>>>+static int totmaps_proc_show(struct seq_file *m, void *data)
> >>>>>>>+{
> >>>>>>>+       struct proc_maps_private *priv = m->private;
> >>>>>>>+       struct mm_struct *mm;
> >>>>>>>+       struct vm_area_struct *vma;
> >>>>>>>+       struct mem_size_stats *mss_sum = priv->mss;
> >>>>>>>+
> >>>>>>>+       /* reference to priv->task already taken */
> >>>>>>>+       /* but need to get the mm here because */
> >>>>>>>+       /* task could be in the process of exiting */
> >>>>>>
> >>>>>>
> >>>>>>Can you please elaborate on this? My understanding here is that you
> >>>>>>intend for the caller to be able to repeatedly read the same totmaps
> >>>>>>file with pread() and still see updated information after the target
> >>>>>>process has called execve() and be able to detect process death
> >>>>>>(instead of simply seeing stale values). Is that accurate?
> >>>>>>
> >>>>>>I would prefer it if you could grab a reference to the mm_struct
> >>>>>>directly at open time.
> >>>>>
> >>>>>
> >>>>>Sonny, do you know more about the above comment?
> >>>>
> >>>>I think right now the file gets re-opened every time, but the mode
> >>>>where the file is opened once and repeatedly read is interesting
> >>>>because it avoids having to open the file again and again.
> >>>>
> >>>>I guess you could end up with a weird situation where you don't read
> >>>>the entire contents of the file in one call to read() and you might
> >>>>get inconsistent data across the different statistics?
> >>>
> >>>If the file is read in two chunks, totmaps_proc_show is only called
> >>>once. The patch specifies seq_read as read handler. Have a look at its
> >>>definition. As long as you don't read from the same seq file in
> >>>parallel or seek around in it, simple sequential reads will not
> >>>re-invoke the show() method for data that has already been formatted.
> >>>For partially consumed data, the kernel buffers the rest until someone
> >>>reads it or seeks to another offset.
> >>
> >>Ok that's good.  If the consumer were using pread() though, would that
> >>look like a seek?
> >
> >Only if the consumer uses pread() with an offset that is not the same as
> >the end offset of the previous read.
> >
> >So if you tried to use the same file from multiple threads in parallel,
> >you might still have issues, but as long as you don't do that, it should
> >be fine.
> >
> >I guess it might make sense to document this behavior somewhere - maybe
> >the proc.5 manpage?
> >
> 
> I'll add a note about limitations for parallel read. The overall
> documentation for this feature should live in the proc.5 manpage as well?

Yes, I think so.

+Cc linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2016-08-13 12:40 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-08-09 16:05 [PACTH v1] mm, proc: Implement /proc/<pid>/totmaps robert.foss
2016-08-09 16:29 ` Mateusz Guzik
2016-08-09 16:56   ` Sonny Rao
2016-08-09 20:17   ` Robert Foss
2016-08-10 15:39     ` Robert Foss
2016-08-10 15:42       ` Mateusz Guzik
2016-08-10 15:50         ` Robert Foss
2016-08-09 16:58 ` Alexey Dobriyan
2016-08-09 18:28   ` Sonny Rao
2016-08-09 19:16 ` Konstantin Khlebnikov
2016-08-10  0:30   ` Sonny Rao
2016-08-09 19:24 ` Jann Horn
2016-08-09 21:01   ` Robert Foss
2016-08-09 22:30     ` Jann Horn
2016-08-10 14:16       ` Robert Foss
2016-08-10 15:02         ` Jann Horn
2016-08-10 16:24           ` Robert Foss
2016-08-10 17:23     ` Sonny Rao
2016-08-10 17:37       ` Jann Horn
2016-08-10 17:45         ` Sonny Rao
2016-08-10 18:05           ` Jann Horn
2016-08-12 16:28             ` Robert Foss
2016-08-13 12:39               ` Jann Horn
2016-08-13 12:39                 ` Jann Horn

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.