All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
@ 2010-08-11 20:13 ` Helge Deller
  0 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-11 20:13 UTC (permalink / raw)
  To: linux-mm, linux-kernel

The kernel currently provides no functionality to analyze the RSS
and swap space usage of each individual sysvipc shared memory segment.

This patch adds this info for each existing shm segment by extending
the output of /proc/sysvipc/shm by two columns for RSS and swap.

Since shmctl(SHM_INFO) already provides a similar calculation (it
currently sums up all RSS/swap info for all segments), I did split
out a static function which is now used by the /proc/sysvipc/shm 
output and shmctl(SHM_INFO).

Tested on x86-64 and parisc (32- and 64bit). 

Signed-off-by: Helge Deller <deller@gmx.de>

---

 shm.c |   63 ++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 21 deletions(-)


diff --git a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -108,7 +108,11 @@ void __init shm_init (void)
 {
 	shm_init_ns(&init_ipc_ns);
 	ipc_init_proc_interface("sysvipc/shm",
-				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
+#if BITS_PER_LONG <= 32
+				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        RSS       swap\n",
+#else
+				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   RSS                  swap\n",
+#endif
 				IPC_SHM_IDS, sysvipc_shm_proc_show);
 }
 
@@ -542,6 +546,34 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf
 }
 
 /*
+ * Calculate and add used RSS and swap pages of a shm.
+ * Called with shm_ids.rw_mutex held as a reader
+ */
+static void shm_add_rss_swap(struct shmid_kernel *shp,
+	unsigned long *rss_add, unsigned long *swp_add)
+{
+	struct inode *inode;
+
+	inode = shp->shm_file->f_path.dentry->d_inode;
+
+	if (is_file_hugepages(shp->shm_file)) {
+		struct address_space *mapping = inode->i_mapping;
+		struct hstate *h = hstate_file(shp->shm_file);
+		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
+	} else {
+#ifdef CONFIG_SHMEM
+		struct shmem_inode_info *info = SHMEM_I(inode);
+		spin_lock(&info->lock);
+		*rss_add += inode->i_mapping->nrpages;
+		*swp_add += info->swapped;
+		spin_unlock(&info->lock);
+#else
+		*rss_add += inode->i_mapping->nrpages;
+#endif
+	}
+}
+
+/*
  * Called with shm_ids.rw_mutex held as a reader
  */
 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
@@ -558,30 +590,13 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
 	for (total = 0, next_id = 0; total < in_use; next_id++) {
 		struct kern_ipc_perm *ipc;
 		struct shmid_kernel *shp;
-		struct inode *inode;
 
 		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
 		if (ipc == NULL)
 			continue;
 		shp = container_of(ipc, struct shmid_kernel, shm_perm);
 
-		inode = shp->shm_file->f_path.dentry->d_inode;
-
-		if (is_file_hugepages(shp->shm_file)) {
-			struct address_space *mapping = inode->i_mapping;
-			struct hstate *h = hstate_file(shp->shm_file);
-			*rss += pages_per_huge_page(h) * mapping->nrpages;
-		} else {
-#ifdef CONFIG_SHMEM
-			struct shmem_inode_info *info = SHMEM_I(inode);
-			spin_lock(&info->lock);
-			*rss += inode->i_mapping->nrpages;
-			*swp += info->swapped;
-			spin_unlock(&info->lock);
-#else
-			*rss += inode->i_mapping->nrpages;
-#endif
-		}
+		shm_add_rss_swap(shp, rss, swp);
 
 		total++;
 	}
@@ -1070,6 +1085,9 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct shmid_kernel *shp = it;
+	unsigned long rss = 0, swp = 0;
+
+	shm_add_rss_swap(shp, &rss, &swp);
 
 #if BITS_PER_LONG <= 32
 #define SIZE_SPEC "%10lu"
@@ -1079,7 +1097,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 
 	return seq_printf(s,
 			  "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
-			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu\n",
+			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+			  SIZE_SPEC " " SIZE_SPEC "\n",
 			  shp->shm_perm.key,
 			  shp->shm_perm.id,
 			  shp->shm_perm.mode,
@@ -1093,6 +1112,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 			  shp->shm_perm.cgid,
 			  shp->shm_atim,
 			  shp->shm_dtim,
-			  shp->shm_ctim);
+			  shp->shm_ctim,
+			  rss * PAGE_SIZE,
+			  swp * PAGE_SIZE);
 }
 #endif

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
@ 2010-08-11 20:13 ` Helge Deller
  0 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-11 20:13 UTC (permalink / raw)
  To: linux-mm, linux-kernel

The kernel currently provides no functionality to analyze the RSS
and swap space usage of each individual sysvipc shared memory segment.

This patch adds this info for each existing shm segment by extending
the output of /proc/sysvipc/shm by two columns for RSS and swap.

Since shmctl(SHM_INFO) already provides a similar calculation (it
currently sums up all RSS/swap info for all segments), I did split
out a static function which is now used by the /proc/sysvipc/shm 
output and shmctl(SHM_INFO).

Tested on x86-64 and parisc (32- and 64bit). 

Signed-off-by: Helge Deller <deller@gmx.de>

---

 shm.c |   63 ++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 21 deletions(-)


diff --git a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -108,7 +108,11 @@ void __init shm_init (void)
 {
 	shm_init_ns(&init_ipc_ns);
 	ipc_init_proc_interface("sysvipc/shm",
-				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
+#if BITS_PER_LONG <= 32
+				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        RSS       swap\n",
+#else
+				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   RSS                  swap\n",
+#endif
 				IPC_SHM_IDS, sysvipc_shm_proc_show);
 }
 
@@ -542,6 +546,34 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf
 }
 
 /*
+ * Calculate and add used RSS and swap pages of a shm.
+ * Called with shm_ids.rw_mutex held as a reader
+ */
+static void shm_add_rss_swap(struct shmid_kernel *shp,
+	unsigned long *rss_add, unsigned long *swp_add)
+{
+	struct inode *inode;
+
+	inode = shp->shm_file->f_path.dentry->d_inode;
+
+	if (is_file_hugepages(shp->shm_file)) {
+		struct address_space *mapping = inode->i_mapping;
+		struct hstate *h = hstate_file(shp->shm_file);
+		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
+	} else {
+#ifdef CONFIG_SHMEM
+		struct shmem_inode_info *info = SHMEM_I(inode);
+		spin_lock(&info->lock);
+		*rss_add += inode->i_mapping->nrpages;
+		*swp_add += info->swapped;
+		spin_unlock(&info->lock);
+#else
+		*rss_add += inode->i_mapping->nrpages;
+#endif
+	}
+}
+
+/*
  * Called with shm_ids.rw_mutex held as a reader
  */
 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
@@ -558,30 +590,13 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
 	for (total = 0, next_id = 0; total < in_use; next_id++) {
 		struct kern_ipc_perm *ipc;
 		struct shmid_kernel *shp;
-		struct inode *inode;
 
 		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
 		if (ipc == NULL)
 			continue;
 		shp = container_of(ipc, struct shmid_kernel, shm_perm);
 
-		inode = shp->shm_file->f_path.dentry->d_inode;
-
-		if (is_file_hugepages(shp->shm_file)) {
-			struct address_space *mapping = inode->i_mapping;
-			struct hstate *h = hstate_file(shp->shm_file);
-			*rss += pages_per_huge_page(h) * mapping->nrpages;
-		} else {
-#ifdef CONFIG_SHMEM
-			struct shmem_inode_info *info = SHMEM_I(inode);
-			spin_lock(&info->lock);
-			*rss += inode->i_mapping->nrpages;
-			*swp += info->swapped;
-			spin_unlock(&info->lock);
-#else
-			*rss += inode->i_mapping->nrpages;
-#endif
-		}
+		shm_add_rss_swap(shp, rss, swp);
 
 		total++;
 	}
@@ -1070,6 +1085,9 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct shmid_kernel *shp = it;
+	unsigned long rss = 0, swp = 0;
+
+	shm_add_rss_swap(shp, &rss, &swp);
 
 #if BITS_PER_LONG <= 32
 #define SIZE_SPEC "%10lu"
@@ -1079,7 +1097,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 
 	return seq_printf(s,
 			  "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
-			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu\n",
+			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+			  SIZE_SPEC " " SIZE_SPEC "\n",
 			  shp->shm_perm.key,
 			  shp->shm_perm.id,
 			  shp->shm_perm.mode,
@@ -1093,6 +1112,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 			  shp->shm_perm.cgid,
 			  shp->shm_atim,
 			  shp->shm_dtim,
-			  shp->shm_ctim);
+			  shp->shm_ctim,
+			  rss * PAGE_SIZE,
+			  swp * PAGE_SIZE);
 }
 #endif

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
  2010-08-11 20:13 ` Helge Deller
@ 2010-08-12 20:10   ` Andrew Morton
  -1 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2010-08-12 20:10 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-mm, linux-kernel, Hugh Dickins, Manfred Spraul

On Wed, 11 Aug 2010 22:13:45 +0200
Helge Deller <deller@gmx.de> wrote:

> The kernel currently provides no functionality to analyze the RSS
> and swap space usage of each individual sysvipc shared memory segment.
> 
> This patch add this info for each existing shm segment by extending
> the output of /proc/sysvipc/shm by two columns for RSS and swap.
> 
> Since shmctl(SHM_INFO) already provides a similiar calculation (it
> currently sums up all RSS/swap info for all segments), I did split
> out a static function which is now used by the /proc/sysvipc/shm 
> output and shmctl(SHM_INFO).
> 

I suppose that could be useful, although it would be most interesting
to hear why _you_ consider it useful?

But is it useful enough to risk breaking existing code which parses
that file?  The risk is not great, but it's there.

> 
> ---
> 
>  shm.c |   63 ++++++++++++++++++++++++++++++++++++++++++---------------------
>  1 file changed, 42 insertions(+), 21 deletions(-)
> 
> 
> diff --git a/ipc/shm.c b/ipc/shm.c
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -108,7 +108,11 @@ void __init shm_init (void)
>  {
>  	shm_init_ns(&init_ipc_ns);
>  	ipc_init_proc_interface("sysvipc/shm",
> -				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
> +#if BITS_PER_LONG <= 32
> +				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        RSS       swap\n",
> +#else
> +				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   RSS                  swap\n",

This adds 11 new spaces between "perms" and "size", only on 64-bit
machines.  That was unchangelogged and adds another (smaller) risk of
breaking things.  Please explain.

This interface is really old and crufty and horrid, but I guess that
there's not a lot we can do about that :(


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
@ 2010-08-12 20:10   ` Andrew Morton
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2010-08-12 20:10 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-mm, linux-kernel, Hugh Dickins, Manfred Spraul

On Wed, 11 Aug 2010 22:13:45 +0200
Helge Deller <deller@gmx.de> wrote:

> The kernel currently provides no functionality to analyze the RSS
> and swap space usage of each individual sysvipc shared memory segment.
> 
> This patch add this info for each existing shm segment by extending
> the output of /proc/sysvipc/shm by two columns for RSS and swap.
> 
> Since shmctl(SHM_INFO) already provides a similiar calculation (it
> currently sums up all RSS/swap info for all segments), I did split
> out a static function which is now used by the /proc/sysvipc/shm 
> output and shmctl(SHM_INFO).
> 

I suppose that could be useful, although it would be most interesting
to hear why _you_ consider it useful?

But is it useful enough to risk breaking existing code which parses
that file?  The risk is not great, but it's there.

> 
> ---
> 
>  shm.c |   63 ++++++++++++++++++++++++++++++++++++++++++---------------------
>  1 file changed, 42 insertions(+), 21 deletions(-)
> 
> 
> diff --git a/ipc/shm.c b/ipc/shm.c
> --- a/ipc/shm.c
> +++ b/ipc/shm.c
> @@ -108,7 +108,11 @@ void __init shm_init (void)
>  {
>  	shm_init_ns(&init_ipc_ns);
>  	ipc_init_proc_interface("sysvipc/shm",
> -				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
> +#if BITS_PER_LONG <= 32
> +				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        RSS       swap\n",
> +#else
> +				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   RSS                  swap\n",

This adds 11 new spaces between "perms" and "size", only on 64-bit
machines.  That was unchangelogged and adds another (smaller) risk of
breaking things.  Please explain.

This interface is really old and crufty and horrid, but I guess that
there's not a lot we can do about that :(

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
  2010-08-12 20:10   ` Andrew Morton
@ 2010-08-12 21:33     ` Helge Deller
  -1 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-12 21:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-kernel, Hugh Dickins, Manfred Spraul

On 08/12/2010 10:10 PM, Andrew Morton wrote:
> On Wed, 11 Aug 2010 22:13:45 +0200
> Helge Deller<deller@gmx.de>  wrote:
>
>> The kernel currently provides no functionality to analyze the RSS
>> and swap space usage of each individual sysvipc shared memory segment.
>>
>> This patch add this info for each existing shm segment by extending
>> the output of /proc/sysvipc/shm by two columns for RSS and swap.
>>
>> Since shmctl(SHM_INFO) already provides a similiar calculation (it
>> currently sums up all RSS/swap info for all segments), I did split
>> out a static function which is now used by the /proc/sysvipc/shm
>> output and shmctl(SHM_INFO).
>>
>
> I suppose that could be useful, although it would be most interesting
> to hear why _you_ consider it useful?

A reasonable question, and I really should have explained this when I 
sent this patch.

In my job I do work for SAP in the SAP LinuxLab 
(http://www.sap.com/linux) and take care of the SAP ERP enterprise 
software on Linux.
SAP products (esp. the SAP Netweaver ABAP Kernel) use lots of big 
shared memory segments (we often have Linux systems with >= 16GB shm 
usage). Sometimes we get customer reports about "slow" system responses 
and while looking into their configurations we often find massive 
swapping activity on the system. With this patch it's now easy to see 
from the command line if and which shm segments gets swapped out (and 
how much) and can more easily give recommendations for system tuning.
Without the patch it's currently not possible to do such shm analysis at 
all.

So, my patch actually does fix a real-world problem.

By the way - I found another bug/issue in /proc/<pid>/smaps as well. The 
kernel currently does not add swapped-out shm pages to the swap size 
value correctly. The swap size value always stays zero for shm pages. 
I'm currently preparing a small patch to fix that, which I will send to 
linux-mm for review soon.

> But is it useful enough to risk breaking existing code which parses
> that file?  The risk is not great, but it's there.

Sure. The only positive argument is maybe, that I added the new info to 
the end of the lines. IMHO existing applications which parse /proc files 
should always take into account, that more text could follow with newer 
Linux kernels...?

>> ---
>>
>>   shm.c |   63 ++++++++++++++++++++++++++++++++++++++++++---------------------
>>   1 file changed, 42 insertions(+), 21 deletions(-)
>>
>>
>> diff --git a/ipc/shm.c b/ipc/shm.c
>> --- a/ipc/shm.c
>> +++ b/ipc/shm.c
>> @@ -108,7 +108,11 @@ void __init shm_init (void)
>>   {
>>   	shm_init_ns(&init_ipc_ns);
>>   	ipc_init_proc_interface("sysvipc/shm",
>> -				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
>> +#if BITS_PER_LONG<= 32
>> +				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        RSS       swap\n",
>> +#else
>> +				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   RSS                  swap\n",
>
> This adds 11 new spaces between "perms" and "size", only on 64-bit
> machines.  That was unchangelogged and adds another (smaller) risk of
> breaking things.  Please explain.

Yes, I added some spaces in front of the "size" field for 64bit 
kernels to get the columns correct if you cat the contents of the file.
In sysvipc_shm_proc_show() the kernel prints the size value in 
"SIZE_SPEC" format, which is defined like this:

#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

So, if the header is not adjusted, the columns are not correctly 
aligned. I actually tested this on 32- and 64-bit and it seems correct now.

> This interface is really old and crufty and horrid, but I guess that
> there's not a lot we can do about that :(

Helge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
@ 2010-08-12 21:33     ` Helge Deller
  0 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-12 21:33 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-mm, linux-kernel, Hugh Dickins, Manfred Spraul

On 08/12/2010 10:10 PM, Andrew Morton wrote:
> On Wed, 11 Aug 2010 22:13:45 +0200
> Helge Deller<deller@gmx.de>  wrote:
>
>> The kernel currently provides no functionality to analyze the RSS
>> and swap space usage of each individual sysvipc shared memory segment.
>>
>> This patch add this info for each existing shm segment by extending
>> the output of /proc/sysvipc/shm by two columns for RSS and swap.
>>
>> Since shmctl(SHM_INFO) already provides a similiar calculation (it
>> currently sums up all RSS/swap info for all segments), I did split
>> out a static function which is now used by the /proc/sysvipc/shm
>> output and shmctl(SHM_INFO).
>>
>
> I suppose that could be useful, although it would be most interesting
> to hear why _you_ consider it useful?

A reasonable question, and I really should have explained this when I 
sent this patch.

In my job I do work for SAP in the SAP LinuxLab 
(http://www.sap.com/linux) and take care of the SAP ERP enterprise 
software on Linux.
SAP products (esp. the SAP Netweaver ABAP Kernel) use lots of big 
shared memory segments (we often have Linux systems with >= 16GB shm 
usage). Sometimes we get customer reports about "slow" system responses 
and while looking into their configurations we often find massive 
swapping activity on the system. With this patch it's now easy to see 
from the command line if and which shm segments get swapped out (and 
how much) and can more easily give recommendations for system tuning.
Without the patch it's currently not possible to do such shm analysis at 
all.

So, my patch actually does fix a real-world problem.

By the way - I found another bug/issue in /proc/<pid>/smaps as well. The 
kernel currently does not add swapped-out shm pages to the swap size 
value correctly. The swap size value always stays zero for shm pages. 
I'm currently preparing a small patch to fix that, which I will send to 
linux-mm for review soon.

> But is it useful enough to risk breaking existing code which parses
> that file?  The risk is not great, but it's there.

Sure. The only positive argument is maybe, that I added the new info to 
the end of the lines. IMHO existing applications which parse /proc files 
should always take into account, that more text could follow with newer 
Linux kernels...?

>> ---
>>
>>   shm.c |   63 ++++++++++++++++++++++++++++++++++++++++++---------------------
>>   1 file changed, 42 insertions(+), 21 deletions(-)
>>
>>
>> diff --git a/ipc/shm.c b/ipc/shm.c
>> --- a/ipc/shm.c
>> +++ b/ipc/shm.c
>> @@ -108,7 +108,11 @@ void __init shm_init (void)
>>   {
>>   	shm_init_ns(&init_ipc_ns);
>>   	ipc_init_proc_interface("sysvipc/shm",
>> -				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
>> +#if BITS_PER_LONG<= 32
>> +				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        RSS       swap\n",
>> +#else
>> +				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   RSS                  swap\n",
>
> This adds 11 new spaces between "perms" and "size", only on 64-bit
> machines.  That was unchangelogged and adds another (smaller) risk of
> breaking things.  Please explain.

Yes, I added some spaces in front of the "size" field for 64bit 
kernels to get the columns correct if you cat the contents of the file.
In sysvipc_shm_proc_show() the kernel prints the size value in 
"SIZE_SPEC" format, which is defined like this:

#if BITS_PER_LONG <= 32
#define SIZE_SPEC "%10lu"
#else
#define SIZE_SPEC "%21lu"
#endif

So, if the header is not adjusted, the columns are not correctly 
aligned. I actually tested this on 32- and 64-bit and it seems correct now.

> This interface is really old and crufty and horrid, but I guess that
> there's not a lot we can do about that :(

Helge

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
  2010-08-12 21:33     ` Helge Deller
@ 2010-08-12 21:44       ` Andrew Morton
  -1 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2010-08-12 21:44 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-mm, linux-kernel, Hugh Dickins, Manfred Spraul

On Thu, 12 Aug 2010 23:33:29 +0200
Helge Deller <deller@gmx.de> wrote:

> On 08/12/2010 10:10 PM, Andrew Morton wrote:
> > On Wed, 11 Aug 2010 22:13:45 +0200
> > Helge Deller<deller@gmx.de>  wrote:
> >
> >> The kernel currently provides no functionality to analyze the RSS
> >> and swap space usage of each individual sysvipc shared memory segment.
> >>
> >> This patch add this info for each existing shm segment by extending
> >> the output of /proc/sysvipc/shm by two columns for RSS and swap.
> >>
> >> Since shmctl(SHM_INFO) already provides a similiar calculation (it
> >> currently sums up all RSS/swap info for all segments), I did split
> >> out a static function which is now used by the /proc/sysvipc/shm
> >> output and shmctl(SHM_INFO).
> >>
> >
> > I suppose that could be useful, although it would be most interesting
> > to hear why _you_ consider it useful?
> 
> A reasonable question, and I really should have explained when I did 
> send this patch.
> 
> In my job I do work for SAP in the SAP LinuxLab 
> (http://www.sap.com/linux) and take care of the SAP ERP enterprise 
> software on Linux.
> SAP products (esp. the SAP Netweaver ABAP Kernel) uses lots of big 
> shared memory segments (we often have Linux systems with >= 16GB shm 
> usage). Sometimes we get customer reports about "slow" system responses 
> and while looking into their configurations we often find massive 
> swapping activity on the system. With this patch it's now easy to see 
> from the command line if and which shm segments gets swapped out (and 
> how much) and can more easily give recommendations for system tuning.
> Without the patch it's currently not possible to do such shm analysis at 
> all.

OK, thanks.  copied-n-pasted into changelog ;)

> So, my patch actually does fix a real-world problem.
> 
> By the way - I found another bug/issue in /proc/<pid>/smaps as well. The 
> kernel currently does not adds swapped-out shm pages to the swap size 
> value correctly. The swap size value always stays zero for shm pages. 
> I'm currently preparing a small patch to fix that, which I will send to 
> linux-mm for review soon.
> 
> > But is it useful enough to risk breaking existing code which parses
> > that file?  The risk is not great, but it's there.
> 
> Sure. The only positive argument is maybe, that I added the new info to 
> the end of the lines. IMHO existing applications which parse /proc files 
> should always take into account, that more text could follow with newer 
> Linux kernels...?

Yeah, they'd be pretty dumb if they failed because new columns appear
in later kernels.

But there's some pretty dumb code out there.

> > This adds 11 new spaces between "perms" and "size", only on 64-bit
> > machines.  That was unchangelogged and adds another (smaller) risk of
> > breaking things.  Please explain.
> 
> Yes, I did added some spaces in front of the "size" field for 64bit 
> kernels to get the columns correct if you cat the contents of the file.
> In sysvipc_shm_proc_show() the kernel prints the size value in 
> "SPEC_SIZE" format, which is defined like this:
> 
> #if BITS_PER_LONG <= 32
> #define SIZE_SPEC "%10lu"
> #else
> #define SIZE_SPEC "%21lu"
> #endif
> 
> So, if the header is not adjusted, the columns are not correctly 
> aligned. I actually tested this on 32- and 64-bit and it seems correct now.

<copy, paste>



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
@ 2010-08-12 21:44       ` Andrew Morton
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Morton @ 2010-08-12 21:44 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-mm, linux-kernel, Hugh Dickins, Manfred Spraul

On Thu, 12 Aug 2010 23:33:29 +0200
Helge Deller <deller@gmx.de> wrote:

> On 08/12/2010 10:10 PM, Andrew Morton wrote:
> > On Wed, 11 Aug 2010 22:13:45 +0200
> > Helge Deller<deller@gmx.de>  wrote:
> >
> >> The kernel currently provides no functionality to analyze the RSS
> >> and swap space usage of each individual sysvipc shared memory segment.
> >>
> >> This patch add this info for each existing shm segment by extending
> >> the output of /proc/sysvipc/shm by two columns for RSS and swap.
> >>
> >> Since shmctl(SHM_INFO) already provides a similiar calculation (it
> >> currently sums up all RSS/swap info for all segments), I did split
> >> out a static function which is now used by the /proc/sysvipc/shm
> >> output and shmctl(SHM_INFO).
> >>
> >
> > I suppose that could be useful, although it would be most interesting
> > to hear why _you_ consider it useful?
> 
> A reasonable question, and I really should have explained when I did 
> send this patch.
> 
> In my job I do work for SAP in the SAP LinuxLab 
> (http://www.sap.com/linux) and take care of the SAP ERP enterprise 
> software on Linux.
> SAP products (esp. the SAP Netweaver ABAP Kernel) uses lots of big 
> shared memory segments (we often have Linux systems with >= 16GB shm 
> usage). Sometimes we get customer reports about "slow" system responses 
> and while looking into their configurations we often find massive 
> swapping activity on the system. With this patch it's now easy to see 
> from the command line if and which shm segments gets swapped out (and 
> how much) and can more easily give recommendations for system tuning.
> Without the patch it's currently not possible to do such shm analysis at 
> all.

OK, thanks.  copied-n-pasted into changelog ;)

> So, my patch actually does fix a real-world problem.
> 
> By the way - I found another bug/issue in /proc/<pid>/smaps as well. The 
> kernel currently does not adds swapped-out shm pages to the swap size 
> value correctly. The swap size value always stays zero for shm pages. 
> I'm currently preparing a small patch to fix that, which I will send to 
> linux-mm for review soon.
> 
> > But is it useful enough to risk breaking existing code which parses
> > that file?  The risk is not great, but it's there.
> 
> Sure. The only positive argument is maybe, that I added the new info to 
> the end of the lines. IMHO existing applications which parse /proc files 
> should always take into account, that more text could follow with newer 
> Linux kernels...?

Yeah, they'd be pretty dumb if they failed because new columns appear
in later kernels.

But there's some pretty dumb code out there.

> > This adds 11 new spaces between "perms" and "size", only on 64-bit
> > machines.  That was unchangelogged and adds another (smaller) risk of
> > breaking things.  Please explain.
> 
> Yes, I did added some spaces in front of the "size" field for 64bit 
> kernels to get the columns correct if you cat the contents of the file.
> In sysvipc_shm_proc_show() the kernel prints the size value in 
> "SPEC_SIZE" format, which is defined like this:
> 
> #if BITS_PER_LONG <= 32
> #define SIZE_SPEC "%10lu"
> #else
> #define SIZE_SPEC "%21lu"
> #endif
> 
> So, if the header is not adjusted, the columns are not correctly 
> aligned. I actually tested this on 32- and 64-bit and it seems correct now.

<copy, paste>


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
  2010-08-12 21:33     ` Helge Deller
@ 2010-08-12 22:40       ` Hugh Dickins
  -1 siblings, 0 replies; 18+ messages in thread
From: Hugh Dickins @ 2010-08-12 22:40 UTC (permalink / raw)
  To: Helge Deller; +Cc: Andrew Morton, linux-mm, linux-kernel, Manfred Spraul

On Thu, 12 Aug 2010, Helge Deller wrote:
> On 08/12/2010 10:10 PM, Andrew Morton wrote:
> > On Wed, 11 Aug 2010 22:13:45 +0200
> > Helge Deller<deller@gmx.de>  wrote:
> > 
> > > The kernel currently provides no functionality to analyze the RSS
> > > and swap space usage of each individual sysvipc shared memory segment.
> > > 
> > > This patch adds this info for each existing shm segment by extending
> > > the output of /proc/sysvipc/shm by two columns for RSS and swap.
> > > 
> > > Since shmctl(SHM_INFO) already provides a similar calculation (it
> > > currently sums up all RSS/swap info for all segments), I did split
> > > out a static function which is now used by the /proc/sysvipc/shm
> > > output and shmctl(SHM_INFO).
> > > 
> > 
> > I suppose that could be useful, although it would be most interesting
> > to hear why _you_ consider it useful?
> 
> A reasonable question, and I really should have explained when I did send this
> patch.
> 
> In my job I do work for SAP in the SAP LinuxLab (http://www.sap.com/linux) and
> take care of the SAP ERP enterprise software on Linux.
> SAP products (esp. the SAP Netweaver ABAP Kernel) use lots of big shared
> memory segments (we often have Linux systems with >= 16GB shm usage).
> Sometimes we get customer reports about "slow" system responses and while
> looking into their configurations we often find massive swapping activity on
> the system. With this patch it's now easy to see from the command line if and
> which shm segments get swapped out (and how much) and can more easily give
> recommendations for system tuning.
> Without the patch it's currently not possible to do such shm analysis at all.
> 
> So, my patch actually does fix a real-world problem.

That's good justification, thanks.

> 
> By the way - I found another bug/issue in /proc/<pid>/smaps as well. The
> kernel currently does not add swapped-out shm pages to the swap size value
> correctly. The swap size value always stays zero for shm pages. I'm currently
> preparing a small patch to fix that, which I will send to linux-mm for review
> soon.

I certainly wouldn't call smaps's present behaviour on it a bug: but given
your justification above, I can see that it would be more useful to you,
and probably to others, for it to be changed in the way that you suggest,
to reveal the underlying swap.

Hmm, I wonder what that patch is going to look like...

> 
> > But is it useful enough to risk breaking existing code which parses
> > that file?  The risk is not great, but it's there.
> 
> Sure. The only positive argument is maybe, that I added the new info to the
> end of the lines. IMHO existing applications which parse /proc files should
> always take into account, that more text could follow with newer Linux
> kernels...?

I hope so too.  And agree you're right to correct the 64-bit header
alignment, and to show the new fields in bytes rather than pages.
But one little thing in your patch upsets me greatly...

> 
> > > ---
> > > 
> > >   shm.c |   63
> > > ++++++++++++++++++++++++++++++++++++++++++---------------------
> > >   1 file changed, 42 insertions(+), 21 deletions(-)
> > > 
> > > 
> > > diff --git a/ipc/shm.c b/ipc/shm.c
> > > --- a/ipc/shm.c
> > > +++ b/ipc/shm.c
> > > @@ -108,7 +108,11 @@ void __init shm_init (void)
> > >   {
> > >   	shm_init_ns(&init_ipc_ns);
> > >   	ipc_init_proc_interface("sysvipc/shm",
> > > -				"       key      shmid perms       size  cpid
> > > lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
> > > +#if BITS_PER_LONG<= 32
> > > +				"       key      shmid perms       size  cpid
> > > lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime
> > > RSS       swap\n",
> > > +#else
> > > +				"       key      shmid perms
> > > size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime
> > > ctime                   RSS                  swap\n",

... why oh why do you write "RSS" in uppercase, when every other field
is named in lowercase?  Please change that to "rss" and then

Acked-by: Hugh Dickins <hughd@google.com>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm
@ 2010-08-12 22:40       ` Hugh Dickins
  0 siblings, 0 replies; 18+ messages in thread
From: Hugh Dickins @ 2010-08-12 22:40 UTC (permalink / raw)
  To: Helge Deller; +Cc: Andrew Morton, linux-mm, linux-kernel, Manfred Spraul

On Thu, 12 Aug 2010, Helge Deller wrote:
> On 08/12/2010 10:10 PM, Andrew Morton wrote:
> > On Wed, 11 Aug 2010 22:13:45 +0200
> > Helge Deller<deller@gmx.de>  wrote:
> > 
> > > The kernel currently provides no functionality to analyze the RSS
> > > and swap space usage of each individual sysvipc shared memory segment.
> > > 
> > > This patch adds this info for each existing shm segment by extending
> > > the output of /proc/sysvipc/shm by two columns for RSS and swap.
> > > 
> > > Since shmctl(SHM_INFO) already provides a similar calculation (it
> > > currently sums up all RSS/swap info for all segments), I did split
> > > out a static function which is now used by the /proc/sysvipc/shm
> > > output and shmctl(SHM_INFO).
> > > 
> > 
> > I suppose that could be useful, although it would be most interesting
> > to hear why _you_ consider it useful?
> 
> A reasonable question, and I really should have explained when I did send this
> patch.
> 
> In my job I do work for SAP in the SAP LinuxLab (http://www.sap.com/linux) and
> take care of the SAP ERP enterprise software on Linux.
> SAP products (esp. the SAP Netweaver ABAP Kernel) use lots of big shared
> memory segments (we often have Linux systems with >= 16GB shm usage).
> Sometimes we get customer reports about "slow" system responses and while
> looking into their configurations we often find massive swapping activity on
> the system. With this patch it's now easy to see from the command line if and
> which shm segments get swapped out (and how much) and can more easily give
> recommendations for system tuning.
> Without the patch it's currently not possible to do such shm analysis at all.
> 
> So, my patch actually does fix a real-world problem.

That's good justification, thanks.

> 
> By the way - I found another bug/issue in /proc/<pid>/smaps as well. The
> kernel currently does not add swapped-out shm pages to the swap size value
> correctly. The swap size value always stays zero for shm pages. I'm currently
> preparing a small patch to fix that, which I will send to linux-mm for review
> soon.

I certainly wouldn't call smaps's present behaviour on it a bug: but given
your justification above, I can see that it would be more useful to you,
and probably to others, for it to be changed in the way that you suggest,
to reveal the underlying swap.

Hmm, I wonder what that patch is going to look like...

> 
> > But is it useful enough to risk breaking existing code which parses
> > that file?  The risk is not great, but it's there.
> 
> Sure. The only positive argument is maybe, that I added the new info to the
> end of the lines. IMHO existing applications which parse /proc files should
> always take into account, that more text could follow with newer Linux
> kernels...?

I hope so too.  And agree you're right to correct the 64-bit header
alignment, and to show the new fields in bytes rather than pages.
But one little thing in your patch upsets me greatly...

> 
> > > ---
> > > 
> > >   shm.c |   63
> > > ++++++++++++++++++++++++++++++++++++++++++---------------------
> > >   1 file changed, 42 insertions(+), 21 deletions(-)
> > > 
> > > 
> > > diff --git a/ipc/shm.c b/ipc/shm.c
> > > --- a/ipc/shm.c
> > > +++ b/ipc/shm.c
> > > @@ -108,7 +108,11 @@ void __init shm_init (void)
> > >   {
> > >   	shm_init_ns(&init_ipc_ns);
> > >   	ipc_init_proc_interface("sysvipc/shm",
> > > -				"       key      shmid perms       size  cpid
> > > lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
> > > +#if BITS_PER_LONG<= 32
> > > +				"       key      shmid perms       size  cpid
> > > lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime
> > > RSS       swap\n",
> > > +#else
> > > +				"       key      shmid perms
> > > size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime
> > > ctime                   RSS                  swap\n",

... why oh why do you write "RSS" in uppercase, when every other field
is named in lowercase?  Please change that to "rss" and then

Acked-by: Hugh Dickins <hughd@google.com>

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm (v2)
  2010-08-12 22:40       ` Hugh Dickins
@ 2010-08-13 14:59         ` Helge Deller
  -1 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-13 14:59 UTC (permalink / raw)
  To: Hugh Dickins, linux-mm, linux-kernel, manfred, akpm; +Cc: Alexander.Hass

The kernel currently provides no functionality to analyze the RSS
and swap space usage of each individual sysvipc shared memory segment.

This patch adds this info for each existing shm segment by extending
the output of /proc/sysvipc/shm by two columns for RSS and swap.

This additional info about RSS/swap usage of shm segments is very
useful for runtime analysis of the kernel. In my case, at work I'm taking
care of a large ERP enterprise software, which uses lots of big
shared memory segments (we often have Linux systems with >= 16GB shm
usage). Sometimes we get customer reports about "slow" system responses
and while looking into their configurations we often find massive
swapping activity on the system. With this patch it's now easy to see
from the command line if and which shm segments get swapped out (and
how much) and we can more easily give recommendations for system tuning.
Without the patch it's currently not possible to do such shm analysis at
all.

Since shmctl(SHM_INFO) already provides a similar calculation (it
currently sums up all RSS/swap info for all segments), I did split
out a static function which is now used by the /proc/sysvipc/shm 
output and shmctl(SHM_INFO).
Additionally, the header line of /proc/sysvipc/shm was changed so
that all columns are adjusted correctly (the width of size field 
is 10 chars for 32bit and 21 chars for 64bit kernels).

Tested on x86-64 and parisc (32- and 64bit). 

Signed-off-by: Helge Deller <deller@gmx.de>
Acked-by: Hugh Dickins <hughd@google.com>

---

 shm.c |   63
++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 21 deletions(-)


diff --git a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -108,7 +108,11 @@ void __init shm_init (void)
 {
 	shm_init_ns(&init_ipc_ns);
 	ipc_init_proc_interface("sysvipc/shm",
-				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
+#if BITS_PER_LONG <= 32
+				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
+#else
+				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
+#endif
 				IPC_SHM_IDS, sysvipc_shm_proc_show);
 }
 
@@ -542,6 +546,34 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf
 }
 
 /*
+ * Calculate and add used RSS and swap pages of a shm.
+ * Called with shm_ids.rw_mutex held as a reader
+ */
+static void shm_add_rss_swap(struct shmid_kernel *shp,
+	unsigned long *rss_add, unsigned long *swp_add)
+{
+	struct inode *inode;
+
+	inode = shp->shm_file->f_path.dentry->d_inode;
+
+	if (is_file_hugepages(shp->shm_file)) {
+		struct address_space *mapping = inode->i_mapping;
+		struct hstate *h = hstate_file(shp->shm_file);
+		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
+	} else {
+#ifdef CONFIG_SHMEM
+		struct shmem_inode_info *info = SHMEM_I(inode);
+		spin_lock(&info->lock);
+		*rss_add += inode->i_mapping->nrpages;
+		*swp_add += info->swapped;
+		spin_unlock(&info->lock);
+#else
+		*rss_add += inode->i_mapping->nrpages;
+#endif
+	}
+}
+
+/*
  * Called with shm_ids.rw_mutex held as a reader
  */
 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
@@ -558,30 +590,13 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
 	for (total = 0, next_id = 0; total < in_use; next_id++) {
 		struct kern_ipc_perm *ipc;
 		struct shmid_kernel *shp;
-		struct inode *inode;
 
 		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
 		if (ipc == NULL)
 			continue;
 		shp = container_of(ipc, struct shmid_kernel, shm_perm);
 
-		inode = shp->shm_file->f_path.dentry->d_inode;
-
-		if (is_file_hugepages(shp->shm_file)) {
-			struct address_space *mapping = inode->i_mapping;
-			struct hstate *h = hstate_file(shp->shm_file);
-			*rss += pages_per_huge_page(h) * mapping->nrpages;
-		} else {
-#ifdef CONFIG_SHMEM
-			struct shmem_inode_info *info = SHMEM_I(inode);
-			spin_lock(&info->lock);
-			*rss += inode->i_mapping->nrpages;
-			*swp += info->swapped;
-			spin_unlock(&info->lock);
-#else
-			*rss += inode->i_mapping->nrpages;
-#endif
-		}
+		shm_add_rss_swap(shp, rss, swp);
 
 		total++;
 	}
@@ -1070,6 +1085,9 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct shmid_kernel *shp = it;
+	unsigned long rss = 0, swp = 0;
+
+	shm_add_rss_swap(shp, &rss, &swp);
 
 #if BITS_PER_LONG <= 32
 #define SIZE_SPEC "%10lu"
@@ -1079,7 +1097,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 
 	return seq_printf(s,
 			  "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
-			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu\n",
+			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+			  SIZE_SPEC " " SIZE_SPEC "\n",
 			  shp->shm_perm.key,
 			  shp->shm_perm.id,
 			  shp->shm_perm.mode,
@@ -1093,6 +1112,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 			  shp->shm_perm.cgid,
 			  shp->shm_atim,
 			  shp->shm_dtim,
-			  shp->shm_ctim);
+			  shp->shm_ctim,
+			  rss * PAGE_SIZE,
+			  swp * PAGE_SIZE);
 }
 #endif

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm (v2)
@ 2010-08-13 14:59         ` Helge Deller
  0 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-13 14:59 UTC (permalink / raw)
  To: Hugh Dickins, linux-mm, linux-kernel, manfred, akpm; +Cc: Alexander.Hass

The kernel currently provides no functionality to analyze the RSS
and swap space usage of each individual sysvipc shared memory segment.

This patch adds this info for each existing shm segment by extending
the output of /proc/sysvipc/shm by two columns for RSS and swap.

This additional info about RSS/swap usage of shm segments is very
useful for runtime analysis of the kernel. In my case, at work I'm taking
care of a large ERP enterprise software, which uses lots of big
shared memory segments (we often have Linux systems with >= 16GB shm
usage). Sometimes we get customer reports about "slow" system responses
and while looking into their configurations we often find massive
swapping activity on the system. With this patch it's now easy to see
from the command line if and which shm segments get swapped out (and
how much) and we can more easily give recommendations for system tuning.
Without the patch it's currently not possible to do such shm analysis at
all.

Since shmctl(SHM_INFO) already provides a similar calculation (it
currently sums up all RSS/swap info for all segments), I did split
out a static function which is now used by the /proc/sysvipc/shm 
output and shmctl(SHM_INFO).
Additionally, the header line of /proc/sysvipc/shm was changed so
that all columns are adjusted correctly (the width of size field 
is 10 chars for 32bit and 21 chars for 64bit kernels).

Tested on x86-64 and parisc (32- and 64bit). 

Signed-off-by: Helge Deller <deller@gmx.de>
Acked-by: Hugh Dickins <hughd@google.com>

---

 shm.c |   63
++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 21 deletions(-)


diff --git a/ipc/shm.c b/ipc/shm.c
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -108,7 +108,11 @@ void __init shm_init (void)
 {
 	shm_init_ns(&init_ipc_ns);
 	ipc_init_proc_interface("sysvipc/shm",
-				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime\n",
+#if BITS_PER_LONG <= 32
+				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
+#else
+				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
+#endif
 				IPC_SHM_IDS, sysvipc_shm_proc_show);
 }
 
@@ -542,6 +546,34 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf
 }
 
 /*
+ * Calculate and add used RSS and swap pages of a shm.
+ * Called with shm_ids.rw_mutex held as a reader
+ */
+static void shm_add_rss_swap(struct shmid_kernel *shp,
+	unsigned long *rss_add, unsigned long *swp_add)
+{
+	struct inode *inode;
+
+	inode = shp->shm_file->f_path.dentry->d_inode;
+
+	if (is_file_hugepages(shp->shm_file)) {
+		struct address_space *mapping = inode->i_mapping;
+		struct hstate *h = hstate_file(shp->shm_file);
+		*rss_add += pages_per_huge_page(h) * mapping->nrpages;
+	} else {
+#ifdef CONFIG_SHMEM
+		struct shmem_inode_info *info = SHMEM_I(inode);
+		spin_lock(&info->lock);
+		*rss_add += inode->i_mapping->nrpages;
+		*swp_add += info->swapped;
+		spin_unlock(&info->lock);
+#else
+		*rss_add += inode->i_mapping->nrpages;
+#endif
+	}
+}
+
+/*
  * Called with shm_ids.rw_mutex held as a reader
  */
 static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
@@ -558,30 +590,13 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss,
 	for (total = 0, next_id = 0; total < in_use; next_id++) {
 		struct kern_ipc_perm *ipc;
 		struct shmid_kernel *shp;
-		struct inode *inode;
 
 		ipc = idr_find(&shm_ids(ns).ipcs_idr, next_id);
 		if (ipc == NULL)
 			continue;
 		shp = container_of(ipc, struct shmid_kernel, shm_perm);
 
-		inode = shp->shm_file->f_path.dentry->d_inode;
-
-		if (is_file_hugepages(shp->shm_file)) {
-			struct address_space *mapping = inode->i_mapping;
-			struct hstate *h = hstate_file(shp->shm_file);
-			*rss += pages_per_huge_page(h) * mapping->nrpages;
-		} else {
-#ifdef CONFIG_SHMEM
-			struct shmem_inode_info *info = SHMEM_I(inode);
-			spin_lock(&info->lock);
-			*rss += inode->i_mapping->nrpages;
-			*swp += info->swapped;
-			spin_unlock(&info->lock);
-#else
-			*rss += inode->i_mapping->nrpages;
-#endif
-		}
+		shm_add_rss_swap(shp, rss, swp);
 
 		total++;
 	}
@@ -1070,6 +1085,9 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
 static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 {
 	struct shmid_kernel *shp = it;
+	unsigned long rss = 0, swp = 0;
+
+	shm_add_rss_swap(shp, &rss, &swp);
 
 #if BITS_PER_LONG <= 32
 #define SIZE_SPEC "%10lu"
@@ -1079,7 +1097,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 
 	return seq_printf(s,
 			  "%10d %10d  %4o " SIZE_SPEC " %5u %5u  "
-			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu\n",
+			  "%5lu %5u %5u %5u %5u %10lu %10lu %10lu "
+			  SIZE_SPEC " " SIZE_SPEC "\n",
 			  shp->shm_perm.key,
 			  shp->shm_perm.id,
 			  shp->shm_perm.mode,
@@ -1093,6 +1112,8 @@ static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
 			  shp->shm_perm.cgid,
 			  shp->shm_atim,
 			  shp->shm_dtim,
-			  shp->shm_ctim);
+			  shp->shm_ctim,
+			  rss * PAGE_SIZE,
+			  swp * PAGE_SIZE);
 }
 #endif

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps
  2010-08-12 22:40       ` Hugh Dickins
@ 2010-08-13 19:52         ` Helge Deller
  -1 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-13 19:52 UTC (permalink / raw)
  To: Hugh Dickins, linux-mm
  Cc: Helge Deller, Andrew Morton, linux-kernel, Manfred Spraul

> > By the way - I found another bug/issue in /proc/<pid>/smaps as well. The
> > kernel currently does not add swapped-out shm pages to the swap size value
> > correctly. The swap size value always stays zero for shm pages. I'm currently
> > preparing a small patch to fix that, which I will send to linux-mm for review
> > soon.
> 
> I certainly wouldn't call smaps's present behaviour on it a bug: but given
> your justification above, I can see that it would be more useful to you,
> and probably to others, for it to be changed in the way that you suggest,
> to reveal the underlying swap.
> 
> Hmm, I wonder what that patch is going to look like...

:-)

I tried quite hard to implement rss/swap accounting for shm segments inside
smaps_pte_range() which is a callback function of walk_page_range() in
show_smap().

> Given the fact that I'm no linux-mm expert, I might have overlooked other
possibilities, but my experiments inside smaps_pte_range() were not
very successful:
From my tests, a swapped-out shm segment 
	- fails on the "is_swap_pte()" test, and
	- succeeds on the "!pte_present()" test (since it's swapped
	  out).
So, here would it be possible to add such accounting for swap, but how
can I then see that this pte is 
	a) belonging to a shm segment?, and
	b) see if this page/pte was really swapped out and not just not
yet written to at all?
As answers I found:
	a) (vma->vm_flags & VM_MAYSHARE) is true for shm segments (is
		this check sufficient?)
	b) no idea.

But if I add this page to the mss.swap entry, all pages including such 
which haven't been touched yet at all are suddenly counted as
swapped-out...?

Any hints here would be great...


As an alternative solution, I created the following patch.
This one works nicely, but it's just a fix-up of the mss.resident and
mss.swap values after walk_page_range() was called.
It's mostly a copy of the shm_add_rss_swap() function from 
my previous patch (http://marc.info/?l=linux-mm&m=128171161101817&w=2).
Do you think such a fix-up-afterwards-approach is acceptable at all?
If yes, a new patch on top of my ipc/shm.c patch would be easy (and
small).

Comments?

Helge



diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index aea1d3f..a1ef7c9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -9,6 +9,7 @@
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -392,6 +393,31 @@ static int show_smap(struct seq_file *m, void *v)
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
+#ifdef CONFIG_SYSVIPC
+	/* sysvipc shm segments and hugepages are counted wrong in
+	 * walk_page_range(). Fix it up.
+	 */
+	if (vma->vm_file && (vma->vm_flags & VM_MAYSHARE)) {
+		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		if (is_file_hugepages(vma->vm_file)) {
+			struct address_space *mapping = inode->i_mapping;
+			struct hstate *h = hstate_file(vma->vm_file);
+			mss.resident = pages_per_huge_page(h) * 
+				mapping->nrpages * PAGE_SIZE;
+		} else {
+#ifdef CONFIG_SHMEM
+			struct shmem_inode_info *info = SHMEM_I(inode);
+			spin_lock(&info->lock);
+			mss.resident = inode->i_mapping->nrpages * PAGE_SIZE;
+			mss.swap = info->swapped * PAGE_SIZE;
+			spin_unlock(&info->lock);
+#else
+			mss.resident = inode->i_mapping->nrpages * PAGE_SIZE;
+#endif
+		}
+	}
+#endif
+	
 	show_map_vma(m, vma);
 
 	seq_printf(m,

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps
@ 2010-08-13 19:52         ` Helge Deller
  0 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-13 19:52 UTC (permalink / raw)
  To: Hugh Dickins, linux-mm
  Cc: Helge Deller, Andrew Morton, linux-kernel, Manfred Spraul

> > By the way - I found another bug/issue in /proc/<pid>/smaps as well. The
> > kernel currently does not add swapped-out shm pages to the swap size value
> > correctly. The swap size value always stays zero for shm pages. I'm currently
> > preparing a small patch to fix that, which I will send to linux-mm for review
> > soon.
> 
> I certainly wouldn't call smaps's present behaviour on it a bug: but given
> your justification above, I can see that it would be more useful to you,
> and probably to others, for it to be changed in the way that you suggest,
> to reveal the underlying swap.
> 
> Hmm, I wonder what that patch is going to look like...

:-)

I tried quite hard to implement rss/swap accounting for shm segments inside
smaps_pte_range() which is a callback function of walk_page_range() in
show_smap().

> Given the fact that I'm no linux-mm expert, I might have overlooked other
possibilities, but my experiments inside smaps_pte_range() were not
very successful:
From my tests, a swapped-out shm segment 
	- fails on the "is_swap_pte()" test, and
	- succeeds on the "!pte_present()" test (since it's swapped
	  out).
So, here would it be possible to add such accounting for swap, but how
can I then see that this pte is 
	a) belonging to a shm segment?, and
	b) see if this page/pte was really swapped out and not just not
yet written to at all?
As answers I found:
	a) (vma->vm_flags & VM_MAYSHARE) is true for shm segments (is
		this check sufficient?)
	b) no idea.

But if I add this page to the mss.swap entry, all pages including such 
which haven't been touched yet at all are suddenly counted as
swapped-out...?

Any hints here would be great...


As an alternative solution, I created the following patch.
This one works nicely, but it's just a fix-up of the mss.resident and
mss.swap values after walk_page_range() was called.
It's mostly a copy of the shm_add_rss_swap() function from 
my previous patch (http://marc.info/?l=linux-mm&m=128171161101817&w=2).
Do you think such a fix-up-afterwards-approach is acceptable at all?
If yes, a new patch on top of my ipc/shm.c patch would be easy (and
small).

Comments?

Helge



diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index aea1d3f..a1ef7c9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -9,6 +9,7 @@
 #include <linux/mempolicy.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -392,6 +393,31 @@ static int show_smap(struct seq_file *m, void *v)
 	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
 		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
 
+#ifdef CONFIG_SYSVIPC
+	/* sysvipc shm segments and hugepages are counted wrong in
+	 * walk_page_range(). Fix it up.
+	 */
+	if (vma->vm_file && (vma->vm_flags & VM_MAYSHARE)) {
+		struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+		if (is_file_hugepages(vma->vm_file)) {
+			struct address_space *mapping = inode->i_mapping;
+			struct hstate *h = hstate_file(vma->vm_file);
+			mss.resident = pages_per_huge_page(h) * 
+				mapping->nrpages * PAGE_SIZE;
+		} else {
+#ifdef CONFIG_SHMEM
+			struct shmem_inode_info *info = SHMEM_I(inode);
+			spin_lock(&info->lock);
+			mss.resident = inode->i_mapping->nrpages * PAGE_SIZE;
+			mss.swap = info->swapped * PAGE_SIZE;
+			spin_unlock(&info->lock);
+#else
+			mss.resident = inode->i_mapping->nrpages * PAGE_SIZE;
+#endif
+		}
+	}
+#endif
+	
 	show_map_vma(m, vma);
 
 	seq_printf(m,

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps
  2010-08-13 19:52         ` Helge Deller
@ 2010-08-13 22:45           ` Hugh Dickins
  -1 siblings, 0 replies; 18+ messages in thread
From: Hugh Dickins @ 2010-08-13 22:45 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-mm, Andrew Morton, linux-kernel, Manfred Spraul

On Fri, 13 Aug 2010, Helge Deller wrote:
> 
> I tried quite hard to implement rss/swap accounting for shm segments inside
> smaps_pte_range() which is a callback function of walk_page_range() in
> show_smap().

Sorry, I think the short answer will be that you should give up on this:
reasons below.

> 
> Given the fact that I'm no linux-mm expert, I might have overseen other
> possibilities, but my experiments inside smaps_pte_range() were not
> very successful:
> From my tests, a swapped-out shm segment 
> 	- fails on the "is_swap_pte()" test, and
> 	- succeeds on the "!pte_present()" test (since it's swapped
> 	  out).

Yes.

> So, here would it be possible to add such accounting for swap, but how
> can I then see that this pte is 
> 	a) belonging to a shm segment?, and
> 	b) see if this page/pte was really swapped out and not just not
> yet written to at all?

You would have to add a function in mm/shmem.c to do this: it would
need to check vma->vm_file to work out if this vma belongs to it,
and use shmem_swp_alloc() to check if the page there is on swap.  OTOH
I'm not sure if you could call it while holding page table lock or not.

> As answers I found:
> 	a) (vma->vm_flags & VM_MAYSHARE) is true for shm segments (is
> 		this check sufficient?)

No, VM_MAYSHARE is set on many other kinds of mapping too; and is not
set on all mappings of shmem objects - there is no good reason to
include SysV shm segments here, yet omit other kinds of shmem object
(/dev/shm POSIX shared memory, shared-anonymous mappings, mappings of
tmpfs files).

> 	b) no idea.
> 
> But if I add this page to the mss.swap entry, all pages including such 
> which haven't been touched yet at all are suddenly counted as
> swapped-out...?
> 
> Any hints here would be great...
> 
> 
> As an alternative solution, I created the following patch.
> This one works nicely, but it's just a fix-up of the mss.resident and
> mss.swap values after walk_page_range() was called.
> It's mostly a copy of the shm_add_rss_swap() function from 
> my previous patch (http://marc.info/?l=linux-mm&m=128171161101817&w=2).
> Do you think such a fix-up-afterwards-approach is acceptable at all?
> If yes, a new patch on top of my ipc/shm.c patch would be easy (and
> small).

Not acceptable, I'm afraid.  Nothing wrong with a fix-up-afterwards
approach as such, but it's assuming that the vma covers the full extent
of the shmem object.  That is very often the case, but by no means
necessarily so (whereas it is always the case that one vma cannot cover
more than one object).  So you do have to count pageslot by pageslot.

There are two reasons why I think you have to abandon this.  One is
that /proc/<pid>/smaps is reporting on the userspace mappings, saying
where swap is instanced in them.  Some of those mappings may be of
shmem objects, and some of those shmem objects may use swap backing
themselves, but that's different from the mapping using swap directly.

One can argue about that distinction, but it is how all this is
designed, and blurring that distinction tends to get into trouble.
(It's reasonable to think of anonymous mappings as mappings of anon
objects, which just happen to find room for the swp_entry in the page
table: but then it's a happy accident that smaps can see them.)

The second reason is that since 2.6.34, /proc/<pid>/status shows
VmSwap: we would not want a huge discrepancy between what it shows
in swap and what /proc/<pid>/smaps shows in swap, but nor would we
want to make /proc/<pid>/status scan through page tables enquiring
of shmem.

All this stands in contrast to your /proc/sysvipc/shm patch, which
is rightly dealing with one class of shmem object, not via mappings
of those objects.

There is a case for a "where has my swap gone" tool, which examines
the different kinds of object involved (anonymous mappings as well
as shmem objects), and shows them all somehow.  But that's a lot
more work than just extending an existing stats display.

Hugh

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps
@ 2010-08-13 22:45           ` Hugh Dickins
  0 siblings, 0 replies; 18+ messages in thread
From: Hugh Dickins @ 2010-08-13 22:45 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-mm, Andrew Morton, linux-kernel, Manfred Spraul

On Fri, 13 Aug 2010, Helge Deller wrote:
> 
> I tried quite hard to implement rss/swap accounting for shm segments inside
> smaps_pte_range() which is a callback function of walk_page_range() in
> show_smap().

Sorry, I think the short answer will be that you should give up on this:
reasons below.

> 
> Given the fact that I'm no linux-mm expert, I might have overlooked other
> possibilities, but my experiments inside smaps_pte_range() were not
> very successful:
> From my tests, a swapped-out shm segment 
> 	- fails on the "is_swap_pte()" test, and
> 	- succeeds on the "!pte_present()" test (since it's swapped
> 	  out).

Yes.

> So, here would it be possible to add such accounting for swap, but how
> can I then see that this pte is 
> 	a) belonging to a shm segment?, and
> 	b) see if this page/pte was really swapped out and not just not
> yet written to at all?

You would have to add a function in mm/shmem.c to do this: it would
need to check vma->vm_file to work out if this vma belongs to it,
and use shmem_swp_alloc() to check if the page there is on swap.  OTOH
I'm not sure if you could call it while holding page table lock or not.

> As answers I found:
> 	a) (vma->vm_flags & VM_MAYSHARE) is true for shm segments (is
> 		this check sufficient?)

No, VM_MAYSHARE is set on many other kinds of mapping too; and is not
set on all mappings of shmem objects - there is no good reason to
include SysV shm segments here, yet omit other kinds of shmem object
(/dev/shm POSIX shared memory, shared-anonymous mappings, mappings of
tmpfs files).

> 	b) no idea.
> 
> But if I add this page to the mss.swap entry, all pages including such 
> which haven't been touched yet at all are suddenly counted as
> swapped-out...?
> 
> Any hints here would be great...
> 
> 
> As an alternative solution, I created the following patch.
> This one works nicely, but it's just a fix-up of the mss.resident and
> mss.swap values after walk_page_range() was called.
> It's mostly a copy of the shm_add_rss_swap() function from 
> my previous patch (http://marc.info/?l=linux-mm&m=128171161101817&w=2).
> Do you think such a fix-up-afterwards-approach is acceptable at all?
> If yes, a new patch on top of my ipc/shm.c patch would be easy (and
> small).

Not acceptable, I'm afraid.  Nothing wrong with a fix-up-afterwards
approach as such, but it's assuming that the vma covers the full extent
of the shmem object.  That is very often the case, but by no means
necessarily so (whereas it is always the case that one vma cannot cover
more than one object).  So you do have to count pageslot by pageslot.

There are two reasons why I think you have to abandon this.  One is
that /proc/<pid>/smaps is reporting on the userspace mappings, saying
where swap is instanced in them.  Some of those mappings may be of
shmem objects, and some of those shmem objects may use swap backing
themselves, but that's different from the mapping using swap directly.

One can argue about that distinction, but it is how all this is
designed, and blurring that distinction tends to get into trouble.
(It's reasonable to think of anonymous mappings as mappings of anon
objects, which just happen to find room for the swp_entry in the page
table: but then it's a happy accident that smaps can see them.)

The second reason is that since 2.6.34, /proc/<pid>/status shows
VmSwap: we would not want a huge discrepancy between what it shows
in swap and what /proc/<pid>/smaps shows in swap, but nor would we
want to make /proc/<pid>/status scan through page tables enquiring
of shmem.

All this stands in contrast to your /proc/sysvipc/shm patch, which
is rightly dealing with one class of shmem object, not via mappings
of those objects.

There is a case for a "where has my swap gone" tool, which examines
the different kinds of object involved (anonymous mappings as well
as shmem objects), and shows them all somehow.  But that's a lot
more work than just extending an existing stats display.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps
  2010-08-13 22:45           ` Hugh Dickins
@ 2010-08-16 20:21             ` Helge Deller
  -1 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-16 20:21 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-mm, Andrew Morton, linux-kernel, Manfred Spraul

On 08/14/2010 12:45 AM, Hugh Dickins wrote:
> On Fri, 13 Aug 2010, Helge Deller wrote:
>>
>> I tried quite hard to implement rss/swap accounting for shm segments inside
>> smaps_pte_range() which is a callback function of walk_page_range() in
>> show_smap().
>
> Sorry, I think the short answer will be that you should give up on this:
> reasons below.
>
>>
>> Given the fact that I'm no linux-mm expert, I might have overlooked other
>> possibilities, but my experiments inside smaps_pte_range() were not
>> very successful:
>>  From my tests, a swapped-out shm segment
>> 	- fails on the "is_swap_pte()" test, and
>> 	- succeeds on the "!pte_present()" test (since it's swapped
>> 	  out).
>
> Yes.
>
>> So, here would it be possible to add such accounting for swap, but how
>> can I then see that this pte is
>> 	a) belonging to a shm segment?, and
>> 	b) see if this page/pte was really swapped out and not just not
>> yet written to at all?
>
> You would have to add a function in mm/shmem.c to do this: it would
> need to check vma->vm_file to work out if this vma belongs to it,
> and use shmem_swp_alloc() to check if the page there is on swap.  OTOH
> I'm not sure if you could call it while holding page table lock or not.
>
>> As answers I found:
>> 	a) (vma->vm_flags&  VM_MAYSHARE) is true for shm segments (is
>> 		this check sufficient?)
>
> No, VM_MAYSHARE is set on many other kinds of mapping too; and is not
> set on all mappings of shmem objects - there is no good reason to
> include SysV shm segments here, yet omit other kinds of shmem object
> (/dev/shm POSIX shared memory, shared-anonymous mappings, mappings of
> tmpfs files).
>
>> 	b) no idea.
>>
>> But if I add this page to the mss.swap entry, all pages including such
>> which haven't been touched yet at all are suddenly counted as
>> swapped-out...?
>>
>> Any hints here would be great...
>>
>>
>> As an alternative solution, I created the following patch.
>> This one works nicely, but it's just a fix-up of the mss.resident and
>> mss.swap values after walk_page_range() was called.
>> It's mostly a copy of the shm_add_rss_swap() function from
>> my previous patch (http://marc.info/?l=linux-mm&m=128171161101817&w=2).
>> Do you think such a fix-up-afterwards-approach is acceptable at all?
>> If yes, a new patch on top of my ipc/shm.c patch would be easy (and
>> small).
>
> Not acceptable, I'm afraid.  Nothing wrong with a fix-up-afterwards
> approach as such, but it's assuming that the vma covers the full extent
> of the shmem object.  That is very often the case, but by no means
> necessarily so (whereas it is always the case that one vma cannot cover
> more than one object).  So you do have to count pageslot by pageslot.
>
> There are two reasons why I think you have to abandon this.  One is
> that /proc/<pid>/smaps is reporting on the userspace mappings, saying
> where swap is instanced in them.  Some of those mappings may be of
> shmem objects, and some of those shmem objects may use swap backing
> themselves, but that's different from the mapping using swap directly.
>
> One can argue about that distinction, but it is how all this is
> designed, and blurring that distinction tends to get into trouble.
> (It's reasonable to think of anonymous mappings as mappings of anon
> objects, which just happen to find room for the swp_entry in the page
> table: but then it's a happy accident that smaps can see them.)
>
> The second reason is that since 2.6.34, /proc/<pid>/status shows
> VmSwap: we would not want a huge discrepancy between what it shows
> in swap and what /proc/<pid>/smaps shows in swap, but nor would we
> want to make /proc/<pid>/status scan through page tables enquiring
> of shmem.
>
> All this stands in contrast to your /proc/sysvipc/shm patch, which
> is rightly dealing with one class of shmem object, not via mappings
> of those objects.
>
> There is a case for a "where has my swap gone" tool, which examines
> the different kinds of object involved (anonymous mappings as well
> as shmem objects), and shows them all somehow.  But that's a lot
> more work than just extending an existing stats display.

Hugh, thanks for the good and comprehensive summary!
Seems that I have to live with the /proc/sysvipc/shm overview then :-(

Thanks,
Helge

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps
@ 2010-08-16 20:21             ` Helge Deller
  0 siblings, 0 replies; 18+ messages in thread
From: Helge Deller @ 2010-08-16 20:21 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: linux-mm, Andrew Morton, linux-kernel, Manfred Spraul

On 08/14/2010 12:45 AM, Hugh Dickins wrote:
> On Fri, 13 Aug 2010, Helge Deller wrote:
>>
>> I tried quite hard to implement rss/swap accounting for shm segments inside
>> smaps_pte_range() which is a callback function of walk_page_range() in
>> show_smap().
>
> Sorry, I think the short answer will be that you should give up on this:
> reasons below.
>
>>
>> Given the fact that I'm no linux-mm expert, I might have overlooked other
>> possibilities, but my experiments inside smaps_pte_range() were not
>> very successful:
>>  From my tests, a swapped-out shm segment
>> 	- fails on the "is_swap_pte()" test, and
>> 	- succeeds on the "!pte_present()" test (since it's swapped
>> 	  out).
>
> Yes.
>
>> So, here would it be possible to add such accounting for swap, but how
>> can I then see that this pte is
>> 	a) belonging to a shm segment?, and
>> 	b) see if this page/pte was really swapped out and not just not
>> yet written to at all?
>
> You would have to add a function in mm/shmem.c to do this: it would
> need to check vma->vm_file to work out if this vma belongs to it,
> and use shmem_swp_alloc() to check if the page there is on swap.  OTOH
> I'm not sure if you could call it while holding page table lock or not.
>
>> As answers I found:
>> 	a) (vma->vm_flags&  VM_MAYSHARE) is true for shm segments (is
>> 		this check sufficient?)
>
> No, VM_MAYSHARE is set on many other kinds of mapping too; and is not
> set on all mappings of shmem objects - there is no good reason to
> include SysV shm segments here, yet omit other kinds of shmem object
> (/dev/shm POSIX shared memory, shared-anonymous mappings, mappings of
> tmpfs files).
>
>> 	b) no idea.
>>
>> But if I add this page to the mss.swap entry, all pages including such
>> which haven't been touched yet at all are suddenly counted as
>> swapped-out...?
>>
>> Any hints here would be great...
>>
>>
>> As an alternative solution, I created the following patch.
>> This one works nicely, but it's just a fix-up of the mss.resident and
>> mss.swap values after walk_page_range() was called.
>> It's mostly a copy of the shm_add_rss_swap() function from
>> my previous patch (http://marc.info/?l=linux-mm&m=128171161101817&w=2).
>> Do you think such a fix-up-afterwards-approach is acceptable at all?
>> If yes, a new patch on top of my ipc/shm.c patch would be easy (and
>> small).
>
> Not acceptable, I'm afraid.  Nothing wrong with a fix-up-afterwards
> approach as such, but it's assuming that the vma covers the full extent
> of the shmem object.  That is very often the case, but by no means
> necessarily so (whereas it is always the case that one vma cannot cover
> more than one object).  So you do have to count pageslot by pageslot.
>
> There are two reasons why I think you have to abandon this.  One is
> that /proc/<pid>/smaps is reporting on the userspace mappings, saying
> where swap is instanced in them.  Some of those mappings may be of
> shmem objects, and some of those shmem objects may use swap backing
> themselves, but that's different from the mapping using swap directly.
>
> One can argue about that distinction, but it is how all this is
> designed, and blurring that distinction tends to get into trouble.
> (It's reasonable to think of anonymous mappings as mappings of anon
> objects, which just happen to find room for the swp_entry in the page
> table: but then it's a happy accident that smaps can see them.)
>
> The second reason is that since 2.6.34, /proc/<pid>/status shows
> VmSwap: we would not want a huge discrepancy between what it shows
> in swap and what /proc/<pid>/smaps shows in swap, but nor would we
> want to make /proc/<pid>/status scan through page tables enquiring
> of shmem.
>
> All this stands in contrast to your /proc/sysvipc/shm patch, which
> is rightly dealing with one class of shmem object, not via mappings
> of those objects.
>
> There is a case for a "where has my swap gone" tool, which examines
> the different kinds of object involved (anonymous mappings as well
> as shmem objects), and shows them all somehow.  But that's a lot
> more work than just extending an existing stats display.

Hugh, thanks for the good and comprehensive summary!
Seems that I have to live with the /proc/sysvipc/shm overview then :-(

Thanks,
Helge

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2010-08-16 20:21 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-11 20:13 [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm Helge Deller
2010-08-11 20:13 ` Helge Deller
2010-08-12 20:10 ` Andrew Morton
2010-08-12 20:10   ` Andrew Morton
2010-08-12 21:33   ` Helge Deller
2010-08-12 21:33     ` Helge Deller
2010-08-12 21:44     ` Andrew Morton
2010-08-12 21:44       ` Andrew Morton
2010-08-12 22:40     ` Hugh Dickins
2010-08-12 22:40       ` Hugh Dickins
2010-08-13 14:59       ` [PATCH] ipc/shm.c: add RSS and swap size information to /proc/sysvipc/shm (v2) Helge Deller
2010-08-13 14:59         ` Helge Deller
2010-08-13 19:52       ` [PATCH][RFC] Fix up rss/swap usage of shm segments in /proc/pid/smaps Helge Deller
2010-08-13 19:52         ` Helge Deller
2010-08-13 22:45         ` Hugh Dickins
2010-08-13 22:45           ` Hugh Dickins
2010-08-16 20:21           ` Helge Deller
2010-08-16 20:21             ` Helge Deller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.