* [PATCH] show per-process swap usage via procfs
@ 2009-11-04  6:24 ` KAMEZAWA Hiroyuki
From: KAMEZAWA Hiroyuki @ 2009-11-04  6:24 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-mm, hugh.dickins, cl, akpm, kosaki.motohiro


This patch has passed several tests, and one bug has been fixed since
the RFC version. It applies against mmotm.
=
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Currently, anon_rss and file_rss are counted as RSS and exported via
/proc. RSS usage is important information, but one more piece of
information that users often ask for is swap usage (so says our
user-support team).

This patch counts swap entry usage per process and shows it via
/proc/<pid>/status. The status file is robust against new entries,
so it is the first candidate.

 After this patch, /proc/<pid>/status includes the following line
 <snip>
 VmPeak:   315360 kB
 VmSize:   315360 kB
 VmLck:         0 kB
 VmHWM:    180452 kB
 VmRSS:    180452 kB
 VmData:   311624 kB
 VmStk:        84 kB
 VmExe:         4 kB
 VmLib:      1568 kB
 VmPTE:       640 kB
 VmSwap:   131240 kB <=== new information

Note:
  Because this patch counts swap PTEs in the page table, it will not
  catch shmem's swapout. That is already accounted in the per-shmem
  inode, so we don't need to do more.
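
For reference, userspace can pick the new field up by scanning
/proc/<pid>/status like any other Vm* line. A minimal C sketch (pid
handling and error checks are illustrative only):

	#include <stdio.h>

	/* Return VmSwap in kB for @pid, or -1 if absent/unreadable. */
	static long vmswap_kb(int pid)
	{
		char path[64], line[256];
		long kb = -1;
		FILE *fp;

		snprintf(path, sizeof(path), "/proc/%d/status", pid);
		fp = fopen(path, "r");
		if (!fp)
			return -1;
		while (fgets(line, sizeof(line), fp))
			if (sscanf(line, "VmSwap: %ld kB", &kb) == 1)
				break;
		fclose(fp);
		return kb;
	}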

Changelog: 2009/11/03
 - clean up.
 - fixed initialization bug at fork (mm_init())

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 fs/proc/task_mmu.c       |    9 ++++++---
 include/linux/mm_types.h |    1 +
 kernel/fork.c            |    1 +
 mm/memory.c              |   30 +++++++++++++++++++++---------
 mm/rmap.c                |    1 +
 mm/swapfile.c            |    1 +
 6 files changed, 31 insertions(+), 12 deletions(-)

Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
+++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
 	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
+	mm_counter_t _swap_usage;
 
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }
 
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
 {
 	if (file_rss)
 		add_mm_counter(mm, file_rss, file_rss);
 	if (anon_rss)
 		add_mm_counter(mm, anon_rss, anon_rss);
+	if (swap_usage)
+		add_mm_counter(mm, swap_usage, swap_usage);
 }
 
 /*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			if (is_write_migration_entry(entry) &&
+			if (!is_migration_entry(entry))
+				rss[2]++;
+			else if (is_write_migration_entry(entry) &&
 					is_cow_mapping(vm_flags)) {
 				/*
 				 * COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];
+	int rss[3];
 	swp_entry_t entry = (swp_entry_t){0};
 
 again:
-	rss[1] = rss[0] = 0;
+	rss[2] = rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -688,7 +693,7 @@ again:
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(orig_src_pte);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
+	add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 
@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
 	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
+	int swap_usage = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
 		if (pte_file(ptent)) {
 			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
 				print_bad_pte(vma, addr, ptent, NULL);
-		} else if
-		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
-			print_bad_pte(vma, addr, ptent, NULL);
+		} else {
+			swp_entry_t ent = pte_to_swp_entry(ptent);
+
+			if (!is_migration_entry(ent))
+				swap_usage--;
+			if (unlikely(!free_swap_and_cache(ent)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
-	add_mm_rss(mm, file_rss, anon_rss);
+	add_mm_rss(mm, file_rss, anon_rss, swap_usage);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
 	 */
 
 	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mmotm-2.6.32-Nov2/mm/swapfile.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
+++ mmotm-2.6.32-Nov2/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
 	}
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
+	dec_mm_counter(vma->vm_mm, swap_usage);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
+++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
 	unsigned long data, text, lib;
-	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	swap = get_mm_counter(mm, swap_usage);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
-		"VmPTE:\t%8lu kB\n",
+		"VmPTE:\t%8lu kB\n"
+		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		swap << (PAGE_SHIFT - 10));
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
Index: mmotm-2.6.32-Nov2/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/rmap.c
+++ mmotm-2.6.32-Nov2/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page 
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
Index: mmotm-2.6.32-Nov2/kernel/fork.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/kernel/fork.c
+++ mmotm-2.6.32-Nov2/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
+	set_mm_counter(mm, swap_usage, 0);
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;


* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04  6:24 ` KAMEZAWA Hiroyuki
@ 2009-11-04 19:15   ` Christoph Lameter
From: Christoph Lameter @ 2009-11-04 19:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)

Hmmm... Could we do some rework of the counters first so that they are per
cpu?


* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04 19:15   ` Christoph Lameter
@ 2009-11-04 23:25     ` KOSAKI Motohiro
From: KOSAKI Motohiro @ 2009-11-04 23:25 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: kosaki.motohiro, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	hugh.dickins, akpm

> On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > RSS usage is important information but one more information which
> > is often asked by users is "usage of swap".(user support team said.)
> 
> Hmmm... Could we do some rework of the counters first so that they are per
> cpu?

A per-cpu swap counter?
That seems like overkill....




* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04 19:15   ` Christoph Lameter
@ 2009-11-05  0:06     ` KAMEZAWA Hiroyuki
From: KAMEZAWA Hiroyuki @ 2009-11-05  0:06 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Wed, 4 Nov 2009 14:15:40 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > RSS usage is important information but one more information which
> > is often asked by users is "usage of swap".(user support team said.)
> 
> Hmmm... Could we do some rework of the counters first so that they are per
> cpu?
> 
I don't think the swap_usage counter costs much, because its call
path is always a slow path. But I'm not in a hurry, so a rework is OK.

I'll post my percpu array counter with some rework, CCing you.
Maybe it can be used in this case.

Thanks,
-Kame



* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04 23:25     ` KOSAKI Motohiro
@ 2009-11-05  2:28       ` KAMEZAWA Hiroyuki
From: KAMEZAWA Hiroyuki @ 2009-11-05  2:28 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: Christoph Lameter, linux-kernel, linux-mm, hugh.dickins, akpm

On Thu,  5 Nov 2009 08:25:28 +0900 (JST)
KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> wrote:

> > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> > 
> > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > RSS usage is important information but one more information which
> > > is often asked by users is "usage of swap".(user support team said.)
> > 
> > Hmmm... Could we do some rework of the counters first so that they are per
> > cpu?
> 
> per-cpu swap counter?
> It seems overkill effort....
> 
I mostly agree with you.

Thanks,
-Kame



* [RFC][PATCH] lib: generic percpu counter array
  2009-11-05  0:06     ` KAMEZAWA Hiroyuki
@ 2009-11-05  5:16       ` KAMEZAWA Hiroyuki
From: KAMEZAWA Hiroyuki @ 2009-11-05  5:16 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, linux-kernel, linux-mm, hugh.dickins, akpm,
	kosaki.motohiro

On Thu, 5 Nov 2009 09:06:59 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> I'll post my percpu array counter with some rework, CCing you.
> Maybe it can be used in this case.
> 

This patch has been in my queue for a month.
I'd be glad to get advice from you. This patch targets memcg, for now.

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now that the percpu code has been rewritten, it is easy to use
dynamically. We have lib/percpu_counter.c, but it uses
 - unsigned long long
 - a spinlock
so it tends to be large and not very well optimized.

Another major percpu counter is vm_stat[]. This patch implements a
vm_stat[]-style counter array in lib/percpu_counter.c.
This is designed to introduce a vm_stat[]-style counter to memcg,
but it may be useful for others, too. With this, a compact percpu
counter array can be implemented easily.

The usage I have in mind is like this:

	enum {
		ELEM_A, ELEM_B, NR_ELEMENTS};
	struct hoge {
		....
		...
		DEFINE_COUNTER_ARRAY(name, NR_ELEMENTS);
		.....
	} xxxx;

	counter_array_add(_CA(&xxxx.name), ELEM_A, val);
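
Initialization and reading would then look like this; a sketch using
only the functions added by this patch (the "0" picks the synced mode):

	if (counter_array_init(_CA(&xxxx.name), NR_ELEMENTS, 0))
		return -ENOMEM;

	counter_array_inc(_CA(&xxxx.name), ELEM_A);

	/* cheap, possibly stale read vs. exact sum across cpus */
	approx = counter_array_read(_CA(&xxxx.name), ELEM_A);
	exact = counter_array_sum(_CA(&xxxx.name), ELEM_A);

	counter_array_destroy(_CA(&xxxx.name));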

Changelog 2009/11/05
 - renamed name of structures.
 - rewrote all comments
 - support "nosync" mode
 - fixed !SMP case
 - changed percpu value from "char" to "long"

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/percpu_counter.h |  107 +++++++++++++++++++++++++++++
 lib/percpu_counter.c           |  148 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 255 insertions(+)

Index: mmotm-2.6.32-Nov2/include/linux/percpu_counter.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/percpu_counter.h
+++ mmotm-2.6.32-Nov2/include/linux/percpu_counter.h
@@ -77,6 +77,59 @@ static inline s64 percpu_counter_read_po
 	return 1;
 }
 
+/*
+ * A counter array is an array of counters like percpu_counter, but its idea
+ * comes mainly from vm_stat[]. Unlike vm_stat[], this counter uses an "int"
+ * batch value. If the user wants, it can provide a "nosync" percpu counter,
+ * but in that case reads will be slow.
+ *
+ * One more point is the size of this array. It uses a cacheline-sized header
+ * plus the element slots, and element-sized slots in the percpu area. So it
+ * uses more memory than a simple atomic_t.
+ */
+
+struct _pad_counter_array {
+	char elements;
+	char nosync;
+	int batch;
+	long *array;
+#ifdef CONFIG_HOTPLUG_CPU
+	struct list_head list;
+#endif
+} ____cacheline_aligned_in_smp;
+
+struct counter_array {
+	struct _pad_counter_array v;
+	atomic_long_t counters[0];
+};
+
+#define DEFINE_COUNTER_ARRAY(name, elements) \
+	struct {\
+		struct counter_array ca;\
+		long __counters[(elements)]; } name;
+
+#define DEFINE_COUNTER_ARRAY_NOSYNC(name, elements) \
+	struct {\
+		struct counter_array ca; } name;
+/*
+ * To access counters, this macro makes calls easy, e.g.
+ * counter_array_add(_CA(object->name), elem, val);
+ */
+#define _CA(x)	(&(x)->ca)
+/* For the meaning of "nosync", see lib/percpu_counter.c. */
+int counter_array_init(struct counter_array *ca, int size, int nosync);
+void counter_array_destroy(struct counter_array *ca);
+void counter_array_add(struct counter_array *ca, int idx, int val);
+void __counter_array_add(struct counter_array *ca, int idx, int val, int batch);
+
+static inline long counter_array_read(struct counter_array *ca, int idx)
+{
+	return atomic_long_read(&ca->counters[idx]);
+}
+
+/* takes all percpu values into account */
+long counter_array_sum(struct counter_array *ca, int idx);
+
 #else
 
 struct percpu_counter {
@@ -129,6 +182,45 @@ static inline s64 percpu_counter_sum(str
 	return percpu_counter_read(fbc);
 }
 
+struct counter_array {
+	long counters[0];
+};
+#define DEFINE_COUNTER_ARRAY(name, elements) \
+	struct {\
+		struct counter_array ca;\
+		long counters[(elements)]; } name;
+
+static inline int counter_array_init(struct counter_array *ca,
+		int size, int nosync)
+{
+	return 0;
+}
+
+static inline void counter_array_destroy(struct counter_array *ca)
+{
+}
+
+static inline void
+counter_array_add(struct counter_array *ca, int idx, int val)
+{
+	ca->counters[idx] += val;
+}
+
+static inline void
+__counter_array_add(struct counter_array *ca, int idx, int val, int batch)
+{
+	ca->counters[idx] += val;
+}
+
+static inline long counter_array_read(struct counter_array *ca, int idx)
+{
+	return ca->counters[idx];
+}
+
+static inline long counter_array_sum(struct counter_array *ca, int idx)
+{
+	return ca->counters[idx];
+}
 #endif	/* CONFIG_SMP */
 
 static inline void percpu_counter_inc(struct percpu_counter *fbc)
@@ -146,4 +238,19 @@ static inline void percpu_counter_sub(st
 	percpu_counter_add(fbc, -amount);
 }
 
+static inline void counter_array_inc(struct counter_array *ca, int idx)
+{
+	counter_array_add(ca, idx, 1);
+}
+
+static inline void counter_array_dec(struct counter_array *ca, int idx)
+{
+	counter_array_add(ca, idx, -1);
+}
+
+static inline void
+counter_array_sub(struct counter_array *ca, int idx, int val)
+{
+	counter_array_add(ca, idx, -val);
+}
 #endif /* _LINUX_PERCPU_COUNTER_H */
Index: mmotm-2.6.32-Nov2/lib/percpu_counter.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/lib/percpu_counter.c
+++ mmotm-2.6.32-Nov2/lib/percpu_counter.c
@@ -144,3 +144,151 @@ static int __init percpu_counter_startup
 	return 0;
 }
 module_init(percpu_counter_startup);
+
+/* COUNTER_ARRAY */
+DEFINE_MUTEX(counter_array_mutex);
+LIST_HEAD(counter_arrays);
+#ifdef CONFIG_HOTPLUG_CPU
+#define MAINTAIN_LIST(ca)	(!(ca)->v.nosync)
+#else
+#define MAINTAIN_LIST(ca)	0
+#endif
+
+/**
+ * counter_array_init - initialize counter array with percpu.
+ * @ca: counter array to be initialized
+ * @size: the number of elements in this array
+ * @nosync: need to sync in batch or not
+ *
+ * Initialize a counter array with @size elements. Modifications to each
+ * value are cached in the percpu area and merged into the global atomic
+ * counter in a batched manner. If nosync==1, the global atomic counter is
+ * not used, and readers always have to use counter_array_sum().
+ *
+ * If nosync is specified, the counter is not put on the list used for CPU
+ * hotplug notification. If you often alloc/free counters, nosync is
+ * preferable, but you must use counter_array_sum() to read. It's a trade-off.
+ */
+int counter_array_init(struct counter_array *ca, int size, int nosync)
+{
+	ca->v.array = __alloc_percpu(size * sizeof(long), __alignof__(long));
+	if (!ca->v.array)
+		return -ENOMEM;
+	ca->v.nosync = nosync;
+	ca->v.elements = size;
+
+	if (MAINTAIN_LIST(ca)) {
+		mutex_lock(&counter_array_mutex);
+		list_add(&ca->v.list, &counter_arrays);
+		mutex_unlock(&counter_array_mutex);
+	}
+	return 0;
+}
+
+void counter_array_destroy(struct counter_array *ca)
+{
+	if (MAINTAIN_LIST(ca)) {
+		mutex_lock(&counter_array_mutex);
+		list_del(&ca->v.list);
+		mutex_unlock(&counter_array_mutex);
+	}
+	free_percpu(ca->v.array);
+	ca->v.array = NULL;
+}
+#undef MAINTAIN_LIST
+
+/**
+ * __counter_array_add - add specified value to counter[idx]
+ * @ca: counter array to be modified
+ * @idx: index in counter array
+ * @val: value to be added
+ * @batch: threshold at which percpu values are folded into the global counter
+ *
+ * Add the specified value to counter[idx]. Users can control how frequently
+ * synchronization happens via the @batch value. If the counter was
+ * initialized as a "nosync" counter, no synchronization happens.
+ */
+void __counter_array_add(struct counter_array *ca, int idx, int val, int batch)
+{
+	long count, *pcount;
+
+	preempt_disable();
+
+	pcount = this_cpu_ptr(ca->v.array);
+	count = pcount[idx] + val;
+	if (!ca->v.nosync && ((count > batch) || (count < -batch))) {
+		atomic_long_add(count, &ca->counters[idx]);
+		pcount[idx] = 0;
+	} else
+		pcount[idx] = count;
+	preempt_enable();
+}
+
+void counter_array_add(struct counter_array *ca, int idx, int val)
+{
+	__counter_array_add(ca, idx, val, percpu_counter_batch);
+}
+
+long counter_array_sum(struct counter_array *ca, int idx)
+{
+	long val, *pcount;
+	int cpu;
+
+	if (ca->v.nosync) {
+		val = 0;
+		/* We don't have CPU HOTPLUG callback */
+		for_each_possible_cpu(cpu) {
+			pcount = per_cpu_ptr(ca->v.array, cpu);
+			val += pcount[idx];
+		}
+	} else {
+		/*
+		 * There may be a race with CPU hotplug here, but the
+		 * amount of error stays below the batch value.
+		 */
+		val = atomic_long_read(&ca->counters[idx]);
+		for_each_online_cpu(cpu) {
+			pcount = per_cpu_ptr(ca->v.array, cpu);
+			val += pcount[idx];
+		}
+	}
+	return val;
+}
+
+static int __cpuinit counter_array_hotcpu_callback(struct notifier_block *nb,
+		unsigned long action, void *hcpu)
+{
+	struct _pad_counter_array *pca;
+	unsigned int cpu;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+
+	cpu = (unsigned long)hcpu;
+	/*
+	 * nosync counter is not on this list.
+	 */
+	mutex_lock(&counter_array_mutex);
+	list_for_each_entry(pca, &counter_arrays, list) {
+		struct counter_array *ca;
+		long *pcount;
+		int idx;
+
+		pcount = per_cpu_ptr(pca->array, cpu);
+		ca = container_of(pca, struct counter_array, v);
+		for (idx = 0; idx < ca->v.elements; idx++) {
+			atomic_long_add(pcount[idx], &ca->counters[idx]);
+			pcount[idx] = 0;
+		}
+	}
+	mutex_unlock(&counter_array_mutex);
+
+	return NOTIFY_OK;
+}
+
+static int __init counter_array_startup(void)
+{
+	hotcpu_notifier(counter_array_hotcpu_callback, 0);
+	return 0;
+}
+module_init(counter_array_startup);



* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04  6:24 ` KAMEZAWA Hiroyuki
@ 2009-11-05 14:41   ` KOSAKI Motohiro
From: KOSAKI Motohiro @ 2009-11-05 14:41 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: linux-kernel, linux-mm, hugh.dickins, cl, akpm

2009/11/4 KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>:
>
> Passed several tests and one bug was fixed since RFC version.
> This patch is against mmotm.
> =
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
>  After this, /proc/<pid>/status includes following line
>  <snip>
>  VmPeak:   315360 kB
>  VmSize:   315360 kB
>  VmLck:         0 kB
>  VmHWM:    180452 kB
>  VmRSS:    180452 kB
>  VmData:   311624 kB
>  VmStk:        84 kB
>  VmExe:         4 kB
>  VmLib:      1568 kB
>  VmPTE:       640 kB
>  VmSwap:   131240 kB <=== new information
>
> Note:
>  Because this patch catches swap_pte on page table, this will
>  not catch shmem's swapout. It's already accounted in per-shmem
>  inode and we don't need to do more.

Sidenote: top(1) can show SWAP usage, but its implementation is crazy
buggy: it defines VIRT = SWAP + RES (see man top or the actual source
code). This patch helps fix that insane calculation.

    Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>


* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04 23:25     ` KOSAKI Motohiro
@ 2009-11-05 15:04       ` Christoph Lameter
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:04 UTC (permalink / raw)
  To: KOSAKI Motohiro
  Cc: KAMEZAWA Hiroyuki, linux-kernel, linux-mm, hugh.dickins, akpm

On Thu, 5 Nov 2009, KOSAKI Motohiro wrote:

> > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> >
> > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > RSS usage is important information but one more information which
> > > is often asked by users is "usage of swap".(user support team said.)
> >
> > Hmmm... Could we do some rework of the counters first so that they are per
> > cpu?
>
> per-cpu swap counter?
> It seems overkill effort....

The other alternative is to use atomic ops, which are significantly
slower and have an impact on critical sections.
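
In a nutshell, this is the trade-off the counter-array patch above
encodes; a sketch using its own primitives (ca and idx as defined
there, preemption disabled by the caller as in __counter_array_add()):

	/* global atomic: every update bounces a shared cacheline */
	atomic_long_add(1, &ca->counters[idx]);

	/* percpu: a cheap local add, folded into the global counter
	 * only when the batch threshold is crossed */
	pcount = this_cpu_ptr(ca->v.array);
	pcount[idx]++;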



* Re: [PATCH] show per-process swap usage via procfs
  2009-11-04  6:24 ` KAMEZAWA Hiroyuki
@ 2009-11-05 15:11   ` Minchan Kim
From: Minchan Kim @ 2009-11-05 15:11 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, hugh.dickins, cl, akpm, kosaki.motohiro

Hi, Kame.

On Wed, Nov 4, 2009 at 3:24 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
>
> Passed several tests and one bug was fixed since RFC version.
> This patch is against mmotm.
> =
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> RSS usage is important information but one more information which
> is often asked by users is "usage of swap".(user support team said.)
>
> This patch counts swap entry usage per process and show it via
> /proc/<pid>/status. I think status file is robust against new entry.
> Then, it is the first candidate..
>
>  After this, /proc/<pid>/status includes following line
>  <snip>
>  VmPeak:   315360 kB
>  VmSize:   315360 kB
>  VmLck:         0 kB
>  VmHWM:    180452 kB
>  VmRSS:    180452 kB
>  VmData:   311624 kB
>  VmStk:        84 kB
>  VmExe:         4 kB
>  VmLib:      1568 kB
>  VmPTE:       640 kB
>  VmSwap:   131240 kB <=== new information
>
> Note:
>  Because this patch catches swap_pte on page table, this will
>  not catch shmem's swapout. It's already accounted in per-shmem
>  inode and we don't need to do more.
>
> Changelog: 2009/11/03
>  - clean up.
>  - fixed initialization bug at fork (init_mm())
>
> Acked-by: Acked-by; David Rientjes <rientjes@google.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> ---
>  fs/proc/task_mmu.c       |    9 ++++++---
>  include/linux/mm_types.h |    1 +
>  kernel/fork.c            |    1 +
>  mm/memory.c              |   30 +++++++++++++++++++++---------
>  mm/rmap.c                |    1 +
>  mm/swapfile.c            |    1 +
>  6 files changed, 31 insertions(+), 12 deletions(-)
>
> Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
> +++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
> @@ -228,6 +228,7 @@ struct mm_struct {
>         */
>        mm_counter_t _file_rss;
>        mm_counter_t _anon_rss;
> +       mm_counter_t _swap_usage;
>
>        unsigned long hiwater_rss;      /* High-watermark of RSS usage */
>        unsigned long hiwater_vm;       /* High-water virtual memory usage */
> Index: mmotm-2.6.32-Nov2/mm/memory.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/memory.c
> +++ mmotm-2.6.32-Nov2/mm/memory.c
> @@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
>        return 0;
>  }
>
> -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
> +static inline void
> +add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
>  {
>        if (file_rss)
>                add_mm_counter(mm, file_rss, file_rss);
>        if (anon_rss)
>                add_mm_counter(mm, anon_rss, anon_rss);
> +       if (swap_usage)
> +               add_mm_counter(mm, swap_usage, swap_usage);
>  }
>
>  /*
> @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
>                                                 &src_mm->mmlist);
>                                spin_unlock(&mmlist_lock);
>                        }
> -                       if (is_write_migration_entry(entry) &&
> +                       if (!is_migration_entry(entry))
> +                               rss[2]++;

My first thought: do we assume that !is_migration_entry(entry) means
a swap entry?
We began supporting HWPOISON.
HWPOISON would be a rare event, so somewhat less exact swap accounting
may be acceptable, I think.  Is that enough to justify it?
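
If exact accounting were wanted, the copy/zap checks could presumably
skip HWPOISON entries too; a sketch, assuming is_hwpoison_entry()
from <linux/swapops.h>:

	if (!is_migration_entry(entry) && !is_hwpoison_entry(entry))
		rss[2]++;	/* count only real swap entries */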

> +                       else if (is_write_migration_entry(entry) &&
>                                        is_cow_mapping(vm_flags)) {
>                                /*
>                                 * COW mappings require pages in both parent
> @@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
>        pte_t *src_pte, *dst_pte;
>        spinlock_t *src_ptl, *dst_ptl;
>        int progress = 0;
> -       int rss[2];
> +       int rss[3];
>        swp_entry_t entry = (swp_entry_t){0};
>
>  again:
> -       rss[1] = rss[0] = 0;
> +       rss[2] = rss[1] = rss[0] = 0;
>        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
>        if (!dst_pte)
>                return -ENOMEM;
> @@ -688,7 +693,7 @@ again:
>        arch_leave_lazy_mmu_mode();
>        spin_unlock(src_ptl);
>        pte_unmap_nested(orig_src_pte);
> -       add_mm_rss(dst_mm, rss[0], rss[1]);
> +       add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
>        pte_unmap_unlock(orig_dst_pte, dst_ptl);
>        cond_resched();
>
> @@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
>        spinlock_t *ptl;
>        int file_rss = 0;
>        int anon_rss = 0;
> +       int swap_usage = 0;
>
>        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
>        arch_enter_lazy_mmu_mode();
> @@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
>                if (pte_file(ptent)) {
>                        if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
>                                print_bad_pte(vma, addr, ptent, NULL);
> -               } else if
> -                 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
> -                       print_bad_pte(vma, addr, ptent, NULL);
> +               } else {
> +                       swp_entry_t ent = pte_to_swp_entry(ptent);
> +
> +                       if (!is_migration_entry(ent))
> +                               swap_usage--;

ditto

> +                       if (unlikely(!free_swap_and_cache(ent)))
> +                               print_bad_pte(vma, addr, ptent, NULL);
> +               }
>                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
>        } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
>
> -       add_mm_rss(mm, file_rss, anon_rss);
> +       add_mm_rss(mm, file_rss, anon_rss, swap_usage);
>        arch_leave_lazy_mmu_mode();
>        pte_unmap_unlock(pte - 1, ptl);
>
> @@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
>         */
>
>        inc_mm_counter(mm, anon_rss);
> +       dec_mm_counter(mm, swap_usage);
>        pte = mk_pte(page, vma->vm_page_prot);
>        if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
>                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> Index: mmotm-2.6.32-Nov2/mm/swapfile.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
> +++ mmotm-2.6.32-Nov2/mm/swapfile.c
> @@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
>        }
>
>        inc_mm_counter(vma->vm_mm, anon_rss);
> +       dec_mm_counter(vma->vm_mm, swap_usage);
>        get_page(page);
>        set_pte_at(vma->vm_mm, addr, pte,
>                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
> +++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> @@ -17,7 +17,7 @@
>  void task_mem(struct seq_file *m, struct mm_struct *mm)
>  {
>        unsigned long data, text, lib;
> -       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> +       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
>
>        /*
>         * Note: to minimize their overhead, mm maintains hiwater_vm and
> @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
>        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
>        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
>        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
> +       swap = get_mm_counter(mm, swap_usage);
>        seq_printf(m,
>                "VmPeak:\t%8lu kB\n"
>                "VmSize:\t%8lu kB\n"
> @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
>                "VmStk:\t%8lu kB\n"
>                "VmExe:\t%8lu kB\n"
>                "VmLib:\t%8lu kB\n"
> -               "VmPTE:\t%8lu kB\n",
> +               "VmPTE:\t%8lu kB\n"
> +               "VmSwap:\t%8lu kB\n",
>                hiwater_vm << (PAGE_SHIFT-10),
>                (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
>                mm->locked_vm << (PAGE_SHIFT-10),
> @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
>                total_rss << (PAGE_SHIFT-10),
>                data << (PAGE_SHIFT-10),
>                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
> -               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
> +               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
> +               swap << (PAGE_SHIFT - 10));
>  }
>
>  unsigned long task_vsize(struct mm_struct *mm)
> Index: mmotm-2.6.32-Nov2/mm/rmap.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/mm/rmap.c
> +++ mmotm-2.6.32-Nov2/mm/rmap.c
> @@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
>                                spin_unlock(&mmlist_lock);
>                        }
>                        dec_mm_counter(mm, anon_rss);
> +                       inc_mm_counter(mm, swap_usage);
>                } else if (PAGE_MIGRATION) {
>                        /*
>                         * Store the pfn of the page in a special migration
> Index: mmotm-2.6.32-Nov2/kernel/fork.c
> ===================================================================
> --- mmotm-2.6.32-Nov2.orig/kernel/fork.c
> +++ mmotm-2.6.32-Nov2/kernel/fork.c
> @@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
>        mm->nr_ptes = 0;
>        set_mm_counter(mm, file_rss, 0);
>        set_mm_counter(mm, anon_rss, 0);
> +       set_mm_counter(mm, swap_usage, 0);
>        spin_lock_init(&mm->page_table_lock);
>        mm->free_area_cache = TASK_UNMAPPED_BASE;
>        mm->cached_hole_size = ~0UL;

That's good.
From now on, we can change the pte scan to find swap ptes
in smaps_pte_range, too. :)
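
Something along these lines in smaps_pte_range(), as a sketch (the
mss->swap field is a made-up addition to struct mem_size_stats, and
is_swap_pte() is assumed from swapops.h):

	if (is_swap_pte(ptent)) {
		swp_entry_t ent = pte_to_swp_entry(ptent);

		/* count only real swap entries, as in the patch */
		if (!non_swap_entry(ent))
			mss->swap += PAGE_SIZE;
	}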

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC][PATCH] lib: generic percpu counter array
  2009-11-05  5:16       ` KAMEZAWA Hiroyuki
@ 2009-11-05 15:15         ` Christoph Lameter
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:15 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:

> +static inline void
> +counter_array_add(struct counter_array *ca, int idx, int val)
> +{
> +	ca->counters[idx] += val;
> +}

This is not a per-cpu operation and is therefore expensive. The new
percpu this_cpu_inc, for example, generates a single x86 instruction
for an increment.
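
For example, for a statically declared per-cpu counter (my_count is a
made-up name):

	DEFINE_PER_CPU(long, my_count);

	this_cpu_inc(my_count);	/* one inc on the local cpu's copy */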

> +void __counter_array_add(struct counter_array *ca, int idx, int val, int batch)
> +{
> +	long count, *pcount;
> +
> +	preempt_disable();
> +
> +	pcount = this_cpu_ptr(ca->v.array);
> +	count = pcount[idx] + val;
> +	if (!ca->v.nosync && ((count > batch) || (count < -batch))) {
> +		atomic_long_add(count, &ca->counters[idx]);
> +		pcount[idx] = 0;
> +	} else
> +		pcount[idx] = count;
> +	preempt_enable();
> +}

Too expensive to use in critical VM paths. The percpu operations
generate a single instruction instead of the code above; no need for
the preempt_disable()/preempt_enable() pair.
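
A sketch of the unbatched alternative, assuming ca->v.array can be
addressed by the this_cpu ops (i.e. is __percpu annotated):

	/* single instruction on x86, no preempt_disable() pair needed */
	this_cpu_add(ca->v.array[idx], val);

The trade-off is that nothing is folded into ca->counters[idx] any
more, so the read side has to sum over all cpus.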

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC][PATCH] lib: generic percpu counter array
  2009-11-05  5:16       ` KAMEZAWA Hiroyuki
@ 2009-11-05 15:20         ` Christoph Lameter
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Another major percpu counter is vm_stat[]. This patch implements
> a vm_stat[]-style counter array in lib/percpu_counter.c.
> This is designed for introducing a vm_stat[]-style counter to memcg,
> but it may be useful for other people. By using this, a per-cpu
> counter array can be implemented easily in a compact structure.


Note that vm_stat support was written that way because we have extreme
space constraints due to the need to keep statistics per zone and per
cpu, and to avoid the cache line pressure that would result from the
use of big integer arrays per zone and per cpu. For a large number of
zones and cpus this is disastrous.

If you only need to keep statistics per cpu for an entity then the vmstat
approach is overkill. A per cpu allocation of a counter is enough.
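
A sketch of that, assuming a plain per-cpu array of longs:

	long __percpu *counters;

	counters = __alloc_percpu(nitems * sizeof(long),
				  __alignof__(long));

	/* writer: update the local cpu's slot, no locking needed */
	this_cpu_add(counters[idx], val);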


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH] show per-process swap usage via procfs
  2009-11-05 15:11   ` Minchan Kim
@ 2009-11-05 23:48     ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05 23:48 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-kernel, linux-mm, hugh.dickins, cl, akpm, kosaki.motohiro

On Fri, 6 Nov 2009 00:11:32 +0900
Minchan Kim <minchan.kim@gmail.com> wrote:

> Hi, Kame.
> 
Hi,

<snip>
> >  /*
> > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> >                                                 &src_mm->mmlist);
> >                                spin_unlock(&mmlist_lock);
> >                        }
> > -                       if (is_write_migration_entry(entry) &&
> > +                       if (!is_migration_entry(entry))
> > +                               rss[2]++;
> 
> My first thought: do we believe !is_migration_entry(entry) means a swap
> entry?
> We began supporting HWPOISON, so a non-present pte can also hold an
> HWPOISON entry.
> HWPOISON would be a rare event, so somewhat less exact swap accounting
> may be acceptable, I think.  Is that enough to justify this?
> 
Ah, ok, I'll fix it here.


> > +                       else if (is_write_migration_entry(entry) &&
> >                                        is_cow_mapping(vm_flags)) {
> >                                /*
> >                                 * COW mappings require pages in both parent
> > @@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
> >        pte_t *src_pte, *dst_pte;
> >        spinlock_t *src_ptl, *dst_ptl;
> >        int progress = 0;
> > -       int rss[2];
> > +       int rss[3];
> >        swp_entry_t entry = (swp_entry_t){0};
> >
> >  again:
> > -       rss[1] = rss[0] = 0;
> > +       rss[2] = rss[1] = rss[0] = 0;
> >        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
> >        if (!dst_pte)
> >                return -ENOMEM;
> > @@ -688,7 +693,7 @@ again:
> >        arch_leave_lazy_mmu_mode();
> >        spin_unlock(src_ptl);
> >        pte_unmap_nested(orig_src_pte);
> > -       add_mm_rss(dst_mm, rss[0], rss[1]);
> > +       add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
> >        pte_unmap_unlock(orig_dst_pte, dst_ptl);
> >        cond_resched();
> >
> > @@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
> >        spinlock_t *ptl;
> >        int file_rss = 0;
> >        int anon_rss = 0;
> > +       int swap_usage = 0;
> >
> >        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> >        arch_enter_lazy_mmu_mode();
> > @@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
> >                if (pte_file(ptent)) {
> >                        if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
> >                                print_bad_pte(vma, addr, ptent, NULL);
> > -               } else if
> > -                 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
> > -                       print_bad_pte(vma, addr, ptent, NULL);
> > +               } else {
> > +                       swp_entry_t ent = pte_to_swp_entry(ptent);
> > +
> > +                       if (!is_migration_entry(ent))
> > +                               swap_usage--;
> 
> ditto
> 
ok, will do.


> > +                       if (unlikely(!free_swap_and_cache(ent)))
> > +                               print_bad_pte(vma, addr, ptent, NULL);
> > +               }
> >                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
> >        } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
> >
> > -       add_mm_rss(mm, file_rss, anon_rss);
> > +       add_mm_rss(mm, file_rss, anon_rss, swap_usage);
> >        arch_leave_lazy_mmu_mode();
> >        pte_unmap_unlock(pte - 1, ptl);
> >
> > @@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
> >         */
> >
> >        inc_mm_counter(mm, anon_rss);
> > +       dec_mm_counter(mm, swap_usage);
> >        pte = mk_pte(page, vma->vm_page_prot);
> >        if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
> >                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> > Index: mmotm-2.6.32-Nov2/mm/swapfile.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
> > +++ mmotm-2.6.32-Nov2/mm/swapfile.c
> > @@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
> >        }
> >
> >        inc_mm_counter(vma->vm_mm, anon_rss);
> > +       dec_mm_counter(vma->vm_mm, swap_usage);
> >        get_page(page);
> >        set_pte_at(vma->vm_mm, addr, pte,
> >                   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> > Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
> > +++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
> > @@ -17,7 +17,7 @@
> >  void task_mem(struct seq_file *m, struct mm_struct *mm)
> >  {
> >        unsigned long data, text, lib;
> > -       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
> > +       unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
> >
> >        /*
> >         * Note: to minimize their overhead, mm maintains hiwater_vm and
> > @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
> >        data = mm->total_vm - mm->shared_vm - mm->stack_vm;
> >        text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
> >        lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
> > +       swap = get_mm_counter(mm, swap_usage);
> >        seq_printf(m,
> >                "VmPeak:\t%8lu kB\n"
> >                "VmSize:\t%8lu kB\n"
> > @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
> >                "VmStk:\t%8lu kB\n"
> >                "VmExe:\t%8lu kB\n"
> >                "VmLib:\t%8lu kB\n"
> > -               "VmPTE:\t%8lu kB\n",
> > +               "VmPTE:\t%8lu kB\n"
> > +               "VmSwap:\t%8lu kB\n",
> >                hiwater_vm << (PAGE_SHIFT-10),
> >                (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
> >                mm->locked_vm << (PAGE_SHIFT-10),
> > @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
> >                total_rss << (PAGE_SHIFT-10),
> >                data << (PAGE_SHIFT-10),
> >                mm->stack_vm << (PAGE_SHIFT-10), text, lib,
> > -               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
> > +               (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
> > +               swap << (PAGE_SHIFT - 10));
> >  }
> >
> >  unsigned long task_vsize(struct mm_struct *mm)
> > Index: mmotm-2.6.32-Nov2/mm/rmap.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/mm/rmap.c
> > +++ mmotm-2.6.32-Nov2/mm/rmap.c
> > @@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page
> >                                spin_unlock(&mmlist_lock);
> >                        }
> >                        dec_mm_counter(mm, anon_rss);
> > +                       inc_mm_counter(mm, swap_usage);
> >                } else if (PAGE_MIGRATION) {
> >                        /*
> >                         * Store the pfn of the page in a special migration
> > Index: mmotm-2.6.32-Nov2/kernel/fork.c
> > ===================================================================
> > --- mmotm-2.6.32-Nov2.orig/kernel/fork.c
> > +++ mmotm-2.6.32-Nov2/kernel/fork.c
> > @@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
> >        mm->nr_ptes = 0;
> >        set_mm_counter(mm, file_rss, 0);
> >        set_mm_counter(mm, anon_rss, 0);
> > +       set_mm_counter(mm, swap_usage, 0);
> >        spin_lock_init(&mm->page_table_lock);
> >        mm->free_area_cache = TASK_UNMAPPED_BASE;
> >        mm->cached_hole_size = ~0UL;
> 
> That's good.
> From now on, we can change the pte scan to find swap ptes
> in smaps_pte_range, too. :)
> 

Thanks, I'll update this.
-Kame



^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC][PATCH] lib: generic percpu counter array
  2009-11-05 15:15         ` Christoph Lameter
@ 2009-11-06  0:49           ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  0:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro


Thank you for the review.

On Thu, 5 Nov 2009 10:15:36 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> > +static inline void
> > +counter_array_add(struct counter_array *ca, int idx, int val)
> > +{
> > +	ca->counters[idx] += val;
> > +}
> 
> This is not a per cpu operation and therefore expensive. The new percpu
> this_cpu_inc f.e. generates a single x86 instruction for an increment.
> 
This code is for !SMP.
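
I.e. roughly the usual split, as a sketch:

	#ifdef CONFIG_SMP
	void __counter_array_add(struct counter_array *ca, int idx,
				 int val, int batch);
	#else
	/* UP: only one cpu, a plain add on the global counter is enough */
	static inline void
	counter_array_add(struct counter_array *ca, int idx, int val)
	{
		ca->counters[idx] += val;
	}
	#endif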


> > +void __counter_array_add(struct counter_array *ca, int idx, int val, int batch)
> > +{
> > +	long count, *pcount;
> > +
> > +	preempt_disable();
> > +
> > +	pcount = this_cpu_ptr(ca->v.array);
> > +	count = pcount[idx] + val;
> > +	if (!ca->v.nosync && ((count > batch) || (count < -batch))) {
> > +		atomic_long_add(count, &ca->counters[idx]);
> > +		pcount[idx] = 0;
> > +	} else
> > +		pcount[idx] = count;
> > +	preempt_enable();
> > +}
> 
> Too expensive to use in critical VM paths. The percpu operations generate
> a single instruction instead of the code above. No need for preempt etc.
> 
Hmm, ok. I'll have to look at your patches some more.
I wonder how to use an indexed array with ops like add_return..


Thanks,
-Kame




^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [RFC][PATCH] lib: generic percpu counter array
  2009-11-05 15:20         ` Christoph Lameter
@ 2009-11-06  0:56           ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  0:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Thu, 5 Nov 2009 10:20:18 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> > Another major percpu counter is vm_stat[]. This patch implements
> > a vm_stat[]-style counter array in lib/percpu_counter.c.
> > This is designed for introducing a vm_stat[]-style counter to memcg,
> > but it may be useful for other people. By using this, a per-cpu
> > counter array can be implemented easily in a compact structure.
> 
> 
> Note that vm_stat support was written that way because we have extreme
> space constraints due to the need to keep statistics per zone and per
> cpu, and to avoid the cache line pressure that would result from the
> use of big integer arrays per zone and per cpu. For a large number of
> zones and cpus this is disastrous.
> 
> If you only need to keep statistics per cpu for an entity then the vmstat
> approach is overkill. A per cpu allocation of a counter is enough.
> 
A counter per memcg is required.
Memcg uses its own implementation but I want to remove it (it doesn't
consider memory placement).
What I can use under lib/ is percpu_counter, but it's really overkill.

My concern with a pure percpu counter is the "read" side.
Now we read the counters only via the status file, and sometimes vmscan
reads them. For supporting dirty_ratio, we will need to read them more
often. I'll check whether I can move to a pure percpu counter, as you
do for the mm counters, and see how the read side is affected by
for_each_possible_cpu(). Anyway, it's better than the current one.
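
The read side would then be a walk like this (sketch; counters is an
assumed long __percpu * array as in your suggestion):

	long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(counters, cpu)[idx];

That is O(nr_possible_cpus) per read -- cheap enough for a status file,
but worth measuring if dirty_ratio makes reads frequent.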

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 42+ messages in thread

* [PATCH] show per-process swap usage via procfs v2
  2009-11-05 23:48     ` KAMEZAWA Hiroyuki
@ 2009-11-06  4:40       ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  4:40 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, linux-kernel, linux-mm, hugh.dickins, cl, akpm,
	kosaki.motohiro

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, anon_rss and file_rss are counted as RSS and exported via /proc.
RSS usage is important information, but one more piece of information
users often ask for is "usage of swap" (our user support team says).

This patch counts swap entry usage per process and shows it via
/proc/<pid>/status. I think the status file is robust against a new
entry, so it is the first candidate..

 After this, /proc/<pid>/status includes following line
 <snip>
 VmPeak:   315360 kB
 VmSize:   315360 kB
 VmLck:         0 kB
 VmHWM:    180452 kB
 VmRSS:    180452 kB
 VmData:   311624 kB
 VmStk:        84 kB
 VmExe:         4 kB
 VmLib:      1568 kB
 VmPTE:       640 kB
 VmSwap:   131240 kB <=== new information

Note:
  Because this patch catches swap ptes in the page table, it will
  not catch shmem's swapout. That is already accounted in the per-shmem
  inode, and we don't need to do more.

Changelog: 2009/11/06
 - fixed bad use of is_migration_entry. Now, non_swap_entry() is used.
Changelog: 2009/11/03
 - clean up.
 - fixed initialization bug at fork (init_mm())

Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 fs/proc/task_mmu.c       |    9 ++++++---
 include/linux/mm_types.h |    1 +
 kernel/fork.c            |    1 +
 mm/memory.c              |   30 +++++++++++++++++++++---------
 mm/rmap.c                |    1 +
 mm/swapfile.c            |    1 +
 6 files changed, 31 insertions(+), 12 deletions(-)

Index: mmotm-2.6.32-Nov2/include/linux/mm_types.h
===================================================================
--- mmotm-2.6.32-Nov2.orig/include/linux/mm_types.h
+++ mmotm-2.6.32-Nov2/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
 	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
+	mm_counter_t _swap_usage;
 
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: mmotm-2.6.32-Nov2/mm/memory.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memory.c
+++ mmotm-2.6.32-Nov2/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }
 
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
 {
 	if (file_rss)
 		add_mm_counter(mm, file_rss, file_rss);
 	if (anon_rss)
 		add_mm_counter(mm, anon_rss, anon_rss);
+	if (swap_usage)
+		add_mm_counter(mm, swap_usage, swap_usage);
 }
 
 /*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			if (is_write_migration_entry(entry) &&
+			if (!non_swap_entry(entry))
+				rss[2]++;
+			else if (is_write_migration_entry(entry) &&
 					is_cow_mapping(vm_flags)) {
 				/*
 				 * COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];
+	int rss[3];
 	swp_entry_t entry = (swp_entry_t){0};
 
 again:
-	rss[1] = rss[0] = 0;
+	rss[2] = rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -688,7 +693,7 @@ again:
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(orig_src_pte);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
+	add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 
@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
 	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
+	int swap_usage = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
 		if (pte_file(ptent)) {
 			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
 				print_bad_pte(vma, addr, ptent, NULL);
-		} else if
-		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
-			print_bad_pte(vma, addr, ptent, NULL);
+		} else {
+			swp_entry_t ent = pte_to_swp_entry(ptent);
+
+			if (!non_swap_entry(ent))
+				swap_usage--;
+			if (unlikely(!free_swap_and_cache(ent)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
-	add_mm_rss(mm, file_rss, anon_rss);
+	add_mm_rss(mm, file_rss, anon_rss, swap_usage);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
 	 */
 
 	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mmotm-2.6.32-Nov2/mm/swapfile.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/swapfile.c
+++ mmotm-2.6.32-Nov2/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
 	}
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
+	dec_mm_counter(vma->vm_mm, swap_usage);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/fs/proc/task_mmu.c
+++ mmotm-2.6.32-Nov2/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
 	unsigned long data, text, lib;
-	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	swap = get_mm_counter(mm, swap_usage);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
-		"VmPTE:\t%8lu kB\n",
+		"VmPTE:\t%8lu kB\n"
+		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		swap << (PAGE_SHIFT - 10));
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
Index: mmotm-2.6.32-Nov2/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/rmap.c
+++ mmotm-2.6.32-Nov2/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page 
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
Index: mmotm-2.6.32-Nov2/kernel/fork.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/kernel/fork.c
+++ mmotm-2.6.32-Nov2/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
+	set_mm_counter(mm, swap_usage, 0);
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;



^ permalink raw reply	[flat|nested] 42+ messages in thread

-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		swap << (PAGE_SHIFT - 10));
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
Index: mmotm-2.6.32-Nov2/mm/rmap.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/rmap.c
+++ mmotm-2.6.32-Nov2/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page 
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
Index: mmotm-2.6.32-Nov2/kernel/fork.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/kernel/fork.c
+++ mmotm-2.6.32-Nov2/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
+	set_mm_counter(mm, swap_usage, 0);
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;



^ permalink raw reply	[flat|nested] 42+ messages in thread
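
A minimal userspace sketch of reading the new field (assuming a kernel
with this patch applied; the parsing below is naive and only
illustrative, it is not part of the patch set):

#include <stdio.h>
#include <string.h>

/* Print this process's VmSwap line from /proc/self/status.
 * Assumes a kernel that exports VmSwap, i.e. one with this patch. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "VmSwap:", 7) == 0)
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}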

* Re: [PATCH] show per-process swap usage via procfs v2
  2009-11-06  4:40       ` KAMEZAWA Hiroyuki
@ 2009-11-06 15:19         ` Minchan Kim
  -1 siblings, 0 replies; 42+ messages in thread
From: Minchan Kim @ 2009-11-06 15:19 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, linux-mm, hugh.dickins, cl, akpm, kosaki.motohiro

On Fri, Nov 6, 2009 at 1:40 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@jp.fujitsu.com> wrote:
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
>
> Now, anon_rss and file_rss are counted as RSS and exported via /proc.
> RSS usage is important information, but one more piece of information
> often asked for by users is "usage of swap" (according to our user
> support team).
>
> This patch counts swap entry usage per process and shows it via
> /proc/<pid>/status. I think the status file is robust against new
> entries, so it is the first candidate.
>
>  After this, /proc/<pid>/status includes following line
>  <snip>
>  VmPeak:   315360 kB
>  VmSize:   315360 kB
>  VmLck:         0 kB
>  VmHWM:    180452 kB
>  VmRSS:    180452 kB
>  VmData:   311624 kB
>  VmStk:        84 kB
>  VmExe:         4 kB
>  VmLib:      1568 kB
>  VmPTE:       640 kB
>  VmSwap:   131240 kB <=== new information
>
> Note:
>  Because this patch counts swap ptes in the page table, it will
>  not catch shmem's swapout. That is already accounted in the
>  per-shmem inode, so we don't need to do more.
>
> Changelog: 2009/11/06
>  - fixed bad use of is_migration_entry(); non_swap_entry() is used now.
> Changelog: 2009/11/03
>  - clean up.
>  - fixed initialization bug at fork (mm_init())
>
> Acked-by: David Rientjes <rientjes@google.com>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>

-- 
Kind regards,
Minchan Kim

^ permalink raw reply	[flat|nested] 42+ messages in thread


* Re: [PATCH] show per-process swap usage via procfs
  2009-11-05 15:04       ` Christoph Lameter
@ 2009-11-08 17:04         ` Pavel Machek
  -1 siblings, 0 replies; 42+ messages in thread
From: Pavel Machek @ 2009-11-08 17:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KOSAKI Motohiro, KAMEZAWA Hiroyuki, linux-kernel, linux-mm,
	hugh.dickins, akpm

On Thu 2009-11-05 10:04:01, Christoph Lameter wrote:
> On Thu, 5 Nov 2009, KOSAKI Motohiro wrote:
> 
> > > On Wed, 4 Nov 2009, KAMEZAWA Hiroyuki wrote:
> > >
> > > > Now, anon_rss and file_rss is counted as RSS and exported via /proc.
> > > > RSS usage is important information but one more information which
> > > > is often asked by users is "usage of swap".(user support team said.)
> > >
> > > Hmmm... Could we do some rework of the counters first so that they are per
> > > cpu?
> >
> > per-cpu swap counter?
> > It seems overkill effort....
> 
> The other alternative is to use atomic ops which are significantly slower
> and have an impact on critical sections.

...but compared to disk i/o, overhead should be almost zero, right?
Keep it simple...

								Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

^ permalink raw reply	[flat|nested] 42+ messages in thread
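
As a sketch of the per-cpu direction suggested above (illustration
only; this demo module and its names are hypothetical, not part of the
patch), the kernel's existing percpu_counter keeps writers on per-cpu
slots and makes readers pay for an approximate, batched sum:

#include <linux/module.h>
#include <linux/percpu_counter.h>

/* Hypothetical demo: a percpu_counter in place of a shared atomic
 * counter. Writers mostly touch a per-cpu slot; the read side folds
 * the per-cpu deltas into an approximate total. */
static struct percpu_counter demo_swap_usage;

static int __init demo_init(void)
{
	int err = percpu_counter_init(&demo_swap_usage, 0);

	if (err)
		return err;
	percpu_counter_inc(&demo_swap_usage);	/* cheap, no shared atomic */
	percpu_counter_dec(&demo_swap_usage);
	pr_info("demo swap_usage: %lld\n",
		percpu_counter_read_positive(&demo_swap_usage));
	return 0;
}

static void __exit demo_exit(void)
{
	percpu_counter_destroy(&demo_swap_usage);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");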


* [PATCH] show per-process swap usage via procfs v3
  2009-11-06 15:19         ` Minchan Kim
@ 2009-11-11  2:25           ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-11  2:25 UTC (permalink / raw)
  To: Minchan Kim
  Cc: linux-kernel, linux-mm, hugh.dickins, cl, akpm, kosaki.motohiro

Updated Documentation/filesystems/proc.txt

==
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

Now, anon_rss and file_rss are counted as RSS and exported via /proc.
RSS usage is important information, but one more piece of information
often asked for by users is "usage of swap" (according to our user
support team).

This patch counts swap entry usage per process and shows it via
/proc/<pid>/status. I think the status file is robust against new
entries, so it is the first candidate.

 After this, /proc/<pid>/status includes following line
 <snip>
 VmPeak:   315360 kB
 VmSize:   315360 kB
 VmLck:         0 kB
 VmHWM:    180452 kB
 VmRSS:    180452 kB
 VmData:   311624 kB
 VmStk:        84 kB
 VmExe:         4 kB
 VmLib:      1568 kB
 VmPTE:       640 kB
 VmSwap:   131240 kB <=== new information

Note:
  Because this patch counts swap ptes in the page table, it will
  not catch shmem's swapout. That is already accounted in the
  per-shmem inode, so we don't need to do more.

Changelog: 2009/11/11
 - added an update for Documentation/filesystems/proc.txt
Changelog: 2009/11/06
 - fixed bad use of is_migration_entry(); non_swap_entry() is used now.
Changelog: 2009/11/03
 - clean up.
 - fixed initialization bug at fork (mm_init())

Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 Documentation/filesystems/proc.txt |    2 ++
 fs/proc/task_mmu.c                 |    9 ++++++---
 include/linux/mm_types.h           |    1 +
 kernel/fork.c                      |    1 +
 mm/memory.c                        |   30 +++++++++++++++++++++---------
 mm/rmap.c                          |    1 +
 mm/swapfile.c                      |    1 +
 7 files changed, 33 insertions(+), 12 deletions(-)

Index: mm-test-kernel/include/linux/mm_types.h
===================================================================
--- mm-test-kernel.orig/include/linux/mm_types.h
+++ mm-test-kernel/include/linux/mm_types.h
@@ -228,6 +228,7 @@ struct mm_struct {
 	 */
 	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
+	mm_counter_t _swap_usage;
 
 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: mm-test-kernel/mm/memory.c
===================================================================
--- mm-test-kernel.orig/mm/memory.c
+++ mm-test-kernel/mm/memory.c
@@ -376,12 +376,15 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }
 
-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
+static inline void
+add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss, int swap_usage)
 {
 	if (file_rss)
 		add_mm_counter(mm, file_rss, file_rss);
 	if (anon_rss)
 		add_mm_counter(mm, anon_rss, anon_rss);
+	if (swap_usage)
+		add_mm_counter(mm, swap_usage, swap_usage);
 }
 
 /*
@@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
 						 &src_mm->mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			if (is_write_migration_entry(entry) &&
+			if (!non_swap_entry(entry))
+				rss[2]++;
+			else if (is_write_migration_entry(entry) &&
 					is_cow_mapping(vm_flags)) {
 				/*
 				 * COW mappings require pages in both parent
@@ -648,11 +653,11 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];
+	int rss[3];
 	swp_entry_t entry = (swp_entry_t){0};
 
 again:
-	rss[1] = rss[0] = 0;
+	rss[2] = rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -688,7 +693,7 @@ again:
 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(orig_src_pte);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
+	add_mm_rss(dst_mm, rss[0], rss[1], rss[2]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 
@@ -818,6 +823,7 @@ static unsigned long zap_pte_range(struc
 	spinlock_t *ptl;
 	int file_rss = 0;
 	int anon_rss = 0;
+	int swap_usage = 0;
 
 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -887,13 +893,18 @@ static unsigned long zap_pte_range(struc
 		if (pte_file(ptent)) {
 			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
 				print_bad_pte(vma, addr, ptent, NULL);
-		} else if
-		  (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
-			print_bad_pte(vma, addr, ptent, NULL);
+		} else {
+			swp_entry_t ent = pte_to_swp_entry(ptent);
+
+			if (!non_swap_entry(ent))
+				swap_usage--;
+			if (unlikely(!free_swap_and_cache(ent)))
+				print_bad_pte(vma, addr, ptent, NULL);
+		}
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
 
-	add_mm_rss(mm, file_rss, anon_rss);
+	add_mm_rss(mm, file_rss, anon_rss, swap_usage);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);
 
@@ -2595,6 +2606,7 @@ static int do_swap_page(struct mm_struct
 	 */
 
 	inc_mm_counter(mm, anon_rss);
+	dec_mm_counter(mm, swap_usage);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
Index: mm-test-kernel/mm/swapfile.c
===================================================================
--- mm-test-kernel.orig/mm/swapfile.c
+++ mm-test-kernel/mm/swapfile.c
@@ -837,6 +837,7 @@ static int unuse_pte(struct vm_area_stru
 	}
 
 	inc_mm_counter(vma->vm_mm, anon_rss);
+	dec_mm_counter(vma->vm_mm, swap_usage);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: mm-test-kernel/fs/proc/task_mmu.c
===================================================================
--- mm-test-kernel.orig/fs/proc/task_mmu.c
+++ mm-test-kernel/fs/proc/task_mmu.c
@@ -17,7 +17,7 @@
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
 	unsigned long data, text, lib;
-	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss, swap;
 
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
+	swap = get_mm_counter(mm, swap_usage);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
-		"VmPTE:\t%8lu kB\n",
+		"VmPTE:\t%8lu kB\n"
+		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
@@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
+		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+		swap << (PAGE_SHIFT - 10));
 }
 
 unsigned long task_vsize(struct mm_struct *mm)
Index: mm-test-kernel/mm/rmap.c
===================================================================
--- mm-test-kernel.orig/mm/rmap.c
+++ mm-test-kernel/mm/rmap.c
@@ -834,6 +834,7 @@ static int try_to_unmap_one(struct page 
 				spin_unlock(&mmlist_lock);
 			}
 			dec_mm_counter(mm, anon_rss);
+			inc_mm_counter(mm, swap_usage);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
Index: mm-test-kernel/kernel/fork.c
===================================================================
--- mm-test-kernel.orig/kernel/fork.c
+++ mm-test-kernel/kernel/fork.c
@@ -454,6 +454,7 @@ static struct mm_struct * mm_init(struct
 	mm->nr_ptes = 0;
 	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
+	set_mm_counter(mm, swap_usage, 0);
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
Index: mm-test-kernel/Documentation/filesystems/proc.txt
===================================================================
--- mm-test-kernel.orig/Documentation/filesystems/proc.txt
+++ mm-test-kernel/Documentation/filesystems/proc.txt
@@ -163,6 +163,7 @@ read the file /proc/PID/status:
   VmExe:        68 kB
   VmLib:      1412 kB
   VmPTE:        20 kb
+  VmSwap:        0 kb
   Threads:        1
   SigQ:   0/28578
   SigPnd: 0000000000000000
@@ -213,6 +214,7 @@ Table 1-2: Contents of the statm files (
  VmExe                       size of text segment
  VmLib                       size of shared library code
  VmPTE                       size of page table entries
+ VmSwap                      size of swapped-out private RSS
  Threads                     number of threads
  SigQ                        number of signals queued/max. number for queue
  SigPnd                      bitmap of pending signals for the thread


^ permalink raw reply	[flat|nested] 42+ messages in thread
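
For reference, the non_swap_entry() test the changelog mentions
separates true swap entries from the special (migration, hwpoison)
entries by swap type; roughly, in include/linux/swapops.h of this era
it reduces to:

static inline int non_swap_entry(swp_entry_t entry)
{
	/* Special entries (migration, hwpoison) use types at or beyond
	 * MAX_SWAPFILES; real swap entries stay below it. */
	return swp_type(entry) >= MAX_SWAPFILES;
}

So the rss[2]++ and swap_usage-- updates in the patch fire only for
real swap entries, never for migration entries.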


* Re: [PATCH] show per-process swap usage via procfs v3
  2009-11-11  2:25           ` KAMEZAWA Hiroyuki
@ 2009-11-12 15:20             ` Christoph Lameter
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Lameter @ 2009-11-12 15:20 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Minchan Kim, linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Wed, 11 Nov 2009, KAMEZAWA Hiroyuki wrote:

>
> Index: mm-test-kernel/include/linux/mm_types.h
> ===================================================================
> --- mm-test-kernel.orig/include/linux/mm_types.h
> +++ mm-test-kernel/include/linux/mm_types.h
> @@ -228,6 +228,7 @@ struct mm_struct {
>  	 */
>  	mm_counter_t _file_rss;
>  	mm_counter_t _anon_rss;
> +	mm_counter_t _swap_usage;

This is going to be another hit on vm performance if we go down
this road.

At least put

#ifdef CONFIG_SWAP ?

around this so that we can switch it off?

> @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
>  						 &src_mm->mmlist);
>  				spin_unlock(&mmlist_lock);
>  			}
> -			if (is_write_migration_entry(entry) &&
> +			if (!non_swap_entry(entry))
> +				rss[2]++;
> +			else if (is_write_migration_entry(entry) &&
>  					is_cow_mapping(vm_flags)) {
>  				/*

What are the implications for fork performance?

^ permalink raw reply	[flat|nested] 42+ messages in thread
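
One shape the CONFIG_SWAP suggestion could take (a sketch only; the
mm_swap_usage_* wrapper names are hypothetical and not in the patch):

#ifdef CONFIG_SWAP
#define mm_swap_usage_inc(mm)	inc_mm_counter(mm, swap_usage)
#define mm_swap_usage_dec(mm)	dec_mm_counter(mm, swap_usage)
#else
/* !CONFIG_SWAP: no counter field, the calls compile away */
#define mm_swap_usage_inc(mm)	do { } while (0)
#define mm_swap_usage_dec(mm)	do { } while (0)
#endif

The _swap_usage field in mm_struct would then be wrapped in the same
#ifdef so that swapless configurations pay nothing.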


* Re: [PATCH] show per-process swap usage via procfs v3
  2009-11-12 15:20             ` Christoph Lameter
@ 2009-11-13  1:51               ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-13  1:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Minchan Kim, linux-kernel, linux-mm, hugh.dickins, akpm, kosaki.motohiro

On Thu, 12 Nov 2009 10:20:29 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Wed, 11 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> >
> > Index: mm-test-kernel/include/linux/mm_types.h
> > ===================================================================
> > --- mm-test-kernel.orig/include/linux/mm_types.h
> > +++ mm-test-kernel/include/linux/mm_types.h
> > @@ -228,6 +228,7 @@ struct mm_struct {
> >  	 */
> >  	mm_counter_t _file_rss;
> >  	mm_counter_t _anon_rss;
> > +	mm_counter_t _swap_usage;
> 
> This is going to be another hit on vm performance if we go down this
> road.
> 
> At least put
> 
> #ifdef CONFIG_SWAP ?
> 
> around this so that we can switch it off?
> 
Hmm, okay. But I'm not sure I can do it in a clean way.
(Or I'll wait for your mm_counters updates, or do it by myself.)

> > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> >  						 &src_mm->mmlist);
> >  				spin_unlock(&mmlist_lock);
> >  			}
> > -			if (is_write_migration_entry(entry) &&
> > +			if (!non_swap_entry(entry))
> > +				rss[2]++;
> > +			else if (is_write_migration_entry(entry) &&
> >  					is_cow_mapping(vm_flags)) {
> >  				/*
> 
> What are the implications for fork performance?

This path is executed when the page table entry is
  !pte_none() && !pte_present().

There is not a very big chance of reaching here (this path is under
unlikely()).

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 42+ messages in thread
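
A crude way to check the fork-performance question empirically (a
userspace sketch only; nothing was measured this way in this thread):

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

/* Time N fork+_exit+wait cycles; compare kernels with and without
 * the patch. Crude: ignores child-side cost beyond _exit(). */
int main(void)
{
	enum { N = 10000 };
	struct timespec t0, t1;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < N; i++) {
		pid_t pid = fork();

		if (pid == 0)
			_exit(0);
		if (pid < 0) {
			perror("fork");
			return 1;
		}
		waitpid(pid, NULL, 0);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("%.1f us per fork\n",
	       ((t1.tv_sec - t0.tv_sec) * 1e9 +
		(t1.tv_nsec - t0.tv_nsec)) / 1e3 / N);
	return 0;
}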


* Re: [PATCH] show per-process swap usage via procfs v3
  2009-11-13  1:51               ` KAMEZAWA Hiroyuki
@ 2009-11-13  2:35                 ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 42+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-13  2:35 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, Minchan Kim, linux-kernel, linux-mm,
	hugh.dickins, akpm, kosaki.motohiro

On Fri, 13 Nov 2009 10:51:12 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> > > @@ -597,7 +600,9 @@ copy_one_pte(struct mm_struct *dst_mm, s
> > >  						 &src_mm->mmlist);
> > >  				spin_unlock(&mmlist_lock);
> > >  			}
> > > -			if (is_write_migration_entry(entry) &&
> > > +			if (!non_swap_entry(entry))
> > > +				rss[2]++;
> > > +			else if (is_write_migration_entry(entry) &&
> > >  					is_cow_mapping(vm_flags)) {
> > >  				/*
> > 
> > What are the implications for fork performance?
> 
> This path is executed when the page table entry is
>   !pte_none() && !pte_present().
> 
> There is not a very big chance of reaching here (this path is under
> unlikely()).
> 

[before]
   text    data      bss      dec     hex filename
6649003 3221828 10232816 20103647 132c1df vmlinux
[after]
   text    data      bss      dec     hex filename
6649243 3221828 10232816 20103887 132c2cf vmlinux

So the patch adds 240 bytes of text size... Hmm.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 42+ messages in thread


end of thread, other threads:[~2009-11-13  2:37 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-04  6:24 [PATCH] show per-process swap usage via procfs KAMEZAWA Hiroyuki
2009-11-04  6:24 ` KAMEZAWA Hiroyuki
2009-11-04 19:15 ` Christoph Lameter
2009-11-04 19:15   ` Christoph Lameter
2009-11-04 23:25   ` KOSAKI Motohiro
2009-11-04 23:25     ` KOSAKI Motohiro
2009-11-05  2:28     ` KAMEZAWA Hiroyuki
2009-11-05  2:28       ` KAMEZAWA Hiroyuki
2009-11-05 15:04     ` Christoph Lameter
2009-11-05 15:04       ` Christoph Lameter
2009-11-08 17:04       ` Pavel Machek
2009-11-08 17:04         ` Pavel Machek
2009-11-05  0:06   ` KAMEZAWA Hiroyuki
2009-11-05  0:06     ` KAMEZAWA Hiroyuki
2009-11-05  5:16     ` [RFC][PATCH] lib: generic percpu counter array KAMEZAWA Hiroyuki
2009-11-05  5:16       ` KAMEZAWA Hiroyuki
2009-11-05 15:15       ` Christoph Lameter
2009-11-05 15:15         ` Christoph Lameter
2009-11-06  0:49         ` KAMEZAWA Hiroyuki
2009-11-06  0:49           ` KAMEZAWA Hiroyuki
2009-11-05 15:20       ` Christoph Lameter
2009-11-05 15:20         ` Christoph Lameter
2009-11-06  0:56         ` KAMEZAWA Hiroyuki
2009-11-06  0:56           ` KAMEZAWA Hiroyuki
2009-11-05 14:41 ` [PATCH] show per-process swap usage via procfs KOSAKI Motohiro
2009-11-05 14:41   ` KOSAKI Motohiro
2009-11-05 15:11 ` Minchan Kim
2009-11-05 15:11   ` Minchan Kim
2009-11-05 23:48   ` KAMEZAWA Hiroyuki
2009-11-05 23:48     ` KAMEZAWA Hiroyuki
2009-11-06  4:40     ` [PATCH] show per-process swap usage via procfs v2 KAMEZAWA Hiroyuki
2009-11-06  4:40       ` KAMEZAWA Hiroyuki
2009-11-06 15:19       ` Minchan Kim
2009-11-06 15:19         ` Minchan Kim
2009-11-11  2:25         ` [PATCH] show per-process swap usage via procfs v3 KAMEZAWA Hiroyuki
2009-11-11  2:25           ` KAMEZAWA Hiroyuki
2009-11-12 15:20           ` Christoph Lameter
2009-11-12 15:20             ` Christoph Lameter
2009-11-13  1:51             ` KAMEZAWA Hiroyuki
2009-11-13  1:51               ` KAMEZAWA Hiroyuki
2009-11-13  2:35               ` KAMEZAWA Hiroyuki
2009-11-13  2:35                 ` KAMEZAWA Hiroyuki
