* [MM] Make mm counters per cpu instead of atomic
@ 2009-11-04 19:14 ` Christoph Lameter
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-04 19:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Make mm counters per cpu

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in the percpu tree and in -next).

With that, the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems no longer
need to perform atomic updates to mm counters. Various code paths can be
simplified since per cpu counter updates are fast and batching of counter
updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.
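
For reference, a minimal sketch of the percpu counter pattern the patch relies
on (not an excerpt from the patch; helper names are made up for illustration,
it assumes the generic this_cpu operations from the percpu tree, and that
callers of the __this_cpu_* forms run with preemption already disabled, e.g.
under the page table lock as in the patch):

#include <linux/percpu.h>

struct mm_counter {
	long file;
	long anon;
};

/* One private copy of the counters per possible cpu. */
static struct mm_counter *rss;

static int counters_init(void)
{
	rss = alloc_percpu(struct mm_counter);	/* zeroed on every cpu */
	return rss ? 0 : -ENOMEM;
}

/* Fast path: bump only this cpu's copy. No atomics, no shared cacheline. */
static void account_anon_page(void)
{
	__this_cpu_inc(rss->anon);
}

/* Slow path: fold all per-cpu copies. This is the scan mentioned above. */
static unsigned long counters_read(void)
{
	unsigned long total = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mm_counter *c = per_cpu_ptr(rss, cpu);

		total += c->file + c->anon;
	}
	return total;
}

static void counters_destroy(void)
{
	free_percpu(rss);
}

Updates stay O(1) and contention free; only readers such as get_mm_rss()
below pay the O(number of possible cpus) fold.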

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/proc/task_mmu.c       |   14 +++++++++-
 include/linux/mm_types.h |   16 ++++--------
 include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
 kernel/fork.c            |   25 ++++++++++++++-----
 mm/filemap_xip.c         |    2 -
 mm/fremap.c              |    2 -
 mm/init-mm.c             |    3 ++
 mm/memory.c              |   20 +++++++--------
 mm/rmap.c                |   10 +++----
 mm/swapfile.c            |    2 -
 10 files changed, 84 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;

 #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else  /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+	long file;
+	long anon;
+};

 /*
  * Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */

-	/* Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	mm_counter_t _file_rss;
-	mm_counter_t _anon_rss;
+	/* Special percpu counters */
+	struct mm_counter *rss;

 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/sched.h	2009-11-04 13:13:42.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else  /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm)					\
-	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm)	do {			\
-	unsigned long _rss = get_mm_rss(mm);		\
-	if ((mm)->hiwater_rss < _rss)			\
-		(mm)->hiwater_rss = _rss;		\
-} while (0)
-#define update_hiwater_vm(mm)	do {			\
-	if ((mm)->hiwater_vm < (mm)->total_vm)		\
-		(mm)->hiwater_vm = (mm)->total_vm;	\
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+	int cpu;
+	unsigned long r = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		r += c->file + c->anon;
+	}
+
+	return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+	unsigned long _rss = get_mm_rss(mm);
+	if (mm->hiwater_rss < _rss)
+		mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+	if (mm->hiwater_vm < mm->total_vm)
+		mm->hiwater_vm = mm->total_vm;
+}

 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
@@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct

 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
+	int cpu;
+
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
@@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, file_rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *m;
+
+		memset(m, sizeof(struct mm_counter), 0);
+	}
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
@@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
 	mm = allocate_mm();
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm, current);
+		mm->rss = alloc_percpu(struct mm_counter);
+		if (mm->rss)
+			mm = mm_init(mm, current);
+		else {
+			free_mm(mm);
+			mm = NULL;
+		}
 	}
 	return mm;
 }
@@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+	free_percpu(mm->rss);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
 		goto fail_nomem;

 	memcpy(mm, oldmm, sizeof(*mm));
+	mm->rss = alloc_percpu(struct mm_counter);
+	if (!mm->rss)
+		goto fail_nomem;

 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -661,15 +676,13 @@ free_pt:
 	mm->binfmt = NULL;
 	mmput(mm);

-fail_nomem:
-	return NULL;
-
 fail_nocontext:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
 	mm_free_pgd(mm);
+fail_nomem:
 	free_mm(mm);
 	return NULL;
 }
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c	2009-11-04 13:13:42.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, file_rss);
+	int cpu;
+	int anon_rss = 0;
+	int file_rss = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		anon_rss += c->anon;
+		file_rss += c->file;
+	}
+	*shared = file_rss;
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, anon_rss);
+	*resident = *shared + anon_rss;
 	return mm->total_vm;
 }

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c	2009-11-04 13:13:42.000000000 -0600
@@ -194,7 +194,7 @@ retry:
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/fremap.c	2009-11-04 13:13:42.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		}
 	} else {
 		if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/memory.c	2009-11-04 13:13:42.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 {
 	if (file_rss)
-		add_mm_counter(mm, file_rss, file_rss);
+		__this_cpu_add(mm->rss->file, file_rss);
 	if (anon_rss)
-		add_mm_counter(mm, anon_rss, anon_rss);
+		__this_cpu_add(mm->rss->anon, anon_rss);
 }

 /*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st

 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, file_rss);
+	__this_cpu_inc(mm->rss->file);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2148,11 +2148,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, file_rss);
-				inc_mm_counter(mm, anon_rss);
+				__this_cpu_dec(mm->rss->file);
+				__this_cpu_inc(mm->rss->anon);
 			}
 		} else
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
 	 * discarded at swap_free().
 	 */

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
 	if (!pte_none(*page_table))
 		goto release;

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter(mm, file_rss);
+			__this_cpu_inc(mm->rss->file);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/rmap.c	2009-11-04 13:13:42.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page

 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 		if (PageAnon(page))
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		else
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		set_pte_at(mm, address, pte,
 				swp_entry_to_pte(make_hwpoison_entry(page)));
 	} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
 		entry = make_migration_entry(page, pte_write(pteval));
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);


 	page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned

 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/swapfile.c	2009-11-04 13:13:42.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
 		goto out;
 	}

-	inc_mm_counter(vma->vm_mm, anon_rss);
+	__this_cpu_inc(vma->vm_mm->rss->anon);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/init-mm.c	2009-11-04 13:13:42.000000000 -0600
@@ -8,6 +8,8 @@
 #include <asm/atomic.h>
 #include <asm/pgtable.h>

+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
+	.rss		= &init_mm_counters,
 };


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-04 19:17   ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-04 19:17 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Remove rss batching from copy_page_range()

With per cpu counters in mm there is no need for batching
mm counter updates anymore. Update counters directly while
copying pages.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/memory.c |   27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2009-11-04 12:15:03.000000000 -0600
+++ linux-2.6/mm/memory.c	2009-11-04 13:03:45.000000000 -0600
@@ -376,14 +376,6 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }

-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
-{
-	if (file_rss)
-		__this_cpu_add(mm->rss->file, file_rss);
-	if (anon_rss)
-		__this_cpu_add(mm->rss->anon, anon_rss);
-}
-
 /*
  * This function is called to print an error when a bad pte
  * is found. For example, we might have a PFN-mapped pte in
@@ -575,7 +567,7 @@ out:
 static inline void
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		unsigned long addr)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
@@ -630,7 +622,10 @@ copy_one_pte(struct mm_struct *dst_mm, s
 	if (page) {
 		get_page(page);
 		page_dup_rmap(page);
-		rss[PageAnon(page)]++;
+		if (PageAnon(page))
+			__this_cpu_inc(dst_mm->rss->anon);
+		else
+			__this_cpu_inc(dst_mm->rss->file);
 	}

 out_set_pte:
@@ -645,10 +640,8 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];

 again:
-	rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -674,14 +667,13 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr);
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(orig_src_pte);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 	if (addr != end)
@@ -803,8 +795,6 @@ static unsigned long zap_pte_range(struc
 	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
 	spinlock_t *ptl;
-	int file_rss = 0;
-	int anon_rss = 0;

 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -850,14 +840,14 @@ static unsigned long zap_pte_range(struc
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
-				anon_rss--;
+				__this_cpu_dec(mm->rss->anon);
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
-				file_rss--;
+				__this_cpu_dec(mm->rss->file);
 			}
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
@@ -880,7 +870,6 @@ static unsigned long zap_pte_range(struc
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

-	add_mm_rss(mm, file_rss, anon_rss);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-04 21:01   ` Andi Kleen
  -1 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2009-11-04 21:01 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter <cl@linux-foundation.org> writes:
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.

Do you have numbers on how costly alloc_percpu() is? I wonder what this
does to fork() overhead.

-Andi
 

-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 19:17   ` Christoph Lameter
@ 2009-11-04 21:02     ` Andi Kleen
  -1 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2009-11-04 21:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter <cl@linux-foundation.org> writes:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Remove rss batching from copy_page_range()
>
> With per cpu counters in mm there is no need for batching
> mm counter updates anymore. Update counters directly while
> copying pages.

Hmm, but with all the inlining, with some luck the local
counters will be kept in registers. That will never be the case
with the per cpu counters.

So I'm not sure it's really an improvement?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 21:02     ` Andi Kleen
@ 2009-11-04 22:02       ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-04 22:02 UTC (permalink / raw)
  To: Andi Kleen
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009, Andi Kleen wrote:

> > With per cpu counters in mm there is no need for batching
> > mm counter updates anymore. Update counters directly while
> > copying pages.
>
> Hmm, but with all the inlining with some luck the local
> counters will be in registers. That will never be the case
> with the per cpu counters.

The function is too big for that to occur and the counters have to be
preserved across function calls. The code is shorter with the patch
applied:

christoph@:~/n/linux-2.6$ size mm/memory.o
   text	   data	    bss	    dec	    hex	filename
  20140	     56	     40	  20236	   4f0c	mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch mmcounter
patching file include/linux/mm_types.h
patching file include/linux/sched.h
patching file kernel/fork.c
patching file fs/proc/task_mmu.c
patching file mm/filemap_xip.c
patching file mm/fremap.c
patching file mm/memory.c
patching file mm/rmap.c
patching file mm/swapfile.c
patching file mm/init-mm.c

Now at patch mmcounter
christoph@:~/n/linux-2.6$ make mm/memory.o
  CHK     include/linux/version.h
  CHK     include/linux/utsrelease.h
  UPD     include/linux/utsrelease.h
  SYMLINK include/asm -> include/asm-x86
  CC      arch/x86/kernel/asm-offsets.s
  GEN     include/asm/asm-offsets.h
  CALL    scripts/checksyscalls.sh
  CC      mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
   text	   data	    bss	    dec	    hex	filename
  20028	     56	     40	  20124	   4e9c	mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch simplify
patching file mm/memory.c

Now at patch simplify
christoph@:~/n/linux-2.6$ make mm/memory.o
  CHK     include/linux/version.h
  CHK     include/linux/utsrelease.h
  SYMLINK include/asm -> include/asm-x86
  CALL    scripts/checksyscalls.sh
  CC      mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
   text	   data	    bss	    dec	    hex	filename
  19888	     56	     40	  19984	   4e10	mm/memory.o




* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-04 23:49   ` Dave Jones
  -1 siblings, 0 replies; 66+ messages in thread
From: Dave Jones @ 2009-11-04 23:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:
 
 > +		memset(m, sizeof(struct mm_counter), 0);

Args wrong way around.

	Dave


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-05  1:16   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05  1:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009 14:14:41 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 

Hmm, I don't fully understand _new_ percpu but...
Logically (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
(BIGSMP).

Then, if 65536 processes run, this consumes

65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

But x86's vmalloc area just has 80? MBytes. I (and my customers) don't have
this kind of extreme machine, but cpus tend to be many-core (and still support
32bit mode) now.

With 32 or 64 cpus,
65536 * 8 * 32 = 16MB
65536 * 8 * 64 = 32MB

And if I add swap_usage,
65536 * 12 * 32 = 24MB.

It's influenced by the number of devices attached to the system, but
people will see more -ENOMEM.

It seems this consumption/footprint is very big.

Thanks,
-Kame





> ---
>  fs/proc/task_mmu.c       |   14 +++++++++-
>  include/linux/mm_types.h |   16 ++++--------
>  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
>  kernel/fork.c            |   25 ++++++++++++++-----
>  mm/filemap_xip.c         |    2 -
>  mm/fremap.c              |    2 -
>  mm/init-mm.c             |    3 ++
>  mm/memory.c              |   20 +++++++--------
>  mm/rmap.c                |   10 +++----
>  mm/swapfile.c            |    2 -
>  10 files changed, 84 insertions(+), 71 deletions(-)
> 
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;
> 
>  #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> 
> -#if USE_SPLIT_PTLOCKS
> -typedef atomic_long_t mm_counter_t;
> -#else  /* !USE_SPLIT_PTLOCKS */
> -typedef unsigned long mm_counter_t;
> -#endif /* !USE_SPLIT_PTLOCKS */
> +struct mm_counter {
> +	long file;
> +	long anon;
> +};
> 
>  /*
>   * Each physical page in the system has a struct page associated with
> @@ -223,11 +222,8 @@ struct mm_struct {
>  						 * by mmlist_lock
>  						 */
> 
> -	/* Special counters, in some configurations protected by the
> -	 * page_table_lock, in other configurations by being atomic.
> -	 */
> -	mm_counter_t _file_rss;
> -	mm_counter_t _anon_rss;
> +	/* Special percpu counters */
> +	struct mm_counter *rss;
> 
>  	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
>  	unsigned long hiwater_vm;	/* High-water virtual memory usage */
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/sched.h	2009-11-04 13:13:42.000000000 -0600
> @@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
>  extern void arch_unmap_area(struct mm_struct *, unsigned long);
>  extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
> 
> -#if USE_SPLIT_PTLOCKS
> -/*
> - * The mm counters are not protected by its page_table_lock,
> - * so must be incremented atomically.
> - */
> -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
> -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
> -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
> -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
> -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
> -
> -#else  /* !USE_SPLIT_PTLOCKS */
> -/*
> - * The mm counters are protected by its page_table_lock,
> - * so can be incremented directly.
> - */
> -#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
> -#define get_mm_counter(mm, member) ((mm)->_##member)
> -#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
> -#define inc_mm_counter(mm, member) (mm)->_##member++
> -#define dec_mm_counter(mm, member) (mm)->_##member--
> -
> -#endif /* !USE_SPLIT_PTLOCKS */
> -
> -#define get_mm_rss(mm)					\
> -	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
> -#define update_hiwater_rss(mm)	do {			\
> -	unsigned long _rss = get_mm_rss(mm);		\
> -	if ((mm)->hiwater_rss < _rss)			\
> -		(mm)->hiwater_rss = _rss;		\
> -} while (0)
> -#define update_hiwater_vm(mm)	do {			\
> -	if ((mm)->hiwater_vm < (mm)->total_vm)		\
> -		(mm)->hiwater_vm = (mm)->total_vm;	\
> -} while (0)
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r += c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +
> +static inline void update_hiwater_vm(struct mm_struct *mm)
> +{
> +	if (mm->hiwater_vm < mm->total_vm)
> +		mm->hiwater_vm = mm->total_vm;
> +}
> 
>  static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
>  {
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> 
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
> +	}
>  	spin_lock_init(&mm->page_table_lock);
>  	mm->free_area_cache = TASK_UNMAPPED_BASE;
>  	mm->cached_hole_size = ~0UL;
> @@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
>  	mm = allocate_mm();
>  	if (mm) {
>  		memset(mm, 0, sizeof(*mm));
> -		mm = mm_init(mm, current);
> +		mm->rss = alloc_percpu(struct mm_counter);
> +		if (mm->rss)
> +			mm = mm_init(mm, current);
> +		else {
> +			free_mm(mm);
> +			mm = NULL;
> +		}
>  	}
>  	return mm;
>  }
> @@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
>  	mm_free_pgd(mm);
>  	destroy_context(mm);
>  	mmu_notifier_mm_destroy(mm);
> +	free_percpu(mm->rss);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
>  		goto fail_nomem;
> 
>  	memcpy(mm, oldmm, sizeof(*mm));
> +	mm->rss = alloc_percpu(struct mm_counter);
> +	if (!mm->rss)
> +		goto fail_nomem;
> 
>  	/* Initializing for Swap token stuff */
>  	mm->token_priority = 0;
> @@ -661,15 +676,13 @@ free_pt:
>  	mm->binfmt = NULL;
>  	mmput(mm);
> 
> -fail_nomem:
> -	return NULL;
> -
>  fail_nocontext:
>  	/*
>  	 * If init_new_context() failed, we cannot use mmput() to free the mm
>  	 * because it calls destroy_context()
>  	 */
>  	mm_free_pgd(mm);
> +fail_nomem:
>  	free_mm(mm);
>  	return NULL;
>  }
> Index: linux-2.6/fs/proc/task_mmu.c
> ===================================================================
> --- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/fs/proc/task_mmu.c	2009-11-04 13:13:42.000000000 -0600
> @@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
>  int task_statm(struct mm_struct *mm, int *shared, int *text,
>  	       int *data, int *resident)
>  {
> -	*shared = get_mm_counter(mm, file_rss);
> +	int cpu;
> +	int anon_rss = 0;
> +	int file_rss = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		anon_rss += c->anon;
> +		file_rss += c->file;
> +	}
> +	*shared = file_rss;
>  	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>  								>> PAGE_SHIFT;
>  	*data = mm->total_vm - mm->shared_vm;
> -	*resident = *shared + get_mm_counter(mm, anon_rss);
> +	*resident = *shared + anon_rss;
>  	return mm->total_vm;
>  }
> 
> Index: linux-2.6/mm/filemap_xip.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap_xip.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/filemap_xip.c	2009-11-04 13:13:42.000000000 -0600
> @@ -194,7 +194,7 @@ retry:
>  			flush_cache_page(vma, address, pte_pfn(*pte));
>  			pteval = ptep_clear_flush_notify(vma, address, pte);
>  			page_remove_rmap(page);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  			BUG_ON(pte_dirty(pteval));
>  			pte_unmap_unlock(pte, ptl);
>  			page_cache_release(page);
> Index: linux-2.6/mm/fremap.c
> ===================================================================
> --- linux-2.6.orig/mm/fremap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/fremap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
>  			page_remove_rmap(page);
>  			page_cache_release(page);
>  			update_hiwater_rss(mm);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		}
>  	} else {
>  		if (!pte_file(pte))
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/memory.c	2009-11-04 13:13:42.000000000 -0600
> @@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
>  static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
>  {
>  	if (file_rss)
> -		add_mm_counter(mm, file_rss, file_rss);
> +		__this_cpu_add(mm->rss->file, file_rss);
>  	if (anon_rss)
> -		add_mm_counter(mm, anon_rss, anon_rss);
> +		__this_cpu_add(mm->rss->anon, anon_rss);
>  }
> 
>  /*
> @@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st
> 
>  	/* Ok, finally just insert the thing.. */
>  	get_page(page);
> -	inc_mm_counter(mm, file_rss);
> +	__this_cpu_inc(mm->rss->file);
>  	page_add_file_rmap(page);
>  	set_pte_at(mm, addr, pte, mk_pte(page, prot));
> 
> @@ -2148,11 +2148,11 @@ gotten:
>  	if (likely(pte_same(*page_table, orig_pte))) {
>  		if (old_page) {
>  			if (!PageAnon(old_page)) {
> -				dec_mm_counter(mm, file_rss);
> -				inc_mm_counter(mm, anon_rss);
> +				__this_cpu_dec(mm->rss->file);
> +				__this_cpu_inc(mm->rss->anon);
>  			}
>  		} else
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  		flush_cache_page(vma, address, pte_pfn(orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> @@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
>  	 * discarded at swap_free().
>  	 */
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	pte = mk_pte(page, vma->vm_page_prot);
>  	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
>  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> @@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
>  	if (!pte_none(*page_table))
>  		goto release;
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	page_add_new_anon_rmap(page, vma, address);
>  setpte:
>  	set_pte_at(mm, address, page_table, entry);
> @@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
>  		if (flags & FAULT_FLAG_WRITE)
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		if (anon) {
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  			page_add_new_anon_rmap(page, vma, address);
>  		} else {
> -			inc_mm_counter(mm, file_rss);
> +			__this_cpu_inc(mm->rss->file);
>  			page_add_file_rmap(page);
>  			if (flags & FAULT_FLAG_WRITE) {
>  				dirty_page = page;
> Index: linux-2.6/mm/rmap.c
> ===================================================================
> --- linux-2.6.orig/mm/rmap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/rmap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page
> 
>  	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
>  		if (PageAnon(page))
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		else
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		set_pte_at(mm, address, pte,
>  				swp_entry_to_pte(make_hwpoison_entry(page)));
>  	} else if (PageAnon(page)) {
> @@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
>  					list_add(&mm->mmlist, &init_mm.mmlist);
>  				spin_unlock(&mmlist_lock);
>  			}
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		} else if (PAGE_MIGRATION) {
>  			/*
>  			 * Store the pfn of the page in a special migration
> @@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
>  		entry = make_migration_entry(page, pte_write(pteval));
>  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
>  	} else
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
> 
> 
>  	page_remove_rmap(page);
> @@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned
> 
>  		page_remove_rmap(page);
>  		page_cache_release(page);
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
>  		(*mapcount)--;
>  	}
>  	pte_unmap_unlock(pte - 1, ptl);
> Index: linux-2.6/mm/swapfile.c
> ===================================================================
> --- linux-2.6.orig/mm/swapfile.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/swapfile.c	2009-11-04 13:13:42.000000000 -0600
> @@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
>  		goto out;
>  	}
> 
> -	inc_mm_counter(vma->vm_mm, anon_rss);
> +	__this_cpu_inc(vma->vm_mm->rss->anon);
>  	get_page(page);
>  	set_pte_at(vma->vm_mm, addr, pte,
>  		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: linux-2.6/mm/init-mm.c
> ===================================================================
> --- linux-2.6.orig/mm/init-mm.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/init-mm.c	2009-11-04 13:13:42.000000000 -0600
> @@ -8,6 +8,8 @@
>  #include <asm/atomic.h>
>  #include <asm/pgtable.h>
> 
> +DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
> +
>  struct mm_struct init_mm = {
>  	.mm_rb		= RB_ROOT,
>  	.pgd		= swapper_pg_dir,
> @@ -17,4 +19,5 @@ struct mm_struct init_mm = {
>  	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.cpu_vm_mask	= CPU_MASK_ALL,
> +	.rss		= &init_mm_counters,
>  };
> 



* Re: [MM] Make mm counters per cpu instead of atomic
@ 2009-11-05  1:16   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05  1:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009 14:14:41 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 

Hmm, I don't fully understand _new_ percpu but...
In theory (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
with BIGSMP.

Then, if 65536 processes run, this consumes

65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

But x86's vmalloc area only has 80? MBytes. I (and my customers) don't have
this kind of extreme machine, but cpus tend to be many-core (and still support
32bit mode) now.

With 32 or 64 cpus,
65536 * 8 * 32 = 16MB
65536 * 8 * 64 = 32MB

And if I add swap_usage,
65536 * 12 * 32 = 24MB.

It's influenced by the number of devices attached to the system, but
people will see more -ENOMEM.

It seems this consumption/footprint is very big.
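
As a back-of-the-envelope check of that estimate, a standalone userspace
sketch (the inputs are the assumed worst-case numbers from above):

	#include <stdio.h>

	int main(void)
	{
		long nr_procs = 65536;
		long size = 8;	/* sizeof(struct mm_counter): two longs on 32bit */
		long cpus = 512;

		/* 65536 * 8 * 512 = 268435456 bytes */
		printf("%ld MB\n", (nr_procs * size * cpus) >> 20);
		return 0;
	}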

Thanks,
-Kame





> ---
>  fs/proc/task_mmu.c       |   14 +++++++++-
>  include/linux/mm_types.h |   16 ++++--------
>  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
>  kernel/fork.c            |   25 ++++++++++++++-----
>  mm/filemap_xip.c         |    2 -
>  mm/fremap.c              |    2 -
>  mm/init-mm.c             |    3 ++
>  mm/memory.c              |   20 +++++++--------
>  mm/rmap.c                |   10 +++----
>  mm/swapfile.c            |    2 -
>  10 files changed, 84 insertions(+), 71 deletions(-)
> 
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;
> 
>  #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> 
> -#if USE_SPLIT_PTLOCKS
> -typedef atomic_long_t mm_counter_t;
> -#else  /* !USE_SPLIT_PTLOCKS */
> -typedef unsigned long mm_counter_t;
> -#endif /* !USE_SPLIT_PTLOCKS */
> +struct mm_counter {
> +	long file;
> +	long anon;
> +};
> 
>  /*
>   * Each physical page in the system has a struct page associated with
> @@ -223,11 +222,8 @@ struct mm_struct {
>  						 * by mmlist_lock
>  						 */
> 
> -	/* Special counters, in some configurations protected by the
> -	 * page_table_lock, in other configurations by being atomic.
> -	 */
> -	mm_counter_t _file_rss;
> -	mm_counter_t _anon_rss;
> +	/* Special percpu counters */
> +	struct mm_counter *rss;
> 
>  	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
>  	unsigned long hiwater_vm;	/* High-water virtual memory usage */
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/sched.h	2009-11-04 13:13:42.000000000 -0600
> @@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
>  extern void arch_unmap_area(struct mm_struct *, unsigned long);
>  extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
> 
> -#if USE_SPLIT_PTLOCKS
> -/*
> - * The mm counters are not protected by its page_table_lock,
> - * so must be incremented atomically.
> - */
> -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
> -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
> -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
> -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
> -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
> -
> -#else  /* !USE_SPLIT_PTLOCKS */
> -/*
> - * The mm counters are protected by its page_table_lock,
> - * so can be incremented directly.
> - */
> -#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
> -#define get_mm_counter(mm, member) ((mm)->_##member)
> -#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
> -#define inc_mm_counter(mm, member) (mm)->_##member++
> -#define dec_mm_counter(mm, member) (mm)->_##member--
> -
> -#endif /* !USE_SPLIT_PTLOCKS */
> -
> -#define get_mm_rss(mm)					\
> -	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
> -#define update_hiwater_rss(mm)	do {			\
> -	unsigned long _rss = get_mm_rss(mm);		\
> -	if ((mm)->hiwater_rss < _rss)			\
> -		(mm)->hiwater_rss = _rss;		\
> -} while (0)
> -#define update_hiwater_vm(mm)	do {			\
> -	if ((mm)->hiwater_vm < (mm)->total_vm)		\
> -		(mm)->hiwater_vm = (mm)->total_vm;	\
> -} while (0)
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r += c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +
> +static inline void update_hiwater_vm(struct mm_struct *mm)
> +{
> +	if (mm->hiwater_vm < mm->total_vm)
> +		mm->hiwater_vm = mm->total_vm;
> +}
> 
>  static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
>  {
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> 
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
> +	}
>  	spin_lock_init(&mm->page_table_lock);
>  	mm->free_area_cache = TASK_UNMAPPED_BASE;
>  	mm->cached_hole_size = ~0UL;
> @@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
>  	mm = allocate_mm();
>  	if (mm) {
>  		memset(mm, 0, sizeof(*mm));
> -		mm = mm_init(mm, current);
> +		mm->rss = alloc_percpu(struct mm_counter);
> +		if (mm->rss)
> +			mm = mm_init(mm, current);
> +		else {
> +			free_mm(mm);
> +			mm = NULL;
> +		}
>  	}
>  	return mm;
>  }
> @@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
>  	mm_free_pgd(mm);
>  	destroy_context(mm);
>  	mmu_notifier_mm_destroy(mm);
> +	free_percpu(mm->rss);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
>  		goto fail_nomem;
> 
>  	memcpy(mm, oldmm, sizeof(*mm));
> +	mm->rss = alloc_percpu(struct mm_counter);
> +	if (!mm->rss)
> +		goto fail_nomem;
> 
>  	/* Initializing for Swap token stuff */
>  	mm->token_priority = 0;
> @@ -661,15 +676,13 @@ free_pt:
>  	mm->binfmt = NULL;
>  	mmput(mm);
> 
> -fail_nomem:
> -	return NULL;
> -
>  fail_nocontext:
>  	/*
>  	 * If init_new_context() failed, we cannot use mmput() to free the mm
>  	 * because it calls destroy_context()
>  	 */
>  	mm_free_pgd(mm);
> +fail_nomem:
>  	free_mm(mm);
>  	return NULL;
>  }
> Index: linux-2.6/fs/proc/task_mmu.c
> ===================================================================
> --- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/fs/proc/task_mmu.c	2009-11-04 13:13:42.000000000 -0600
> @@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
>  int task_statm(struct mm_struct *mm, int *shared, int *text,
>  	       int *data, int *resident)
>  {
> -	*shared = get_mm_counter(mm, file_rss);
> +	int cpu;
> +	int anon_rss = 0;
> +	int file_rss = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		anon_rss += c->anon;
> +		file_rss += c->file;
> +	}
> +	*shared = file_rss;
>  	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>  								>> PAGE_SHIFT;
>  	*data = mm->total_vm - mm->shared_vm;
> -	*resident = *shared + get_mm_counter(mm, anon_rss);
> +	*resident = *shared + anon_rss;
>  	return mm->total_vm;
>  }
> 
> Index: linux-2.6/mm/filemap_xip.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap_xip.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/filemap_xip.c	2009-11-04 13:13:42.000000000 -0600
> @@ -194,7 +194,7 @@ retry:
>  			flush_cache_page(vma, address, pte_pfn(*pte));
>  			pteval = ptep_clear_flush_notify(vma, address, pte);
>  			page_remove_rmap(page);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  			BUG_ON(pte_dirty(pteval));
>  			pte_unmap_unlock(pte, ptl);
>  			page_cache_release(page);
> Index: linux-2.6/mm/fremap.c
> ===================================================================
> --- linux-2.6.orig/mm/fremap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/fremap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
>  			page_remove_rmap(page);
>  			page_cache_release(page);
>  			update_hiwater_rss(mm);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		}
>  	} else {
>  		if (!pte_file(pte))
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/memory.c	2009-11-04 13:13:42.000000000 -0600
> @@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
>  static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
>  {
>  	if (file_rss)
> -		add_mm_counter(mm, file_rss, file_rss);
> +		__this_cpu_add(mm->rss->file, file_rss);
>  	if (anon_rss)
> -		add_mm_counter(mm, anon_rss, anon_rss);
> +		__this_cpu_add(mm->rss->anon, anon_rss);
>  }
> 
>  /*
> @@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st
> 
>  	/* Ok, finally just insert the thing.. */
>  	get_page(page);
> -	inc_mm_counter(mm, file_rss);
> +	__this_cpu_inc(mm->rss->file);
>  	page_add_file_rmap(page);
>  	set_pte_at(mm, addr, pte, mk_pte(page, prot));
> 
> @@ -2148,11 +2148,11 @@ gotten:
>  	if (likely(pte_same(*page_table, orig_pte))) {
>  		if (old_page) {
>  			if (!PageAnon(old_page)) {
> -				dec_mm_counter(mm, file_rss);
> -				inc_mm_counter(mm, anon_rss);
> +				__this_cpu_dec(mm->rss->file);
> +				__this_cpu_inc(mm->rss->anon);
>  			}
>  		} else
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  		flush_cache_page(vma, address, pte_pfn(orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> @@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
>  	 * discarded at swap_free().
>  	 */
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	pte = mk_pte(page, vma->vm_page_prot);
>  	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
>  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> @@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
>  	if (!pte_none(*page_table))
>  		goto release;
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	page_add_new_anon_rmap(page, vma, address);
>  setpte:
>  	set_pte_at(mm, address, page_table, entry);
> @@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
>  		if (flags & FAULT_FLAG_WRITE)
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		if (anon) {
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  			page_add_new_anon_rmap(page, vma, address);
>  		} else {
> -			inc_mm_counter(mm, file_rss);
> +			__this_cpu_inc(mm->rss->file);
>  			page_add_file_rmap(page);
>  			if (flags & FAULT_FLAG_WRITE) {
>  				dirty_page = page;
> Index: linux-2.6/mm/rmap.c
> ===================================================================
> --- linux-2.6.orig/mm/rmap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/rmap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page
> 
>  	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
>  		if (PageAnon(page))
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		else
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		set_pte_at(mm, address, pte,
>  				swp_entry_to_pte(make_hwpoison_entry(page)));
>  	} else if (PageAnon(page)) {
> @@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
>  					list_add(&mm->mmlist, &init_mm.mmlist);
>  				spin_unlock(&mmlist_lock);
>  			}
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		} else if (PAGE_MIGRATION) {
>  			/*
>  			 * Store the pfn of the page in a special migration
> @@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
>  		entry = make_migration_entry(page, pte_write(pteval));
>  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
>  	} else
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
> 
> 
>  	page_remove_rmap(page);
> @@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned
> 
>  		page_remove_rmap(page);
>  		page_cache_release(page);
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
>  		(*mapcount)--;
>  	}
>  	pte_unmap_unlock(pte - 1, ptl);
> Index: linux-2.6/mm/swapfile.c
> ===================================================================
> --- linux-2.6.orig/mm/swapfile.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/swapfile.c	2009-11-04 13:13:42.000000000 -0600
> @@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
>  		goto out;
>  	}
> 
> -	inc_mm_counter(vma->vm_mm, anon_rss);
> +	__this_cpu_inc(vma->vm_mm->rss->anon);
>  	get_page(page);
>  	set_pte_at(vma->vm_mm, addr, pte,
>  		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: linux-2.6/mm/init-mm.c
> ===================================================================
> --- linux-2.6.orig/mm/init-mm.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/init-mm.c	2009-11-04 13:13:42.000000000 -0600
> @@ -8,6 +8,8 @@
>  #include <asm/atomic.h>
>  #include <asm/pgtable.h>
> 
> +DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
> +
>  struct mm_struct init_mm = {
>  	.mm_rb		= RB_ROOT,
>  	.pgd		= swapper_pg_dir,
> @@ -17,4 +19,5 @@ struct mm_struct init_mm = {
>  	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.cpu_vm_mask	= CPU_MASK_ALL,
> +	.rss		= &init_mm_counters,
>  };
> 


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 22:02       ` Christoph Lameter
@ 2009-11-05  8:27         ` Andi Kleen
  -1 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2009-11-05  8:27 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

On Wed, Nov 04, 2009 at 05:02:12PM -0500, Christoph Lameter wrote:
> On Wed, 4 Nov 2009, Andi Kleen wrote:
> 
> > > With per cpu counters in mm there is no need for batching
> > > mm counter updates anymore. Update counters directly while
> > > copying pages.
> >
> > Hmm, but with all the inlining with some luck the local
> > counters will be in registers. That will never be the case
> > with the per cpu counters.
> 
> The function is too big for that to occur and the counters have to be

If it's only called once then gcc doesn't care about size.

> preserved across function calls. The code is shorter with the patch
> applied:

I see. Thanks for the data.

-Andi


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 23:49   ` Dave Jones
@ 2009-11-05 15:04     ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:04 UTC (permalink / raw)
  To: Dave Jones
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009, Dave Jones wrote:

> On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:
>
>  > +		memset(m, sizeof(struct mm_counter), 0);
>
> Args wrong way around.

Right. It works because alloc_percpu zeroes the data anyway.
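
For reference, memset() takes the fill byte before the length, and that
loop also never initialized its pointer; a corrected sketch of the (now
dropped) initialization would have looked like:

	for_each_possible_cpu(cpu) {
		struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

		/* memset(ptr, fill byte, length) */
		memset(m, 0, sizeof(struct mm_counter));
	}

Moot in V2, which simply relies on alloc_percpu() handing back zeroed memory.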



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-05  1:16   ` KAMEZAWA Hiroyuki
@ 2009-11-05 15:10     ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:10 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Hmm, I don't fully understand _new_ percpu but...
> In theory (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
> with BIGSMP.

x86-32 only supports 32 processors. Plus per cpu areas are only allocated
for the possible processors.

> > Then, if 65536 processes run, this consumes
>
> 65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

With 32 possible cpus this results in 16MB of per-cpu space use.



* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:04     ` Christoph Lameter
@ 2009-11-05 15:36       ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:36 UTC (permalink / raw)
  To: Dave Jones
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Make mm counters per cpu V2

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in percpu and -next).

With that the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
need to perform atomic updates to mm counters anymore. Various code paths
can be simplified since per cpu counter updates are fast and batching
of counter updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.
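
Concretely, each counter update turns from an atomic rmw on a shared
cacheline into a plain add on this cpu's copy (a sketch; the __this_cpu
variants assume preemption is already off, which the pte lock guarantees
at these call sites):

	/* before: atomic rmw, cacheline bounces between cpus */
	atomic_long_inc(&mm->_anon_rss);

	/* after: local update, no lock prefix needed on x86 */
	__this_cpu_inc(mm->rss->anon);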

V1->V2
- Remove useless and buggy per cpu counter initialization.
  alloc_percpu already zeros the values.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/proc/task_mmu.c       |   14 +++++++++-
 include/linux/mm_types.h |   16 ++++--------
 include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
 kernel/fork.c            |   18 +++++++++----
 mm/filemap_xip.c         |    2 -
 mm/fremap.c              |    2 -
 mm/init-mm.c             |    3 ++
 mm/memory.c              |   20 +++++++--------
 mm/rmap.c                |   10 +++----
 mm/swapfile.c            |    2 -
 10 files changed, 77 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h	2009-11-05 09:22:37.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;

 #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else  /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+	long file;
+	long anon;
+};

 /*
  * Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */

-	/* Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	mm_counter_t _file_rss;
-	mm_counter_t _anon_rss;
+	/* Special percpu counters */
+	struct mm_counter *rss;

 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/sched.h	2009-11-05 09:22:37.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else  /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm)					\
-	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm)	do {			\
-	unsigned long _rss = get_mm_rss(mm);		\
-	if ((mm)->hiwater_rss < _rss)			\
-		(mm)->hiwater_rss = _rss;		\
-} while (0)
-#define update_hiwater_vm(mm)	do {			\
-	if ((mm)->hiwater_vm < (mm)->total_vm)		\
-		(mm)->hiwater_vm = (mm)->total_vm;	\
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+	int cpu;
+	unsigned long r = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		r += c->file + c->anon;
+	}
+
+	return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+	unsigned long _rss = get_mm_rss(mm);
+	if (mm->hiwater_rss < _rss)
+		mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+	if (mm->hiwater_vm < mm->total_vm)
+		mm->hiwater_vm = mm->total_vm;
+}

 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/kernel/fork.c	2009-11-05 09:25:30.000000000 -0600
@@ -452,8 +452,6 @@ static struct mm_struct * mm_init(struct
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, file_rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
@@ -480,7 +478,13 @@ struct mm_struct * mm_alloc(void)
 	mm = allocate_mm();
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm, current);
+		mm->rss = alloc_percpu(struct mm_counter);
+		if (mm->rss)
+			mm = mm_init(mm, current);
+		else {
+			free_mm(mm);
+			mm = NULL;
+		}
 	}
 	return mm;
 }
@@ -496,6 +500,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+	free_percpu(mm->rss);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +636,9 @@ struct mm_struct *dup_mm(struct task_str
 		goto fail_nomem;

 	memcpy(mm, oldmm, sizeof(*mm));
+	mm->rss = alloc_percpu(struct mm_counter);
+	if (!mm->rss)
+		goto fail_nomem;

 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -661,15 +669,13 @@ free_pt:
 	mm->binfmt = NULL;
 	mmput(mm);

-fail_nomem:
-	return NULL;
-
 fail_nocontext:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
 	mm_free_pgd(mm);
+fail_nomem:
 	free_mm(mm);
 	return NULL;
 }
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c	2009-11-05 09:22:37.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, file_rss);
+	int cpu;
+	int anon_rss = 0;
+	int file_rss = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		anon_rss += c->anon;
+		file_rss += c->file;
+	}
+	*shared = file_rss;
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, anon_rss);
+	*resident = *shared + anon_rss;
 	return mm->total_vm;
 }

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c	2009-11-05 09:22:37.000000000 -0600
@@ -194,7 +194,7 @@ retry:
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/fremap.c	2009-11-05 09:22:37.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		}
 	} else {
 		if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/memory.c	2009-11-05 09:22:37.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 {
 	if (file_rss)
-		add_mm_counter(mm, file_rss, file_rss);
+		__this_cpu_add(mm->rss->file, file_rss);
 	if (anon_rss)
-		add_mm_counter(mm, anon_rss, anon_rss);
+		__this_cpu_add(mm->rss->anon, anon_rss);
 }

 /*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st

 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, file_rss);
+	__this_cpu_inc(mm->rss->file);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2148,11 +2148,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, file_rss);
-				inc_mm_counter(mm, anon_rss);
+				__this_cpu_dec(mm->rss->file);
+				__this_cpu_inc(mm->rss->anon);
 			}
 		} else
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
 	 * discarded at swap_free().
 	 */

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
 	if (!pte_none(*page_table))
 		goto release;

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter(mm, file_rss);
+			__this_cpu_inc(mm->rss->file);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/rmap.c	2009-11-05 09:22:37.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page

 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 		if (PageAnon(page))
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		else
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		set_pte_at(mm, address, pte,
 				swp_entry_to_pte(make_hwpoison_entry(page)));
 	} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
 		entry = make_migration_entry(page, pte_write(pteval));
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);


 	page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned

 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/swapfile.c	2009-11-05 09:22:37.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
 		goto out;
 	}

-	inc_mm_counter(vma->vm_mm, anon_rss);
+	__this_cpu_inc(vma->vm_mm->rss->anon);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/init-mm.c	2009-11-05 09:22:37.000000000 -0600
@@ -8,6 +8,8 @@
 #include <asm/atomic.h>
 #include <asm/pgtable.h>

+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
+	.rss		= &init_mm_counters,
 };


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-05 15:10     ` Christoph Lameter
@ 2009-11-05 23:42       ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05 23:42 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:10:56 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> > Hmm, I don't fully understand _new_ percpu but...
> > In theory (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
> > with BIGSMP.
> 
> x86-32 only supports 32 processors. Plus per cpu areas are only allocated
> for the possible processors.
> 
My number is just from Kconfig.

> > Then, if 65536 processes run, this consumes
> >
> > 65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.
> 
> With 32 possible cpus this results in 16MB of per-cpu space use.
> 
If swap_usage is added, 24MB, 25% of the vmalloc area.
(But, yes, returning -ENOMEM from fork() is ok with me; 65536 procs is extreme.)

Thanks,
-Kame



* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36       ` Christoph Lameter
@ 2009-11-06  1:11         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  1:11 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu V2
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> V1->V2
> - Remove useless and buggy per cpu counter initialization.
>   alloc_percpu already zeros the values.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 
Thanks. My one small concern is the read side.

This is the result of 'top -b -n 1' with 2000 processes (most of them just sleeping)
on my 8-cpu SMP box.

== [Before]
 Performance counter stats for 'top -b -n 1' (5 runs):

     406.690304  task-clock-msecs         #      0.442 CPUs    ( +-   3.327% )
             32  context-switches         #      0.000 M/sec   ( +-   0.000% )
              0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
            718  page-faults              #      0.002 M/sec   ( +-   0.000% )
      987832447  cycles                   #   2428.955 M/sec   ( +-   2.655% )
      933831356  instructions             #      0.945 IPC     ( +-   2.585% )
       17383990  cache-references         #     42.745 M/sec   ( +-   1.676% )
         353620  cache-misses             #      0.870 M/sec   ( +-   0.614% )

    0.920712639  seconds time elapsed   ( +-   1.609% )

== [After]
 Performance counter stats for 'top -b -n 1' (5 runs):

     675.926348  task-clock-msecs         #      0.568 CPUs    ( +-   0.601% )
             62  context-switches         #      0.000 M/sec   ( +-   1.587% )
              0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
           1095  page-faults              #      0.002 M/sec   ( +-   0.000% )
     1896320818  cycles                   #   2805.514 M/sec   ( +-   1.494% )
     1790600289  instructions             #      0.944 IPC     ( +-   1.333% )
       35406398  cache-references         #     52.382 M/sec   ( +-   0.876% )
         722781  cache-misses             #      1.069 M/sec   ( +-   0.192% )

    1.190605561  seconds time elapsed   ( +-   0.417% )

Because I know 'ps'-related workloads are used in various ways, "how this will
behave on large SMP systems" is my concern.

Maybe the usual use of 'ps -elf' does not read the RSS value and is not affected by this.
If this counter supported a single-thread mode (most apps are single threaded),
the impact would not be big.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06  1:11         ` KAMEZAWA Hiroyuki
@ 2009-11-06  3:23           ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  3:23 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> This is the result of 'top -b -n 1' with 2000 processes(most of them just sleep)
> on my 8cpu, SMP box.
> 
> == [Before]
>  Performance counter stats for 'top -b -n 1' (5 runs):
> 
>      406.690304  task-clock-msecs         #      0.442 CPUs    ( +-   3.327% )
>              32  context-switches         #      0.000 M/sec   ( +-   0.000% )
>               0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
>             718  page-faults              #      0.002 M/sec   ( +-   0.000% )
>       987832447  cycles                   #   2428.955 M/sec   ( +-   2.655% )
>       933831356  instructions             #      0.945 IPC     ( +-   2.585% )
>        17383990  cache-references         #     42.745 M/sec   ( +-   1.676% )
>          353620  cache-misses             #      0.870 M/sec   ( +-   0.614% )
> 
>     0.920712639  seconds time elapsed   ( +-   1.609% )
> 
> == [After]
>  Performance counter stats for 'top -b -n 1' (5 runs):
> 
>      675.926348  task-clock-msecs         #      0.568 CPUs    ( +-   0.601% )
>              62  context-switches         #      0.000 M/sec   ( +-   1.587% )
>               0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
>            1095  page-faults              #      0.002 M/sec   ( +-   0.000% )
>      1896320818  cycles                   #   2805.514 M/sec   ( +-   1.494% )
>      1790600289  instructions             #      0.944 IPC     ( +-   1.333% )
>        35406398  cache-references         #     52.382 M/sec   ( +-   0.876% )
>          722781  cache-misses             #      1.069 M/sec   ( +-   0.192% )
> 
>     1.190605561  seconds time elapsed   ( +-   0.417% )
> 
> Because I know 'ps' related workload is used in various ways, "How this will
> be in large smp" is my concern.
> 
> Maybe usual use of 'ps -elf' will not read RSS value and not affected by this.
> If this counter supports single-thread-mode (most of apps are single threaded),
> impact will not be big.
> 

I measured the extreme-case benefits with the attached program;
please see the # of page faults. Bigger is better.
Please let me know if my program is buggy.
Excuse:
My .config may not be tuned for an extreme performance challenge, and my host only has 8 cpus.
(memcg is enabled, hahaha...)

The # of page faults is not very stable (it is affected by task-clock-msecs),
but maybe we have some improvement.

I'd like to see the scores of "top" and this program on big servers...

BTW, can't we have a single-thread mode for this counter?
The read side of usual programs would get much benefit...


==[Before]==
 Performance counter stats for './multi-fault 8' (5 runs):

  474810.516710  task-clock-msecs         #      7.912 CPUs    ( +-   0.006% )
          10713  context-switches         #      0.000 M/sec   ( +-   2.529% )
              8  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
       16669105  page-faults              #      0.035 M/sec   ( +-   0.449% )
  1487101488902  cycles                   #   3131.989 M/sec   ( +-   0.012% )
   307164795479  instructions             #      0.207 IPC     ( +-   0.177% )
     2355518599  cache-references         #      4.961 M/sec   ( +-   0.420% )
      901969818  cache-misses             #      1.900 M/sec   ( +-   0.824% )

   60.008425257  seconds time elapsed   ( +-   0.004% )

==[After]==
 Performance counter stats for './multi-fault 8' (5 runs):

  474212.969563  task-clock-msecs         #      7.902 CPUs    ( +-   0.007% )
          10281  context-switches         #      0.000 M/sec   ( +-   0.156% )
              9  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
       16795696  page-faults              #      0.035 M/sec   ( +-   2.218% )
  1485411063159  cycles                   #   3132.371 M/sec   ( +-   0.014% )
   305810331186  instructions             #      0.206 IPC     ( +-   0.133% )
     2391293765  cache-references         #      5.043 M/sec   ( +-   0.737% )
      890490519  cache-misses             #      1.878 M/sec   ( +-   0.212% )

   60.010631769  seconds time elapsed   ( +-   0.004% )

Thanks,
-Kame

==

/*
 * multi-fault.c :: causes 60secs of parallel page faults in multiple threads.
 * % gcc -O2 -o multi-fault multi-fault.c -lpthread
 * % multi-fault <number of cpus>
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>	/* atoi */
#include <string.h>	/* strerror */
#include <unistd.h>	/* sleep */
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#define NR_THREADS	32
pthread_t threads[NR_THREADS];
/*
 * For avoiding contention in page table lock, FAULT area is
 * sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define MMAP_LENGTH	(8 * 1024 * 1024)
#define FAULT_LENGTH	(2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE	4096

pthread_barrier_t barrier;
int name[NR_THREADS];

void *worker(void *data)
{
	int cpu = *(int *)data;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);
	pthread_barrier_wait(&barrier);

	while (1) {
		char *c;
		char *start = mmap_area[cpu];
		char *end = mmap_area[cpu] + FAULT_LENGTH;

		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;

		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int i, num, ret;

	if (argc < 2)
		return 0;

	num = atoi(argv[1]);
	if (num > NR_THREADS)	/* stay within the fixed-size arrays */
		num = NR_THREADS;

	pthread_barrier_init(&barrier, NULL, num + 1);

	for (i = 0; i < num; i++) {
		name[i] = i;
		ret = pthread_create(&threads[i], NULL, worker, &name[i]);
		if (ret != 0) {	/* pthread_create returns an error number, not -1 */
			fprintf(stderr, "pthread create: %s\n", strerror(ret));
			return 0;
		}
		mmap_area[i] = mmap(NULL, MMAP_LENGTH,
				PROT_WRITE | PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	}
	pthread_barrier_wait(&barrier);	
	sleep(60);
	return 0;
}

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36       ` Christoph Lameter
@ 2009-11-06  4:08         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  4:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r = c->file + c->anon;

r += c->file + c->anon;
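
With that fix applied, the whole function reads (reassembled from the hunk
quoted above, for reference):

static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
	int cpu;
	unsigned long r = 0;

	for_each_possible_cpu(cpu) {
		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);

		r += c->file + c->anon;
	}

	return r;
}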

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36       ` Christoph Lameter
@ 2009-11-06  4:15         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  4:15 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r = c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +

I'm sorry that my replies are scattered.

Isn't it better to add some filter in the following path?

==
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                enum ttu_flags flags)
{
<snip>
       /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
==

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06  3:23           ` KAMEZAWA Hiroyuki
@ 2009-11-06 17:32             ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-06 17:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:

> BTW, can't we have single-thread-mode for this counter ?
> Usual program's read-side will get much benefit.....

Thanks for the measurements.

A single thread mode would be good. Ideas on how to add that would be
appreciated.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 17:32             ` Christoph Lameter
@ 2009-11-06 19:03               ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 19:03 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

Christoph Lameter wrote:
> On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> BTW, can't we have single-thread-mode for this counter ?
>> Usual program's read-side will get much benefit.....
>
> Thanks for the measurements.
>
> A single thread mode would be good. Ideas on how to add that would be
> appreciated.
>

There may be some ways. At first thought:
==
struct usage_counter {
    long rss;
    long file;
};


struct mm_struct {
    ....
    atomic_long_t  rss;   /* only updated when usage_counter is NULL */
    atomic_long_t  file;  /* only updated when usage_counter is NULL */
    struct usage_counter *usage;  /* percpu counter used when
                                     multi-threaded */
    .....
}

And allocate mm->usage only when the first CLONE_THREAD is specified.

if (mm->usage)
    access per cpu
else
    atomic_long_xxx

and read operation will be

    val = atomic_read(mm->rss);
    if (mm->usage)
        for_each_possible_cpu()....
==
Does the "if" seem too costly?

If this idea is bad, another idea is to move the mm_counter from mm_struct
to task_struct and do a slow sync instead of percpu.

for example

struct task_struct {
    ....
    mm_counter_t temp_counter;
    ....
};

struct mm_struct {
    .....
    atomic_long_t rss;
    atomic_long_t file;
};

And add temp_counter's value into mm_struct at some good point... before
sleep?
kswapd and the reclaim routine can update mm_struct's counters directly.
Readers just read mm_struct's counters.
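
For concreteness, the first idea could look roughly like this (untested,
and the helper names are illustrative):
==
/* update side: single-threaded mms keep using the atomic counter */
static inline void add_rss(struct mm_struct *mm, long value)
{
	if (mm->usage)
		this_cpu_add(mm->usage->rss, value);
	else
		atomic_long_add(value, &mm->rss);
}

/* read side: base value plus any per-cpu deltas */
static inline unsigned long get_rss(struct mm_struct *mm)
{
	unsigned long val = atomic_long_read(&mm->rss);
	int cpu;

	if (mm->usage)
		for_each_possible_cpu(cpu)
			val += per_cpu_ptr(mm->usage, cpu)->rss;
	return val;
}
==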

Thanks,
-Kame

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:03               ` KAMEZAWA Hiroyuki
@ 2009-11-06 19:13                 ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-06 19:13 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> And allocate mm->usage only when the first CLONE_THREAD is specified.

Ok.

> if (mm->usage)
>     access per cpu
> else
>     atomic_long_xxx

If we just have one thread: Do we need atomic access at all?

> and read operation will be
>
>     val = atomic_read(mm->rss);
>     if (mm->usage)
>         for_each_possible_cpu()....

or
   val = mm->rss
   for_each_cpu(cpu) val += percpu ...


> ==
> Does the "if" seem too costly?

The above method would avoid the if.

> If this idea is bad, another idea is to move the mm_counter from mm_struct
> to task_struct and do a slow sync instead of percpu.

Yeah, then the access is effectively percpu as long as preemption is disabled.

But then for the mmap_writer_lock we would need to traverse a doubly
linked list to add up the counters. Bad caching on that one and we would
have to lock the list too. Sigh.

> kswapd and reclaim routine can update mm_struct's counter, directly.
> Readers just read mm_struct's counter.

That would work for the rss counters, but not for avoiding the rw semaphore, I guess.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:13                 ` Christoph Lameter
@ 2009-11-06 19:20                   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 19:20 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

Christoph Lameter wrote:
> On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> And allocate mm->usage only when the first CLONE_THREAD is specified.
>
> Ok.
>
>> if (mm->usage)
>>     access per cpu
>> else
>>     atomic_long_xxx
>
> If we just have one thread: Do we need atomic access at all?
>
Unfortunately, kswapd/vmscan touch this.

Thanks,
-Kame



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:20                   ` KAMEZAWA Hiroyuki
@ 2009-11-06 19:47                     ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-06 19:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> > If we just have one thread: Do we need atomic access at all?
> >
> Unfortunately, kswapd/vmscan touch this.

Right. And those can also occur from another processor that the process
has never run on before. Argh.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06  1:11         ` KAMEZAWA Hiroyuki
@ 2009-11-10 22:44           ` Andrew Morton
  -1 siblings, 0 replies; 66+ messages in thread
From: Andrew Morton @ 2009-11-10 22:44 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, Tejun Heo

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
> Christoph Lameter <cl@linux-foundation.org> wrote:
> 
> > From: Christoph Lameter <cl@linux-foundation.org>
> > Subject: Make mm counters per cpu V2
> > 
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> > 
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> > 
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> > 
> > V1->V2
> > - Remove useless and buggy per cpu counter initialization.
> >   alloc_percpu already zeros the values.
> > 
> > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> > 
> Thanks. My small concern is read-side.

Me too.

For example, with 1000 possible CPUs (possible, not present and not
online), and 1000 processes, ps(1) will have to wallow through a
million cachelines in task_statm().

And then we have get_mm_rss(), which now will hit 1000 cachelines.  And
get_mm_rss() is called (via
account_user_time()->acct_update_integrals()) from the clock tick.

Adding a thousand cache misses to the timer interrupt is the sort of
thing which makes people unhappy?


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-10 22:44           ` Andrew Morton
@ 2009-11-10 23:20             ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-10 23:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, Tejun Heo

On Tue, 10 Nov 2009, Andrew Morton wrote:

> Adding a thousand cache misses to the timer interrupt is the sort of
> thing which makes people unhappy?

Obviously I was hoping for new ideas instead of just restatements of the
problem.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-17  6:48   ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-17  6:48 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 
> ---
>  fs/proc/task_mmu.c       |   14 +++++++++-
>  include/linux/mm_types.h |   16 ++++--------
>  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
>  kernel/fork.c            |   25 ++++++++++++++-----
>  mm/filemap_xip.c         |    2 -
>  mm/fremap.c              |    2 -
>  mm/init-mm.c             |    3 ++
>  mm/memory.c              |   20 +++++++--------
>  mm/rmap.c                |   10 +++----
>  mm/swapfile.c            |    2 -
>  10 files changed, 84 insertions(+), 71 deletions(-)
> 
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;

> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> 
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
The above memset is wrong:
1) m isn't initialized;
2) the 2nd and the 3rd parameters should be interchanged.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  6:48   ` Zhang, Yanmin
@ 2009-11-17  7:31     ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-17  7:31 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> > From: Christoph Lameter <cl@linux-foundation.org>
> > Subject: Make mm counters per cpu
> > 
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> > 
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> > 
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> > 
> > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> > 
> > ---
> >  fs/proc/task_mmu.c       |   14 +++++++++-
> >  include/linux/mm_types.h |   16 ++++--------
> >  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
> >  kernel/fork.c            |   25 ++++++++++++++-----
> >  mm/filemap_xip.c         |    2 -
> >  mm/fremap.c              |    2 -
> >  mm/init-mm.c             |    3 ++
> >  mm/memory.c              |   20 +++++++--------
> >  mm/rmap.c                |   10 +++----
> >  mm/swapfile.c            |    2 -
> >  10 files changed, 84 insertions(+), 71 deletions(-)
> > 
> > Index: linux-2.6/include/linux/mm_types.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> > +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> > @@ -24,11 +24,10 @@ struct address_space;
> 
> > Index: linux-2.6/kernel/fork.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> > +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> > @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> > 
> >  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> >  {
> > +	int cpu;
> > +
> >  	atomic_set(&mm->mm_users, 1);
> >  	atomic_set(&mm->mm_count, 1);
> >  	init_rwsem(&mm->mmap_sem);
> > @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> >  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> >  	mm->core_state = NULL;
> >  	mm->nr_ptes = 0;
> > -	set_mm_counter(mm, file_rss, 0);
> > -	set_mm_counter(mm, anon_rss, 0);
> > +	for_each_possible_cpu(cpu) {
> > +		struct mm_counter *m;
> > +
> > +		memset(m, sizeof(struct mm_counter), 0);
> The above memset is wrong:
> 1) m isn't initialized;
> 2) the 2nd and the 3rd parameters should be interchanged.
Changing it to the code below fixes the command hang issue.

        for_each_possible_cpu(cpu) {
                struct mm_counter *m = per_cpu(mm->rss->readers, cpu);

                memset(m, 0, sizeof(struct mm_counter));
        }
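
Note: with the V2 posting quoted earlier in this thread, this initialization
loop goes away entirely, since alloc_percpu() already returns zeroed memory,
leaving mm_init() with something like the sketch below (error handling
abbreviated and illustrative only):

	mm->rss = alloc_percpu(struct mm_counter);
	if (!mm->rss)
		goto fail;	/* mm_init() would unwind its earlier setup */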



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  7:31     ` Zhang, Yanmin
@ 2009-11-17  9:34       ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-17  9:34 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 15:31 +0800, Zhang, Yanmin wrote:
> On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> > On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> > > From: Christoph Lameter <cl@linux-foundation.org>
> > > Subject: Make mm counters per cpu
> > > 
> > > Changing the mm counters to per cpu counters is possible after the introduction
> > > of the generic per cpu operations (currently in percpu and -next).
> > > 
> > > With that the contention on the counters in mm_struct can be avoided. The
> > > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > > need to perform atomic updates to mm counters anymore. Various code paths
> > > can be simplified since per cpu counter updates are fast and batching
> > > of counter updates is no longer needed.
> > > 
> > > One price to pay for these improvements is the need to scan over all percpu
> > > counters when the actual count values are needed.
> > > 
> > > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> > > 
> > > ---
> > >  fs/proc/task_mmu.c       |   14 +++++++++-
> > >  include/linux/mm_types.h |   16 ++++--------
> > >  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
> > >  kernel/fork.c            |   25 ++++++++++++++-----
> > >  mm/filemap_xip.c         |    2 -
> > >  mm/fremap.c              |    2 -
> > >  mm/init-mm.c             |    3 ++
> > >  mm/memory.c              |   20 +++++++--------
> > >  mm/rmap.c                |   10 +++----
> > >  mm/swapfile.c            |    2 -
> > >  10 files changed, 84 insertions(+), 71 deletions(-)
> > > 
> > > Index: linux-2.6/include/linux/mm_types.h
> > > ===================================================================
> > > --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> > > +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> > > @@ -24,11 +24,10 @@ struct address_space;
> > 
> > > Index: linux-2.6/kernel/fork.c
> > > ===================================================================
> > > --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> > > +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> > > @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> > > 
> > >  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> > >  {
> > > +	int cpu;
> > > +
> > >  	atomic_set(&mm->mm_users, 1);
> > >  	atomic_set(&mm->mm_count, 1);
> > >  	init_rwsem(&mm->mmap_sem);
> > > @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> > >  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> > >  	mm->core_state = NULL;
> > >  	mm->nr_ptes = 0;
> > > -	set_mm_counter(mm, file_rss, 0);
> > > -	set_mm_counter(mm, anon_rss, 0);
> > > +	for_each_possible_cpu(cpu) {
> > > +		struct mm_counter *m;
> > > +
> > > +		memset(m, sizeof(struct mm_counter), 0);
> > Above memset is wrong.
> > 1) m isn't initialized;
> > 2) It seems the 2nd and the 3rd parameters should be interchanged.
> Changing it to the following fixes the command hang issue.
> 
>         for_each_possible_cpu(cpu) {
>                 struct mm_counter *m = per_cpu(mm->rss->readers, cpu);
> 
>                 memset(m, 0, sizeof(struct mm_counter));
>         }

Sorry, I was too optimistic; I had booted another kernel by mistake.
The right change above should be:
 struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

The compiler doesn't report an error or warning no matter which member I use there.
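
So the initialization loop in mm_init() becomes (the snippet from my
previous mail, with per_cpu() replaced by per_cpu_ptr()):

        for_each_possible_cpu(cpu) {
                struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

                memset(m, 0, sizeof(struct mm_counter));
        }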

With the change, the command 'make oldconfig' and a boot command still
hang.

Yanmin

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  9:34       ` Zhang, Yanmin
@ 2009-11-17 17:25         ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-17 17:25 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 17 Nov 2009, Zhang, Yanmin wrote:

> The right change above should be:
>  struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

Right.

> With the change, the command 'make oldconfig' and a boot command still
> hang.

Not sure if it's worth spending more time on this, but if you want I will
consolidate the fixes so far and put out another patchset.

Where does it hang during boot?



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17 17:25         ` Christoph Lameter
@ 2009-11-19  0:48           ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-19  0:48 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> On Tue, 17 Nov 2009, Zhang, Yanmin wrote:
> 
> > The right change above should be:
> >  struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);
> 
> Right.
> 
> > With the change, the command 'make oldconfig' and a boot command still
> > hang.
> 
> Not sure if it's worth spending more time on this, but if you want I will
> consolidate the fixes so far and put out another patchset.
> 
> Where does it hang during boot?
> 
1) An init boot script calls pidof, and pidof hangs in
access_process_vm => (mutex_lock <=> mutex_unlock), so actually in
mm_reader_lock.
2) 'make oldconfig' hangs in sys_mmap => msleep, actually in mm_writer_lock.

I will check it today.

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17 17:25         ` Christoph Lameter
@ 2009-11-23  8:51           ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-23  8:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> On Tue, 17 Nov 2009, Zhang, Yanmin wrote:
> 
> > The right change above should be:
> >  struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);
> 
> Right.
> 
> > With the change, the command 'make oldconfig' and a boot command still
> > hang.
> 
> Not sure if it's worth spending more time on this, but if you want I will
> consolidate the fixes so far and put out another patchset.
> 
> Where does it hang during boot?
Definitely embarrassing.

1) In function exec_mmap: in the 2nd 'if (old_mm) {' block, mm_reader_unlock
should be used; your patch uses mm_reader_lock. I found it when reviewing your
patch, but forgot to fix it when testing.
2) In function madvise: the last unlock should be mm_reader_unlock; your patch
uses mm_writer_unlock. Both fixes are sketched below.
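
As a sketch (context lines omitted; the exact arguments are whatever your
patch passes, which isn't quoted here):

        /* 1) exec_mmap(), 2nd 'if (old_mm) {' block */
        -       mm_reader_lock(old_mm);
        +       mm_reader_unlock(old_mm);

        /* 2) madvise(), final unlock */
        -       mm_writer_unlock(mm);
        +       mm_reader_unlock(mm);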

It's easy to hit the issues with normal testing. I'm surprised you didn't
hit them.

Another theoretical issue is the scenario below:
Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
it's scheduled back to cpu 0 and repeats the step. Eventually, the per-cpu
reader counter will overflow. With multiple threads, it might overflow even
faster than we imagine. When it overflows, processes will hang there.
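
Roughly, as an illustration (hypothetical field name; the real one is
whatever your patch uses for the per-cpu reader count):

        /* mm_reader_lock() always runs on cpu 0 ... */
        this_cpu_inc(mm->lock_readers);   /* cpu 0 count: 1, 2, 3, ... */
        /* ... while mm_reader_unlock() always runs on cpu 2 */
        this_cpu_dec(mm->lock_readers);   /* cpu 2 count: -1, -2, -3, ... */

The sum over all cpus stays zero, but each per-cpu counter drifts
monotonically and eventually wraps.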




* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-23  8:51           ` Zhang, Yanmin
@ 2009-11-23 14:31             ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-23 14:31 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Mon, 23 Nov 2009, Zhang, Yanmin wrote:

> Another theoretical issue is the scenario below:
> Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
> it's scheduled back to cpu 0 and repeats the step. Eventually, the per-cpu
> reader counter will overflow. With multiple threads, it might overflow even
> faster than we imagine. When it overflows, processes will hang there.

True... We need to find some alternative to per-cpu data to scale
mmap_sem then.


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-23 14:31             ` Christoph Lameter
@ 2009-11-24  8:02               ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-24  8:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Mon, 2009-11-23 at 08:31 -0600, Christoph Lameter wrote:
> On Mon, 23 Nov 2009, Zhang, Yanmin wrote:
> 
> > Another theoretical issue is the scenario below:
> > Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
> > it's scheduled back to cpu 0 and repeats the step. Eventually, the per-cpu
> > reader counter will overflow. With multiple threads, it might overflow even
> > faster than we imagine. When it overflows, processes will hang there.
> 
> True... We need to find some alternative to per-cpu data to scale
> mmap_sem then.
I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone/
sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a
4*8*2 logical cpu machine, and didn't find a significant difference in results
with and without your patch.





* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-24  8:02               ` Zhang, Yanmin
@ 2009-11-24 15:17                 ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-24 15:17 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 24 Nov 2009, Zhang, Yanmin wrote:

> > True... We need to find some alternative to per-cpu data to scale
> > mmap_sem then.
> I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone/
> sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a
> 4*8*2 logical cpu machine, and didn't find a significant difference in results
> with and without your patch.

This affects loads that heavily use mmap_sem. You won't find many issues in
tests that don't run processes with a large thread count and cause lots of
faults or uses of get_user_pages(). The tests you list are not of that
nature.



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-24 15:17                 ` Christoph Lameter
@ 2009-11-25  1:23                   ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-25  1:23 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-24 at 09:17 -0600, Christoph Lameter wrote:
> On Tue, 24 Nov 2009, Zhang, Yanmin wrote:
> 
> > > True... We need to find some alternative to per-cpu data to scale
> > > mmap_sem then.
> > I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone/
> > sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a
> > 4*8*2 logical cpu machine, and didn't find a significant difference in results
> > with and without your patch.
> 
> This affects loads that heavily use mmap_sem. You won't find many issues in
> tests that don't run processes with a large thread count and cause lots of
> faults or uses of get_user_pages(). The tests you list are not of that
> nature.
sysbench_oltp(mysql) is that kind of workload. Both sysbench and mysql are
multi-threaded. Two years ago, I investigated a scalability issue with this
workload and found that mysql causes frequent down_write(mm->mmap_sem); Nick
changed it to down_read to fix it.

But this workload doesn't scale well beyond 64 threads, because mysql has some
unreasonably big locks in userspace (implemented as a conditional spinlock in
userspace).

Yanmin



