* [MM] Make mm counters per cpu instead of atomic
@ 2009-11-04 19:14 ` Christoph Lameter
  0 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-04 19:14 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Make mm counters per cpu

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in the percpu tree and in -next).

With that, the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems no longer
need to perform atomic updates to mm counters. Various code paths can be
simplified since per cpu counter updates are fast and batching of counter
updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.
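
For reference, a minimal sketch of the percpu counter pattern the patch relies
on (not an excerpt from the patch; helper names are made up for illustration,
it assumes the generic this_cpu operations from the percpu tree, and that
callers of the __this_cpu_* forms run with preemption already disabled, e.g.
under the page table lock as in the patch):

#include <linux/percpu.h>

struct mm_counter {
	long file;
	long anon;
};

/* One private copy of the counters per possible cpu. */
static struct mm_counter *rss;

static int counters_init(void)
{
	rss = alloc_percpu(struct mm_counter);	/* zeroed on every cpu */
	return rss ? 0 : -ENOMEM;
}

/* Fast path: bump only this cpu's copy. No atomics, no shared cacheline. */
static void account_anon_page(void)
{
	__this_cpu_inc(rss->anon);
}

/* Slow path: fold all per-cpu copies. This is the scan mentioned above. */
static unsigned long counters_read(void)
{
	unsigned long total = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct mm_counter *c = per_cpu_ptr(rss, cpu);

		total += c->file + c->anon;
	}
	return total;
}

static void counters_destroy(void)
{
	free_percpu(rss);
}

Updates stay O(1) and contention free; only readers such as get_mm_rss()
below pay the O(number of possible cpus) fold.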

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/proc/task_mmu.c       |   14 +++++++++-
 include/linux/mm_types.h |   16 ++++--------
 include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
 kernel/fork.c            |   25 ++++++++++++++-----
 mm/filemap_xip.c         |    2 -
 mm/fremap.c              |    2 -
 mm/init-mm.c             |    3 ++
 mm/memory.c              |   20 +++++++--------
 mm/rmap.c                |   10 +++----
 mm/swapfile.c            |    2 -
 10 files changed, 84 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;

 #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else  /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+	long file;
+	long anon;
+};

 /*
  * Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */

-	/* Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	mm_counter_t _file_rss;
-	mm_counter_t _anon_rss;
+	/* Special percpu counters */
+	struct mm_counter *rss;

 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/include/linux/sched.h	2009-11-04 13:13:42.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else  /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm)					\
-	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm)	do {			\
-	unsigned long _rss = get_mm_rss(mm);		\
-	if ((mm)->hiwater_rss < _rss)			\
-		(mm)->hiwater_rss = _rss;		\
-} while (0)
-#define update_hiwater_vm(mm)	do {			\
-	if ((mm)->hiwater_vm < (mm)->total_vm)		\
-		(mm)->hiwater_vm = (mm)->total_vm;	\
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+	int cpu;
+	unsigned long r = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		r += c->file + c->anon;
+	}
+
+	return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+	unsigned long _rss = get_mm_rss(mm);
+	if (mm->hiwater_rss < _rss)
+		mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+	if (mm->hiwater_vm < mm->total_vm)
+		mm->hiwater_vm = mm->total_vm;
+}

 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
@@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct

 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
+	int cpu;
+
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
@@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, file_rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *m;
+
+		memset(m, sizeof(struct mm_counter), 0);
+	}
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
@@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
 	mm = allocate_mm();
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm, current);
+		mm->rss = alloc_percpu(struct mm_counter);
+		if (mm->rss)
+			mm = mm_init(mm, current);
+		else {
+			free_mm(mm);
+			mm = NULL;
+		}
 	}
 	return mm;
 }
@@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+	free_percpu(mm->rss);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
 		goto fail_nomem;

 	memcpy(mm, oldmm, sizeof(*mm));
+	mm->rss = alloc_percpu(struct mm_counter);
+	if (!mm->rss)
+		goto fail_nomem;

 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -661,15 +676,13 @@ free_pt:
 	mm->binfmt = NULL;
 	mmput(mm);

-fail_nomem:
-	return NULL;
-
 fail_nocontext:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
 	mm_free_pgd(mm);
+fail_nomem:
 	free_mm(mm);
 	return NULL;
 }
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c	2009-11-04 13:13:42.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, file_rss);
+	int cpu;
+	int anon_rss = 0;
+	int file_rss = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		anon_rss += c->anon;
+		file_rss += c->file;
+	}
+	*shared = file_rss;
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, anon_rss);
+	*resident = *shared + anon_rss;
 	return mm->total_vm;
 }

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c	2009-11-04 13:13:42.000000000 -0600
@@ -194,7 +194,7 @@ retry:
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/fremap.c	2009-11-04 13:13:42.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		}
 	} else {
 		if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/memory.c	2009-11-04 13:13:42.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 {
 	if (file_rss)
-		add_mm_counter(mm, file_rss, file_rss);
+		__this_cpu_add(mm->rss->file, file_rss);
 	if (anon_rss)
-		add_mm_counter(mm, anon_rss, anon_rss);
+		__this_cpu_add(mm->rss->anon, anon_rss);
 }

 /*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st

 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, file_rss);
+	__this_cpu_inc(mm->rss->file);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2148,11 +2148,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, file_rss);
-				inc_mm_counter(mm, anon_rss);
+				__this_cpu_dec(mm->rss->file);
+				__this_cpu_inc(mm->rss->anon);
 			}
 		} else
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
 	 * discarded at swap_free().
 	 */

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
 	if (!pte_none(*page_table))
 		goto release;

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter(mm, file_rss);
+			__this_cpu_inc(mm->rss->file);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/rmap.c	2009-11-04 13:13:42.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page

 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 		if (PageAnon(page))
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		else
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		set_pte_at(mm, address, pte,
 				swp_entry_to_pte(make_hwpoison_entry(page)));
 	} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
 		entry = make_migration_entry(page, pte_write(pteval));
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);


 	page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned

 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/swapfile.c	2009-11-04 13:13:42.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
 		goto out;
 	}

-	inc_mm_counter(vma->vm_mm, anon_rss);
+	__this_cpu_inc(vma->vm_mm->rss->anon);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c	2009-11-04 13:08:33.000000000 -0600
+++ linux-2.6/mm/init-mm.c	2009-11-04 13:13:42.000000000 -0600
@@ -8,6 +8,8 @@
 #include <asm/atomic.h>
 #include <asm/pgtable.h>

+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
+	.rss		= &init_mm_counters,
 };


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-04 19:17   ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-04 19:17 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Remove rss batching from copy_page_range()

With per cpu counters in mm there is no need for batching
mm counter updates anymore. Update counters directly while
copying pages.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 mm/memory.c |   27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2009-11-04 12:15:03.000000000 -0600
+++ linux-2.6/mm/memory.c	2009-11-04 13:03:45.000000000 -0600
@@ -376,14 +376,6 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 	return 0;
 }

-static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
-{
-	if (file_rss)
-		__this_cpu_add(mm->rss->file, file_rss);
-	if (anon_rss)
-		__this_cpu_add(mm->rss->anon, anon_rss);
-}
-
 /*
  * This function is called to print an error when a bad pte
  * is found. For example, we might have a PFN-mapped pte in
@@ -575,7 +567,7 @@ out:
 static inline void
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		unsigned long addr)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
@@ -630,7 +622,10 @@ copy_one_pte(struct mm_struct *dst_mm, s
 	if (page) {
 		get_page(page);
 		page_dup_rmap(page);
-		rss[PageAnon(page)]++;
+		if (PageAnon(page))
+			__this_cpu_inc(dst_mm->rss->anon);
+		else
+			__this_cpu_inc(dst_mm->rss->file);
 	}

 out_set_pte:
@@ -645,10 +640,8 @@ static int copy_pte_range(struct mm_stru
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
-	int rss[2];

 again:
-	rss[1] = rss[0] = 0;
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
@@ -674,14 +667,13 @@ again:
 			progress++;
 			continue;
 		}
-		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr);
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);

 	arch_leave_lazy_mmu_mode();
 	spin_unlock(src_ptl);
 	pte_unmap_nested(orig_src_pte);
-	add_mm_rss(dst_mm, rss[0], rss[1]);
 	pte_unmap_unlock(orig_dst_pte, dst_ptl);
 	cond_resched();
 	if (addr != end)
@@ -803,8 +795,6 @@ static unsigned long zap_pte_range(struc
 	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
 	spinlock_t *ptl;
-	int file_rss = 0;
-	int anon_rss = 0;

 	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
@@ -850,14 +840,14 @@ static unsigned long zap_pte_range(struc
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
-				anon_rss--;
+				__this_cpu_dec(mm->rss->anon);
 			else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
-				file_rss--;
+				__this_cpu_dec(mm->rss->file);
 			}
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
@@ -880,7 +870,6 @@ static unsigned long zap_pte_range(struc
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

-	add_mm_rss(mm, file_rss, anon_rss);
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(pte - 1, ptl);



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-04 21:01   ` Andi Kleen
  -1 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2009-11-04 21:01 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter <cl@linux-foundation.org> writes:
>
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.

Do you have numbers on how costly alloc_percpu() is? I wonder what this
does to fork() overhead.

-Andi
 

-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 19:17   ` Christoph Lameter
@ 2009-11-04 21:02     ` Andi Kleen
  -1 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2009-11-04 21:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

Christoph Lameter <cl@linux-foundation.org> writes:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Remove rss batching from copy_page_range()
>
> With per cpu counters in mm there is no need for batching
> mm counter updates anymore. Update counters directly while
> copying pages.

Hmm, but with all the inlining, with some luck the local
counters will be kept in registers. That will never be the case
with the per cpu counters.

So I'm not sure it's really an improvement?

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 21:02     ` Andi Kleen
@ 2009-11-04 22:02       ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-04 22:02 UTC (permalink / raw)
  To: Andi Kleen
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009, Andi Kleen wrote:

> > With per cpu counters in mm there is no need for batching
> > mm counter updates anymore. Update counters directly while
> > copying pages.
>
> Hmm, but with all the inlining with some luck the local
> counters will be in registers. That will never be the case
> with the per cpu counters.

The function is too big for that to occur and the counters have to be
preserved across function calls. The code is shorter with the patch
applied:

christoph@:~/n/linux-2.6$ size mm/memory.o
   text	   data	    bss	    dec	    hex	filename
  20140	     56	     40	  20236	   4f0c	mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch mmcounter
patching file include/linux/mm_types.h
patching file include/linux/sched.h
patching file kernel/fork.c
patching file fs/proc/task_mmu.c
patching file mm/filemap_xip.c
patching file mm/fremap.c
patching file mm/memory.c
patching file mm/rmap.c
patching file mm/swapfile.c
patching file mm/init-mm.c

Now at patch mmcounter
christoph@:~/n/linux-2.6$ make mm/memory.o
  CHK     include/linux/version.h
  CHK     include/linux/utsrelease.h
  UPD     include/linux/utsrelease.h
  SYMLINK include/asm -> include/asm-x86
  CC      arch/x86/kernel/asm-offsets.s
  GEN     include/asm/asm-offsets.h
  CALL    scripts/checksyscalls.sh
  CC      mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
   text	   data	    bss	    dec	    hex	filename
  20028	     56	     40	  20124	   4e9c	mm/memory.o
christoph@:~/n/linux-2.6$ quilt push
Applying patch simplify
patching file mm/memory.c

Now at patch simplify
christoph@:~/n/linux-2.6$ make mm/memory.o
  CHK     include/linux/version.h
  CHK     include/linux/utsrelease.h
  SYMLINK include/asm -> include/asm-x86
  CALL    scripts/checksyscalls.sh
  CC      mm/memory.o
christoph@:~/n/linux-2.6$ size mm/memory.o
   text	   data	    bss	    dec	    hex	filename
  19888	     56	     40	  19984	   4e10	mm/memory.o




* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-04 23:49   ` Dave Jones
  -1 siblings, 0 replies; 66+ messages in thread
From: Dave Jones @ 2009-11-04 23:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:
 
 > +		memset(m, sizeof(struct mm_counter), 0);

Args wrong way around.

	Dave


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-05  1:16   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05  1:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009 14:14:41 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 

Hmm, I don't fully understand _new_ percpu but...
Logically (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
(BIGSMP).

Then, if 65536 processes run, this consumes

65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

But x86's vmalloc area just has 80? MBytes. I (and my customers) don't have
this kind of extreme machine, but cpus tend to be many-core (and still support
32bit mode) now.

With 32 or 64 cpus,
65536 * 8 * 32 = 16MB
65536 * 8 * 64 = 32MB

And if I add swap_usage,
65536 * 12 * 32 = 24MB.

It's influenced by the number of devices attached to the system, but
people will see more -ENOMEM.

It seems this consumption/footprint is very big.

Thanks,
-Kame





> ---
>  fs/proc/task_mmu.c       |   14 +++++++++-
>  include/linux/mm_types.h |   16 ++++--------
>  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
>  kernel/fork.c            |   25 ++++++++++++++-----
>  mm/filemap_xip.c         |    2 -
>  mm/fremap.c              |    2 -
>  mm/init-mm.c             |    3 ++
>  mm/memory.c              |   20 +++++++--------
>  mm/rmap.c                |   10 +++----
>  mm/swapfile.c            |    2 -
>  10 files changed, 84 insertions(+), 71 deletions(-)
> 
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;
> 
>  #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> 
> -#if USE_SPLIT_PTLOCKS
> -typedef atomic_long_t mm_counter_t;
> -#else  /* !USE_SPLIT_PTLOCKS */
> -typedef unsigned long mm_counter_t;
> -#endif /* !USE_SPLIT_PTLOCKS */
> +struct mm_counter {
> +	long file;
> +	long anon;
> +};
> 
>  /*
>   * Each physical page in the system has a struct page associated with
> @@ -223,11 +222,8 @@ struct mm_struct {
>  						 * by mmlist_lock
>  						 */
> 
> -	/* Special counters, in some configurations protected by the
> -	 * page_table_lock, in other configurations by being atomic.
> -	 */
> -	mm_counter_t _file_rss;
> -	mm_counter_t _anon_rss;
> +	/* Special percpu counters */
> +	struct mm_counter *rss;
> 
>  	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
>  	unsigned long hiwater_vm;	/* High-water virtual memory usage */
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/sched.h	2009-11-04 13:13:42.000000000 -0600
> @@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
>  extern void arch_unmap_area(struct mm_struct *, unsigned long);
>  extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
> 
> -#if USE_SPLIT_PTLOCKS
> -/*
> - * The mm counters are not protected by its page_table_lock,
> - * so must be incremented atomically.
> - */
> -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
> -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
> -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
> -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
> -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
> -
> -#else  /* !USE_SPLIT_PTLOCKS */
> -/*
> - * The mm counters are protected by its page_table_lock,
> - * so can be incremented directly.
> - */
> -#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
> -#define get_mm_counter(mm, member) ((mm)->_##member)
> -#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
> -#define inc_mm_counter(mm, member) (mm)->_##member++
> -#define dec_mm_counter(mm, member) (mm)->_##member--
> -
> -#endif /* !USE_SPLIT_PTLOCKS */
> -
> -#define get_mm_rss(mm)					\
> -	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
> -#define update_hiwater_rss(mm)	do {			\
> -	unsigned long _rss = get_mm_rss(mm);		\
> -	if ((mm)->hiwater_rss < _rss)			\
> -		(mm)->hiwater_rss = _rss;		\
> -} while (0)
> -#define update_hiwater_vm(mm)	do {			\
> -	if ((mm)->hiwater_vm < (mm)->total_vm)		\
> -		(mm)->hiwater_vm = (mm)->total_vm;	\
> -} while (0)
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r += c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +
> +static inline void update_hiwater_vm(struct mm_struct *mm)
> +{
> +	if (mm->hiwater_vm < mm->total_vm)
> +		mm->hiwater_vm = mm->total_vm;
> +}
> 
>  static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
>  {
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> 
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
> +	}
>  	spin_lock_init(&mm->page_table_lock);
>  	mm->free_area_cache = TASK_UNMAPPED_BASE;
>  	mm->cached_hole_size = ~0UL;
> @@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
>  	mm = allocate_mm();
>  	if (mm) {
>  		memset(mm, 0, sizeof(*mm));
> -		mm = mm_init(mm, current);
> +		mm->rss = alloc_percpu(struct mm_counter);
> +		if (mm->rss)
> +			mm = mm_init(mm, current);
> +		else {
> +			free_mm(mm);
> +			mm = NULL;
> +		}
>  	}
>  	return mm;
>  }
> @@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
>  	mm_free_pgd(mm);
>  	destroy_context(mm);
>  	mmu_notifier_mm_destroy(mm);
> +	free_percpu(mm->rss);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
>  		goto fail_nomem;
> 
>  	memcpy(mm, oldmm, sizeof(*mm));
> +	mm->rss = alloc_percpu(struct mm_counter);
> +	if (!mm->rss)
> +		goto fail_nomem;
> 
>  	/* Initializing for Swap token stuff */
>  	mm->token_priority = 0;
> @@ -661,15 +676,13 @@ free_pt:
>  	mm->binfmt = NULL;
>  	mmput(mm);
> 
> -fail_nomem:
> -	return NULL;
> -
>  fail_nocontext:
>  	/*
>  	 * If init_new_context() failed, we cannot use mmput() to free the mm
>  	 * because it calls destroy_context()
>  	 */
>  	mm_free_pgd(mm);
> +fail_nomem:
>  	free_mm(mm);
>  	return NULL;
>  }
> Index: linux-2.6/fs/proc/task_mmu.c
> ===================================================================
> --- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/fs/proc/task_mmu.c	2009-11-04 13:13:42.000000000 -0600
> @@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
>  int task_statm(struct mm_struct *mm, int *shared, int *text,
>  	       int *data, int *resident)
>  {
> -	*shared = get_mm_counter(mm, file_rss);
> +	int cpu;
> +	int anon_rss = 0;
> +	int file_rss = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		anon_rss += c->anon;
> +		file_rss += c->file;
> +	}
> +	*shared = file_rss;
>  	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>  								>> PAGE_SHIFT;
>  	*data = mm->total_vm - mm->shared_vm;
> -	*resident = *shared + get_mm_counter(mm, anon_rss);
> +	*resident = *shared + anon_rss;
>  	return mm->total_vm;
>  }
> 
> Index: linux-2.6/mm/filemap_xip.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap_xip.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/filemap_xip.c	2009-11-04 13:13:42.000000000 -0600
> @@ -194,7 +194,7 @@ retry:
>  			flush_cache_page(vma, address, pte_pfn(*pte));
>  			pteval = ptep_clear_flush_notify(vma, address, pte);
>  			page_remove_rmap(page);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  			BUG_ON(pte_dirty(pteval));
>  			pte_unmap_unlock(pte, ptl);
>  			page_cache_release(page);
> Index: linux-2.6/mm/fremap.c
> ===================================================================
> --- linux-2.6.orig/mm/fremap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/fremap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
>  			page_remove_rmap(page);
>  			page_cache_release(page);
>  			update_hiwater_rss(mm);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		}
>  	} else {
>  		if (!pte_file(pte))
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/memory.c	2009-11-04 13:13:42.000000000 -0600
> @@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
>  static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
>  {
>  	if (file_rss)
> -		add_mm_counter(mm, file_rss, file_rss);
> +		__this_cpu_add(mm->rss->file, file_rss);
>  	if (anon_rss)
> -		add_mm_counter(mm, anon_rss, anon_rss);
> +		__this_cpu_add(mm->rss->anon, anon_rss);
>  }
> 
>  /*
> @@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st
> 
>  	/* Ok, finally just insert the thing.. */
>  	get_page(page);
> -	inc_mm_counter(mm, file_rss);
> +	__this_cpu_inc(mm->rss->file);
>  	page_add_file_rmap(page);
>  	set_pte_at(mm, addr, pte, mk_pte(page, prot));
> 
> @@ -2148,11 +2148,11 @@ gotten:
>  	if (likely(pte_same(*page_table, orig_pte))) {
>  		if (old_page) {
>  			if (!PageAnon(old_page)) {
> -				dec_mm_counter(mm, file_rss);
> -				inc_mm_counter(mm, anon_rss);
> +				__this_cpu_dec(mm->rss->file);
> +				__this_cpu_inc(mm->rss->anon);
>  			}
>  		} else
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  		flush_cache_page(vma, address, pte_pfn(orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> @@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
>  	 * discarded at swap_free().
>  	 */
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	pte = mk_pte(page, vma->vm_page_prot);
>  	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
>  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> @@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
>  	if (!pte_none(*page_table))
>  		goto release;
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	page_add_new_anon_rmap(page, vma, address);
>  setpte:
>  	set_pte_at(mm, address, page_table, entry);
> @@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
>  		if (flags & FAULT_FLAG_WRITE)
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		if (anon) {
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  			page_add_new_anon_rmap(page, vma, address);
>  		} else {
> -			inc_mm_counter(mm, file_rss);
> +			__this_cpu_inc(mm->rss->file);
>  			page_add_file_rmap(page);
>  			if (flags & FAULT_FLAG_WRITE) {
>  				dirty_page = page;
> Index: linux-2.6/mm/rmap.c
> ===================================================================
> --- linux-2.6.orig/mm/rmap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/rmap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page
> 
>  	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
>  		if (PageAnon(page))
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		else
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		set_pte_at(mm, address, pte,
>  				swp_entry_to_pte(make_hwpoison_entry(page)));
>  	} else if (PageAnon(page)) {
> @@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
>  					list_add(&mm->mmlist, &init_mm.mmlist);
>  				spin_unlock(&mmlist_lock);
>  			}
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		} else if (PAGE_MIGRATION) {
>  			/*
>  			 * Store the pfn of the page in a special migration
> @@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
>  		entry = make_migration_entry(page, pte_write(pteval));
>  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
>  	} else
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
> 
> 
>  	page_remove_rmap(page);
> @@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned
> 
>  		page_remove_rmap(page);
>  		page_cache_release(page);
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
>  		(*mapcount)--;
>  	}
>  	pte_unmap_unlock(pte - 1, ptl);
> Index: linux-2.6/mm/swapfile.c
> ===================================================================
> --- linux-2.6.orig/mm/swapfile.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/swapfile.c	2009-11-04 13:13:42.000000000 -0600
> @@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
>  		goto out;
>  	}
> 
> -	inc_mm_counter(vma->vm_mm, anon_rss);
> +	__this_cpu_inc(vma->vm_mm->rss->anon);
>  	get_page(page);
>  	set_pte_at(vma->vm_mm, addr, pte,
>  		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: linux-2.6/mm/init-mm.c
> ===================================================================
> --- linux-2.6.orig/mm/init-mm.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/init-mm.c	2009-11-04 13:13:42.000000000 -0600
> @@ -8,6 +8,8 @@
>  #include <asm/atomic.h>
>  #include <asm/pgtable.h>
> 
> +DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
> +
>  struct mm_struct init_mm = {
>  	.mm_rb		= RB_ROOT,
>  	.pgd		= swapper_pg_dir,
> @@ -17,4 +19,5 @@ struct mm_struct init_mm = {
>  	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.cpu_vm_mask	= CPU_MASK_ALL,
> +	.rss		= &init_mm_counters,
>  };
> 



* Re: [MM] Make mm counters per cpu instead of atomic
@ 2009-11-05  1:16   ` KAMEZAWA Hiroyuki
  0 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05  1:16 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009 14:14:41 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 

Hmm, I don't fully understand _new_ percpu but...
In theory (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
with BIGSMP.

Then, if 65536 processes run, this consumes

65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

But x86's vmalloc area only has 80? MBytes. I (and my customers) don't have
this kind of extreme machine, but cpus tend to be many-core (and still support
32bit mode) now.

With 32 or 64 cpus,
65536 * 8 * 32 = 16MB
65536 * 8 * 64 = 32MB

And if I add swap_usage,
65536 * 12 * 32 = 24MB.

It's influenced by the number of devices attached to the system, but
people will see more -ENOMEM.

It seems this consumption/footprint is very big.
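
As a back-of-the-envelope check of that estimate, a standalone userspace
sketch (the inputs are the assumed worst-case numbers from above):

	#include <stdio.h>

	int main(void)
	{
		long nr_procs = 65536;
		long size = 8;	/* sizeof(struct mm_counter): two longs on 32bit */
		long cpus = 512;

		/* 65536 * 8 * 512 = 268435456 bytes */
		printf("%ld MB\n", (nr_procs * size * cpus) >> 20);
		return 0;
	}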

Thanks,
-Kame





> ---
>  fs/proc/task_mmu.c       |   14 +++++++++-
>  include/linux/mm_types.h |   16 ++++--------
>  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
>  kernel/fork.c            |   25 ++++++++++++++-----
>  mm/filemap_xip.c         |    2 -
>  mm/fremap.c              |    2 -
>  mm/init-mm.c             |    3 ++
>  mm/memory.c              |   20 +++++++--------
>  mm/rmap.c                |   10 +++----
>  mm/swapfile.c            |    2 -
>  10 files changed, 84 insertions(+), 71 deletions(-)
> 
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;
> 
>  #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
> 
> -#if USE_SPLIT_PTLOCKS
> -typedef atomic_long_t mm_counter_t;
> -#else  /* !USE_SPLIT_PTLOCKS */
> -typedef unsigned long mm_counter_t;
> -#endif /* !USE_SPLIT_PTLOCKS */
> +struct mm_counter {
> +	long file;
> +	long anon;
> +};
> 
>  /*
>   * Each physical page in the system has a struct page associated with
> @@ -223,11 +222,8 @@ struct mm_struct {
>  						 * by mmlist_lock
>  						 */
> 
> -	/* Special counters, in some configurations protected by the
> -	 * page_table_lock, in other configurations by being atomic.
> -	 */
> -	mm_counter_t _file_rss;
> -	mm_counter_t _anon_rss;
> +	/* Special percpu counters */
> +	struct mm_counter *rss;
> 
>  	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
>  	unsigned long hiwater_vm;	/* High-water virtual memory usage */
> Index: linux-2.6/include/linux/sched.h
> ===================================================================
> --- linux-2.6.orig/include/linux/sched.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/sched.h	2009-11-04 13:13:42.000000000 -0600
> @@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
>  extern void arch_unmap_area(struct mm_struct *, unsigned long);
>  extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
> 
> -#if USE_SPLIT_PTLOCKS
> -/*
> - * The mm counters are not protected by its page_table_lock,
> - * so must be incremented atomically.
> - */
> -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
> -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
> -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
> -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
> -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
> -
> -#else  /* !USE_SPLIT_PTLOCKS */
> -/*
> - * The mm counters are protected by its page_table_lock,
> - * so can be incremented directly.
> - */
> -#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
> -#define get_mm_counter(mm, member) ((mm)->_##member)
> -#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
> -#define inc_mm_counter(mm, member) (mm)->_##member++
> -#define dec_mm_counter(mm, member) (mm)->_##member--
> -
> -#endif /* !USE_SPLIT_PTLOCKS */
> -
> -#define get_mm_rss(mm)					\
> -	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
> -#define update_hiwater_rss(mm)	do {			\
> -	unsigned long _rss = get_mm_rss(mm);		\
> -	if ((mm)->hiwater_rss < _rss)			\
> -		(mm)->hiwater_rss = _rss;		\
> -} while (0)
> -#define update_hiwater_vm(mm)	do {			\
> -	if ((mm)->hiwater_vm < (mm)->total_vm)		\
> -		(mm)->hiwater_vm = (mm)->total_vm;	\
> -} while (0)
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r += c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +
> +static inline void update_hiwater_vm(struct mm_struct *mm)
> +{
> +	if (mm->hiwater_vm < mm->total_vm)
> +		mm->hiwater_vm = mm->total_vm;
> +}
> 
>  static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
>  {
> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> 
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
> +	}
>  	spin_lock_init(&mm->page_table_lock);
>  	mm->free_area_cache = TASK_UNMAPPED_BASE;
>  	mm->cached_hole_size = ~0UL;
> @@ -480,7 +485,13 @@ struct mm_struct * mm_alloc(void)
>  	mm = allocate_mm();
>  	if (mm) {
>  		memset(mm, 0, sizeof(*mm));
> -		mm = mm_init(mm, current);
> +		mm->rss = alloc_percpu(struct mm_counter);
> +		if (mm->rss)
> +			mm = mm_init(mm, current);
> +		else {
> +			free_mm(mm);
> +			mm = NULL;
> +		}
>  	}
>  	return mm;
>  }
> @@ -496,6 +507,7 @@ void __mmdrop(struct mm_struct *mm)
>  	mm_free_pgd(mm);
>  	destroy_context(mm);
>  	mmu_notifier_mm_destroy(mm);
> +	free_percpu(mm->rss);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -631,6 +643,9 @@ struct mm_struct *dup_mm(struct task_str
>  		goto fail_nomem;
> 
>  	memcpy(mm, oldmm, sizeof(*mm));
> +	mm->rss = alloc_percpu(struct mm_counter);
> +	if (!mm->rss)
> +		goto fail_nomem;
> 
>  	/* Initializing for Swap token stuff */
>  	mm->token_priority = 0;
> @@ -661,15 +676,13 @@ free_pt:
>  	mm->binfmt = NULL;
>  	mmput(mm);
> 
> -fail_nomem:
> -	return NULL;
> -
>  fail_nocontext:
>  	/*
>  	 * If init_new_context() failed, we cannot use mmput() to free the mm
>  	 * because it calls destroy_context()
>  	 */
>  	mm_free_pgd(mm);
> +fail_nomem:
>  	free_mm(mm);
>  	return NULL;
>  }
> Index: linux-2.6/fs/proc/task_mmu.c
> ===================================================================
> --- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/fs/proc/task_mmu.c	2009-11-04 13:13:42.000000000 -0600
> @@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
>  int task_statm(struct mm_struct *mm, int *shared, int *text,
>  	       int *data, int *resident)
>  {
> -	*shared = get_mm_counter(mm, file_rss);
> +	int cpu;
> +	int anon_rss = 0;
> +	int file_rss = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		anon_rss += c->anon;
> +		file_rss += c->file;
> +	}
> +	*shared = file_rss;
>  	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>  								>> PAGE_SHIFT;
>  	*data = mm->total_vm - mm->shared_vm;
> -	*resident = *shared + get_mm_counter(mm, anon_rss);
> +	*resident = *shared + anon_rss;
>  	return mm->total_vm;
>  }
> 
> Index: linux-2.6/mm/filemap_xip.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap_xip.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/filemap_xip.c	2009-11-04 13:13:42.000000000 -0600
> @@ -194,7 +194,7 @@ retry:
>  			flush_cache_page(vma, address, pte_pfn(*pte));
>  			pteval = ptep_clear_flush_notify(vma, address, pte);
>  			page_remove_rmap(page);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  			BUG_ON(pte_dirty(pteval));
>  			pte_unmap_unlock(pte, ptl);
>  			page_cache_release(page);
> Index: linux-2.6/mm/fremap.c
> ===================================================================
> --- linux-2.6.orig/mm/fremap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/fremap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
>  			page_remove_rmap(page);
>  			page_cache_release(page);
>  			update_hiwater_rss(mm);
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		}
>  	} else {
>  		if (!pte_file(pte))
> Index: linux-2.6/mm/memory.c
> ===================================================================
> --- linux-2.6.orig/mm/memory.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/memory.c	2009-11-04 13:13:42.000000000 -0600
> @@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
>  static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
>  {
>  	if (file_rss)
> -		add_mm_counter(mm, file_rss, file_rss);
> +		__this_cpu_add(mm->rss->file, file_rss);
>  	if (anon_rss)
> -		add_mm_counter(mm, anon_rss, anon_rss);
> +		__this_cpu_add(mm->rss->anon, anon_rss);
>  }
> 
>  /*
> @@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st
> 
>  	/* Ok, finally just insert the thing.. */
>  	get_page(page);
> -	inc_mm_counter(mm, file_rss);
> +	__this_cpu_inc(mm->rss->file);
>  	page_add_file_rmap(page);
>  	set_pte_at(mm, addr, pte, mk_pte(page, prot));
> 
> @@ -2148,11 +2148,11 @@ gotten:
>  	if (likely(pte_same(*page_table, orig_pte))) {
>  		if (old_page) {
>  			if (!PageAnon(old_page)) {
> -				dec_mm_counter(mm, file_rss);
> -				inc_mm_counter(mm, anon_rss);
> +				__this_cpu_dec(mm->rss->file);
> +				__this_cpu_inc(mm->rss->anon);
>  			}
>  		} else
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  		flush_cache_page(vma, address, pte_pfn(orig_pte));
>  		entry = mk_pte(new_page, vma->vm_page_prot);
>  		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> @@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
>  	 * discarded at swap_free().
>  	 */
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	pte = mk_pte(page, vma->vm_page_prot);
>  	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
>  		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
> @@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
>  	if (!pte_none(*page_table))
>  		goto release;
> 
> -	inc_mm_counter(mm, anon_rss);
> +	__this_cpu_inc(mm->rss->anon);
>  	page_add_new_anon_rmap(page, vma, address);
>  setpte:
>  	set_pte_at(mm, address, page_table, entry);
> @@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
>  		if (flags & FAULT_FLAG_WRITE)
>  			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>  		if (anon) {
> -			inc_mm_counter(mm, anon_rss);
> +			__this_cpu_inc(mm->rss->anon);
>  			page_add_new_anon_rmap(page, vma, address);
>  		} else {
> -			inc_mm_counter(mm, file_rss);
> +			__this_cpu_inc(mm->rss->file);
>  			page_add_file_rmap(page);
>  			if (flags & FAULT_FLAG_WRITE) {
>  				dirty_page = page;
> Index: linux-2.6/mm/rmap.c
> ===================================================================
> --- linux-2.6.orig/mm/rmap.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/rmap.c	2009-11-04 13:13:42.000000000 -0600
> @@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page
> 
>  	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
>  		if (PageAnon(page))
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		else
> -			dec_mm_counter(mm, file_rss);
> +			__this_cpu_dec(mm->rss->file);
>  		set_pte_at(mm, address, pte,
>  				swp_entry_to_pte(make_hwpoison_entry(page)));
>  	} else if (PageAnon(page)) {
> @@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
>  					list_add(&mm->mmlist, &init_mm.mmlist);
>  				spin_unlock(&mmlist_lock);
>  			}
> -			dec_mm_counter(mm, anon_rss);
> +			__this_cpu_dec(mm->rss->anon);
>  		} else if (PAGE_MIGRATION) {
>  			/*
>  			 * Store the pfn of the page in a special migration
> @@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
>  		entry = make_migration_entry(page, pte_write(pteval));
>  		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
>  	} else
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
> 
> 
>  	page_remove_rmap(page);
> @@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned
> 
>  		page_remove_rmap(page);
>  		page_cache_release(page);
> -		dec_mm_counter(mm, file_rss);
> +		__this_cpu_dec(mm->rss->file);
>  		(*mapcount)--;
>  	}
>  	pte_unmap_unlock(pte - 1, ptl);
> Index: linux-2.6/mm/swapfile.c
> ===================================================================
> --- linux-2.6.orig/mm/swapfile.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/swapfile.c	2009-11-04 13:13:42.000000000 -0600
> @@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
>  		goto out;
>  	}
> 
> -	inc_mm_counter(vma->vm_mm, anon_rss);
> +	__this_cpu_inc(vma->vm_mm->rss->anon);
>  	get_page(page);
>  	set_pte_at(vma->vm_mm, addr, pte,
>  		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
> Index: linux-2.6/mm/init-mm.c
> ===================================================================
> --- linux-2.6.orig/mm/init-mm.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/mm/init-mm.c	2009-11-04 13:13:42.000000000 -0600
> @@ -8,6 +8,8 @@
>  #include <asm/atomic.h>
>  #include <asm/pgtable.h>
> 
> +DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
> +
>  struct mm_struct init_mm = {
>  	.mm_rb		= RB_ROOT,
>  	.pgd		= swapper_pg_dir,
> @@ -17,4 +19,5 @@ struct mm_struct init_mm = {
>  	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
>  	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
>  	.cpu_vm_mask	= CPU_MASK_ALL,
> +	.rss		= &init_mm_counters,
>  };
> 


* Re: [MM] Remove rss batching from copy_page_range()
  2009-11-04 22:02       ` Christoph Lameter
@ 2009-11-05  8:27         ` Andi Kleen
  -1 siblings, 0 replies; 66+ messages in thread
From: Andi Kleen @ 2009-11-05  8:27 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andi Kleen, KAMEZAWA Hiroyuki, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

On Wed, Nov 04, 2009 at 05:02:12PM -0500, Christoph Lameter wrote:
> On Wed, 4 Nov 2009, Andi Kleen wrote:
> 
> > > With per cpu counters in mm there is no need for batching
> > > mm counter updates anymore. Update counters directly while
> > > copying pages.
> >
> > Hmm, but with all the inlining with some luck the local
> > counters will be in registers. That will never be the case
> > with the per cpu counters.
> 
> The function is too big for that to occur and the counters have to be

If it's only called once then gcc doesn't care about size.

> preserved across function calls. The code is shorter with the patch
> applied:

I see. Thanks for the data.

-Andi


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 23:49   ` Dave Jones
@ 2009-11-05 15:04     ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:04 UTC (permalink / raw)
  To: Dave Jones
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Wed, 4 Nov 2009, Dave Jones wrote:

> On Wed, Nov 04, 2009 at 02:14:41PM -0500, Christoph Lameter wrote:
>
>  > +		memset(m, sizeof(struct mm_counter), 0);
>
> Args wrong way around.

Right. It works because alloc_percpu zeroes the data anyway.
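
For reference, memset() takes the fill byte before the length, and that
loop also never initialized its pointer; a corrected sketch of the (now
dropped) initialization would have looked like:

	for_each_possible_cpu(cpu) {
		struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

		/* memset(ptr, fill byte, length) */
		memset(m, 0, sizeof(struct mm_counter));
	}

Moot in V2, which simply relies on alloc_percpu() handing back zeroed memory.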



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-05  1:16   ` KAMEZAWA Hiroyuki
@ 2009-11-05 15:10     ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:10 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:

> Hmm, I don't fully understand _new_ percpu but...
> In theory (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
> with BIGSMP.

x86-32 only supports 32 processors. Plus per cpu areas are only allocated
for the possible processors.

> > Then, if 65536 processes run, this consumes
>
> 65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.

With 32 possible cpus this results in 16MB of per-cpu space use.



* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:04     ` Christoph Lameter
@ 2009-11-05 15:36       ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-05 15:36 UTC (permalink / raw)
  To: Dave Jones
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

From: Christoph Lameter <cl@linux-foundation.org>
Subject: Make mm counters per cpu V2

Changing the mm counters to per cpu counters is possible after the introduction
of the generic per cpu operations (currently in percpu and -next).

With that the contention on the counters in mm_struct can be avoided. The
USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
need to perform atomic updates to mm counters anymore. Various code paths
can be simplified since per cpu counter updates are fast and batching
of counter updates is no longer needed.

One price to pay for these improvements is the need to scan over all percpu
counters when the actual count values are needed.
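
Concretely, each counter update turns from an atomic rmw on a shared
cacheline into a plain add on this cpu's copy (a sketch; the __this_cpu
variants assume preemption is already off, which the pte lock guarantees
at these call sites):

	/* before: atomic rmw, cacheline bounces between cpus */
	atomic_long_inc(&mm->_anon_rss);

	/* after: local update, no lock prefix needed on x86 */
	__this_cpu_inc(mm->rss->anon);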

V1->V2
- Remove useless and buggy per cpu counter initialization.
  alloc_percpu already zeros the values.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>

---
 fs/proc/task_mmu.c       |   14 +++++++++-
 include/linux/mm_types.h |   16 ++++--------
 include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
 kernel/fork.c            |   18 +++++++++----
 mm/filemap_xip.c         |    2 -
 mm/fremap.c              |    2 -
 mm/init-mm.c             |    3 ++
 mm/memory.c              |   20 +++++++--------
 mm/rmap.c                |   10 +++----
 mm/swapfile.c            |    2 -
 10 files changed, 77 insertions(+), 71 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/mm_types.h	2009-11-05 09:22:37.000000000 -0600
@@ -24,11 +24,10 @@ struct address_space;

 #define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

-#if USE_SPLIT_PTLOCKS
-typedef atomic_long_t mm_counter_t;
-#else  /* !USE_SPLIT_PTLOCKS */
-typedef unsigned long mm_counter_t;
-#endif /* !USE_SPLIT_PTLOCKS */
+struct mm_counter {
+	long file;
+	long anon;
+};

 /*
  * Each physical page in the system has a struct page associated with
@@ -223,11 +222,8 @@ struct mm_struct {
 						 * by mmlist_lock
 						 */

-	/* Special counters, in some configurations protected by the
-	 * page_table_lock, in other configurations by being atomic.
-	 */
-	mm_counter_t _file_rss;
-	mm_counter_t _anon_rss;
+	/* Special percpu counters */
+	struct mm_counter *rss;

 	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
 	unsigned long hiwater_vm;	/* High-water virtual memory usage */
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/include/linux/sched.h	2009-11-05 09:22:37.000000000 -0600
@@ -385,41 +385,32 @@ arch_get_unmapped_area_topdown(struct fi
 extern void arch_unmap_area(struct mm_struct *, unsigned long);
 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);

-#if USE_SPLIT_PTLOCKS
-/*
- * The mm counters are not protected by its page_table_lock,
- * so must be incremented atomically.
- */
-#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
-#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
-#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
-#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
-#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
-
-#else  /* !USE_SPLIT_PTLOCKS */
-/*
- * The mm counters are protected by its page_table_lock,
- * so can be incremented directly.
- */
-#define set_mm_counter(mm, member, value) (mm)->_##member = (value)
-#define get_mm_counter(mm, member) ((mm)->_##member)
-#define add_mm_counter(mm, member, value) (mm)->_##member += (value)
-#define inc_mm_counter(mm, member) (mm)->_##member++
-#define dec_mm_counter(mm, member) (mm)->_##member--
-
-#endif /* !USE_SPLIT_PTLOCKS */
-
-#define get_mm_rss(mm)					\
-	(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
-#define update_hiwater_rss(mm)	do {			\
-	unsigned long _rss = get_mm_rss(mm);		\
-	if ((mm)->hiwater_rss < _rss)			\
-		(mm)->hiwater_rss = _rss;		\
-} while (0)
-#define update_hiwater_vm(mm)	do {			\
-	if ((mm)->hiwater_vm < (mm)->total_vm)		\
-		(mm)->hiwater_vm = (mm)->total_vm;	\
-} while (0)
+static inline unsigned long get_mm_rss(struct mm_struct *mm)
+{
+	int cpu;
+	unsigned long r = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		r += c->file + c->anon;
+	}
+
+	return r;
+}
+
+static inline void update_hiwater_rss(struct mm_struct *mm)
+{
+	unsigned long _rss = get_mm_rss(mm);
+	if (mm->hiwater_rss < _rss)
+		mm->hiwater_rss = _rss;
+}
+
+static inline void update_hiwater_vm(struct mm_struct *mm)
+{
+	if (mm->hiwater_vm < mm->total_vm)
+		mm->hiwater_vm = mm->total_vm;
+}

 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
 {
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/kernel/fork.c	2009-11-05 09:25:30.000000000 -0600
@@ -452,8 +452,6 @@ static struct mm_struct * mm_init(struct
 		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
 	mm->core_state = NULL;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, file_rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
@@ -480,7 +478,13 @@ struct mm_struct * mm_alloc(void)
 	mm = allocate_mm();
 	if (mm) {
 		memset(mm, 0, sizeof(*mm));
-		mm = mm_init(mm, current);
+		mm->rss = alloc_percpu(struct mm_counter);
+		if (mm->rss)
+			mm = mm_init(mm, current);
+		else {
+			free_mm(mm);
+			mm = NULL;
+		}
 	}
 	return mm;
 }
@@ -496,6 +500,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+	free_percpu(mm->rss);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -631,6 +636,9 @@ struct mm_struct *dup_mm(struct task_str
 		goto fail_nomem;

 	memcpy(mm, oldmm, sizeof(*mm));
+	mm->rss = alloc_percpu(struct mm_counter);
+	if (!mm->rss)
+		goto fail_nomem;

 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -661,15 +669,13 @@ free_pt:
 	mm->binfmt = NULL;
 	mmput(mm);

-fail_nomem:
-	return NULL;
-
 fail_nocontext:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
 	mm_free_pgd(mm);
+fail_nomem:
 	free_mm(mm);
 	return NULL;
 }
Index: linux-2.6/fs/proc/task_mmu.c
===================================================================
--- linux-2.6.orig/fs/proc/task_mmu.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/fs/proc/task_mmu.c	2009-11-05 09:22:37.000000000 -0600
@@ -65,11 +65,21 @@ unsigned long task_vsize(struct mm_struc
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	*shared = get_mm_counter(mm, file_rss);
+	int cpu;
+	int anon_rss = 0;
+	int file_rss = 0;
+
+	for_each_possible_cpu(cpu) {
+		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
+
+		anon_rss += c->anon;
+		file_rss += c->file;
+	}
+	*shared = file_rss;
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = *shared + get_mm_counter(mm, anon_rss);
+	*resident = *shared + anon_rss;
 	return mm->total_vm;
 }

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/filemap_xip.c	2009-11-05 09:22:37.000000000 -0600
@@ -194,7 +194,7 @@ retry:
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
 			page_remove_rmap(page);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/fremap.c	2009-11-05 09:22:37.000000000 -0600
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm
 			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		}
 	} else {
 		if (!pte_file(pte))
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/memory.c	2009-11-05 09:22:37.000000000 -0600
@@ -379,9 +379,9 @@ int __pte_alloc_kernel(pmd_t *pmd, unsig
 static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 {
 	if (file_rss)
-		add_mm_counter(mm, file_rss, file_rss);
+		__this_cpu_add(mm->rss->file, file_rss);
 	if (anon_rss)
-		add_mm_counter(mm, anon_rss, anon_rss);
+		__this_cpu_add(mm->rss->anon, anon_rss);
 }

 /*
@@ -1512,7 +1512,7 @@ static int insert_page(struct vm_area_st

 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter(mm, file_rss);
+	__this_cpu_inc(mm->rss->file);
 	page_add_file_rmap(page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));

@@ -2148,11 +2148,11 @@ gotten:
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter(mm, file_rss);
-				inc_mm_counter(mm, anon_rss);
+				__this_cpu_dec(mm->rss->file);
+				__this_cpu_inc(mm->rss->anon);
 			}
 		} else
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2579,7 +2579,7 @@ static int do_swap_page(struct mm_struct
 	 * discarded at swap_free().
 	 */

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -2663,7 +2663,7 @@ static int do_anonymous_page(struct mm_s
 	if (!pte_none(*page_table))
 		goto release;

-	inc_mm_counter(mm, anon_rss);
+	__this_cpu_inc(mm->rss->anon);
 	page_add_new_anon_rmap(page, vma, address);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
@@ -2817,10 +2817,10 @@ static int __do_fault(struct mm_struct *
 		if (flags & FAULT_FLAG_WRITE)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (anon) {
-			inc_mm_counter(mm, anon_rss);
+			__this_cpu_inc(mm->rss->anon);
 			page_add_new_anon_rmap(page, vma, address);
 		} else {
-			inc_mm_counter(mm, file_rss);
+			__this_cpu_inc(mm->rss->file);
 			page_add_file_rmap(page);
 			if (flags & FAULT_FLAG_WRITE) {
 				dirty_page = page;
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/rmap.c	2009-11-05 09:22:37.000000000 -0600
@@ -809,9 +809,9 @@ static int try_to_unmap_one(struct page

 	if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
 		if (PageAnon(page))
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		else
-			dec_mm_counter(mm, file_rss);
+			__this_cpu_dec(mm->rss->file);
 		set_pte_at(mm, address, pte,
 				swp_entry_to_pte(make_hwpoison_entry(page)));
 	} else if (PageAnon(page)) {
@@ -829,7 +829,7 @@ static int try_to_unmap_one(struct page
 					list_add(&mm->mmlist, &init_mm.mmlist);
 				spin_unlock(&mmlist_lock);
 			}
-			dec_mm_counter(mm, anon_rss);
+			__this_cpu_dec(mm->rss->anon);
 		} else if (PAGE_MIGRATION) {
 			/*
 			 * Store the pfn of the page in a special migration
@@ -847,7 +847,7 @@ static int try_to_unmap_one(struct page
 		entry = make_migration_entry(page, pte_write(pteval));
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 	} else
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);


 	page_remove_rmap(page);
@@ -967,7 +967,7 @@ static int try_to_unmap_cluster(unsigned

 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, file_rss);
+		__this_cpu_dec(mm->rss->file);
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/swapfile.c	2009-11-05 09:22:37.000000000 -0600
@@ -831,7 +831,7 @@ static int unuse_pte(struct vm_area_stru
 		goto out;
 	}

-	inc_mm_counter(vma->vm_mm, anon_rss);
+	__this_cpu_inc(vma->vm_mm->rss->anon);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
Index: linux-2.6/mm/init-mm.c
===================================================================
--- linux-2.6.orig/mm/init-mm.c	2009-11-05 09:22:17.000000000 -0600
+++ linux-2.6/mm/init-mm.c	2009-11-05 09:22:37.000000000 -0600
@@ -8,6 +8,8 @@
 #include <asm/atomic.h>
 #include <asm/pgtable.h>

+DEFINE_PER_CPU(struct mm_counter, init_mm_counters);
+
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -17,4 +19,5 @@ struct mm_struct init_mm = {
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
+	.rss		= &init_mm_counters,
 };


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-05 15:10     ` Christoph Lameter
@ 2009-11-05 23:42       ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-05 23:42 UTC (permalink / raw)
  To: Christoph Lameter; +Cc: hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:10:56 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> On Thu, 5 Nov 2009, KAMEZAWA Hiroyuki wrote:
> 
> > Hmm, I don't fully understand _new_ percpu but...
> > In theory (even if not realistic), x86-32 supports up to 512? cpus in Kconfig
> > with BIGSMP.
> 
> x86-32 only supports 32 processors. Plus per cpu areas are only allocated
> for the possible processors.
> 
My number is just from Kconfig.

> > Then, if 65536 processes run, this consumes
> >
> > 65536(nr_proc) * 8 (size) * 512(cpus) = 256MBytes.
> 
> With 32 possible cpus this results in 16MB of per-cpu space use.
> 
If swap_usage is added, 24MB, 25% of the vmalloc area.
(But, yes, returning -ENOMEM from fork() is ok with me; 65536 procs is extreme.)

Thanks,
-Kame



* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36       ` Christoph Lameter
@ 2009-11-06  1:11         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  1:11 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu V2
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> V1->V2
> - Remove useless and buggy per cpu counter initialization.
>   alloc_percpu already zeros the values.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 
Thanks. My one small concern is the read side.

This is the result of 'top -b -n 1' with 2000 processes (most of them just sleeping)
on my 8-cpu SMP box.

== [Before]
 Performance counter stats for 'top -b -n 1' (5 runs):

     406.690304  task-clock-msecs         #      0.442 CPUs    ( +-   3.327% )
             32  context-switches         #      0.000 M/sec   ( +-   0.000% )
              0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
            718  page-faults              #      0.002 M/sec   ( +-   0.000% )
      987832447  cycles                   #   2428.955 M/sec   ( +-   2.655% )
      933831356  instructions             #      0.945 IPC     ( +-   2.585% )
       17383990  cache-references         #     42.745 M/sec   ( +-   1.676% )
         353620  cache-misses             #      0.870 M/sec   ( +-   0.614% )

    0.920712639  seconds time elapsed   ( +-   1.609% )

== [After]
 Performance counter stats for 'top -b -n 1' (5 runs):

     675.926348  task-clock-msecs         #      0.568 CPUs    ( +-   0.601% )
             62  context-switches         #      0.000 M/sec   ( +-   1.587% )
              0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
           1095  page-faults              #      0.002 M/sec   ( +-   0.000% )
     1896320818  cycles                   #   2805.514 M/sec   ( +-   1.494% )
     1790600289  instructions             #      0.944 IPC     ( +-   1.333% )
       35406398  cache-references         #     52.382 M/sec   ( +-   0.876% )
         722781  cache-misses             #      1.069 M/sec   ( +-   0.192% )

    1.190605561  seconds time elapsed   ( +-   0.417% )

Because I know 'ps'-related workloads are used in various ways, "how this will
behave on large SMP systems" is my concern.

Maybe the usual use of 'ps -elf' does not read the RSS value and is not affected by this.
If this counter supported a single-thread mode (most apps are single threaded),
the impact would not be big.

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06  1:11         ` KAMEZAWA Hiroyuki
@ 2009-11-06  3:23           ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  3:23 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:
> This is the result of 'top -b -n 1' with 2000 processes(most of them just sleep)
> on my 8cpu, SMP box.
> 
> == [Before]
>  Performance counter stats for 'top -b -n 1' (5 runs):
> 
>      406.690304  task-clock-msecs         #      0.442 CPUs    ( +-   3.327% )
>              32  context-switches         #      0.000 M/sec   ( +-   0.000% )
>               0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
>             718  page-faults              #      0.002 M/sec   ( +-   0.000% )
>       987832447  cycles                   #   2428.955 M/sec   ( +-   2.655% )
>       933831356  instructions             #      0.945 IPC     ( +-   2.585% )
>        17383990  cache-references         #     42.745 M/sec   ( +-   1.676% )
>          353620  cache-misses             #      0.870 M/sec   ( +-   0.614% )
> 
>     0.920712639  seconds time elapsed   ( +-   1.609% )
> 
> == [After]
>  Performance counter stats for 'top -b -n 1' (5 runs):
> 
>      675.926348  task-clock-msecs         #      0.568 CPUs    ( +-   0.601% )
>              62  context-switches         #      0.000 M/sec   ( +-   1.587% )
>               0  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
>            1095  page-faults              #      0.002 M/sec   ( +-   0.000% )
>      1896320818  cycles                   #   2805.514 M/sec   ( +-   1.494% )
>      1790600289  instructions             #      0.944 IPC     ( +-   1.333% )
>        35406398  cache-references         #     52.382 M/sec   ( +-   0.876% )
>          722781  cache-misses             #      1.069 M/sec   ( +-   0.192% )
> 
>     1.190605561  seconds time elapsed   ( +-   0.417% )
> 
> Because I know 'ps' related workload is used in various ways, "How this will
> be in large smp" is my concern.
> 
> Maybe usual use of 'ps -elf' will not read RSS value and not affected by this.
> If this counter supports single-thread-mode (most of apps are single threaded),
> impact will not be big.
> 

I measured the extreme-case benefits with the attached program;
please see the # of page faults. Bigger is better.
Please let me know if my program is buggy.
Excuse:
My .config may not be tuned for an extreme performance challenge, and my host only has 8 cpus.
(memcg is enabled, hahaha...)

The # of page faults is not very stable (it is affected by task-clock-msecs),
but maybe we have some improvement.

I'd like to see the scores of "top" and this program on big servers...

BTW, can't we have a single-thread mode for this counter?
The read side of usual programs would get much benefit...


==[Before]==
 Performance counter stats for './multi-fault 8' (5 runs):

  474810.516710  task-clock-msecs         #      7.912 CPUs    ( +-   0.006% )
          10713  context-switches         #      0.000 M/sec   ( +-   2.529% )
              8  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
       16669105  page-faults              #      0.035 M/sec   ( +-   0.449% )
  1487101488902  cycles                   #   3131.989 M/sec   ( +-   0.012% )
   307164795479  instructions             #      0.207 IPC     ( +-   0.177% )
     2355518599  cache-references         #      4.961 M/sec   ( +-   0.420% )
      901969818  cache-misses             #      1.900 M/sec   ( +-   0.824% )

   60.008425257  seconds time elapsed   ( +-   0.004% )

==[After]==
 Performance counter stats for './multi-fault 8' (5 runs):

  474212.969563  task-clock-msecs         #      7.902 CPUs    ( +-   0.007% )
          10281  context-switches         #      0.000 M/sec   ( +-   0.156% )
              9  CPU-migrations           #      0.000 M/sec   ( +-   0.000% )
       16795696  page-faults              #      0.035 M/sec   ( +-   2.218% )
  1485411063159  cycles                   #   3132.371 M/sec   ( +-   0.014% )
   305810331186  instructions             #      0.206 IPC     ( +-   0.133% )
     2391293765  cache-references         #      5.043 M/sec   ( +-   0.737% )
      890490519  cache-misses             #      1.878 M/sec   ( +-   0.212% )

   60.010631769  seconds time elapsed   ( +-   0.004% )

Thanks,
-Kame

==

/*
 * multi-fault.c :: causes 60secs of parallel page faults in multiple threads.
 * % gcc -O2 -o multi-fault multi-fault.c -lpthread
 * % multi-fault <number of cpus>
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>	/* atoi */
#include <string.h>	/* strerror */
#include <unistd.h>	/* sleep */
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#define NR_THREADS	32
pthread_t threads[NR_THREADS];
/*
 * For avoiding contention in page table lock, FAULT area is
 * sparse. If FAULT_LENGTH is too large for your cpus, decrease it.
 */
#define MMAP_LENGTH	(8 * 1024 * 1024)
#define FAULT_LENGTH	(2 * 1024 * 1024)
void *mmap_area[NR_THREADS];
#define PAGE_SIZE	4096

pthread_barrier_t barrier;
int name[NR_THREADS];

void *worker(void *data)
{
	int cpu = *(int *)data;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	sched_setaffinity(0, sizeof(set), &set);
	pthread_barrier_wait(&barrier);

	while (1) {
		char *c;
		char *start = mmap_area[cpu];
		char *end = mmap_area[cpu] + FAULT_LENGTH;

		for (c = start; c < end; c += PAGE_SIZE)
			*c = 0;

		madvise(start, FAULT_LENGTH, MADV_DONTNEED);
	}
	return NULL;
}

int main(int argc, char *argv[])
{
	int i, num, ret;

	if (argc < 2)
		return 0;

	num = atoi(argv[1]);
	if (num > NR_THREADS)	/* stay within the fixed-size arrays */
		num = NR_THREADS;

	pthread_barrier_init(&barrier, NULL, num + 1);

	for (i = 0; i < num; i++) {
		name[i] = i;
		ret = pthread_create(&threads[i], NULL, worker, &name[i]);
		if (ret != 0) {	/* pthread_create returns an error number, not -1 */
			fprintf(stderr, "pthread create: %s\n", strerror(ret));
			return 0;
		}
		mmap_area[i] = mmap(NULL, MMAP_LENGTH,
				PROT_WRITE | PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	}
	pthread_barrier_wait(&barrier);	
	sleep(60);
	return 0;
}

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36       ` Christoph Lameter
@ 2009-11-06  4:08         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  4:08 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:
> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r = c->file + c->anon;

r += c->file + c->anon;
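
With that fix applied, the whole function reads (reassembled from the hunk
quoted above, for reference):

static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
	int cpu;
	unsigned long r = 0;

	for_each_possible_cpu(cpu) {
		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);

		r += c->file + c->anon;
	}

	return r;
}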

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-05 15:36       ` Christoph Lameter
@ 2009-11-06  4:15         ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06  4:15 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
Christoph Lameter <cl@linux-foundation.org> wrote:

> +static inline unsigned long get_mm_rss(struct mm_struct *mm)
> +{
> +	int cpu;
> +	unsigned long r = 0;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *c = per_cpu_ptr(mm->rss, cpu);
> +
> +		r = c->file + c->anon;
> +	}
> +
> +	return r;
> +}
> +
> +static inline void update_hiwater_rss(struct mm_struct *mm)
> +{
> +	unsigned long _rss = get_mm_rss(mm);
> +	if (mm->hiwater_rss < _rss)
> +		mm->hiwater_rss = _rss;
> +}
> +

I'm sorry that my replies are scattered.

Isn't it better to add some filter in the following path?

==
static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                enum ttu_flags flags)
{
<snip>
       /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
==

Thanks,
-Kame


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06  3:23           ` KAMEZAWA Hiroyuki
@ 2009-11-06 17:32             ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-06 17:32 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:

> BTW, can't we have single-thread-mode for this counter ?
> Usual program's read-side will get much benefit.....

Thanks for the measurements.

A single thread mode would be good. Ideas on how to add that would be
appreciated.

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 17:32             ` Christoph Lameter
@ 2009-11-06 19:03               ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 19:03 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

Christoph Lameter wrote:
> On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> BTW, can't we have single-thread-mode for this counter ?
>> Usual program's read-side will get much benefit.....
>
> Thanks for the measurements.
>
> A single thread mode would be good. Ideas on how to add that would be
> appreciated.
>

There may be some ways. At first thought:
==
struct usage_counter {
    long rss;
    long file;
};


struct mm_struct {
    ....
    atomic_long_t  rss;   /* only updated when usage_counter is NULL */
    atomic_long_t  file;  /* only updated when usage_counter is NULL */
    struct usage_counter *usage;  /* percpu counter used when
                                     multi-threaded */
    .....
}

And allocate mm->usage only when the first CLONE_THREAD is specified.

if (mm->usage)
    access per cpu
else
    atomic_long_xxx

and read operation will be

    val = atomic_read(mm->rss);
    if (mm->usage)
        for_each_possible_cpu()....
==
Does the "if" seem too costly?

If this idea is bad, another idea is to move the mm_counter from mm_struct
to task_struct and do a slow sync instead of percpu.

for example

struct task_struct {
    ....
    mm_counter_t temp_counter;
    ....
};

struct mm_struct {
    .....
    atomic_long_t rss;
    atomic_long_t file;
};

And add temp_counter's value into mm_struct at some good point... before
sleep?
kswapd and the reclaim routine can update mm_struct's counters directly.
Readers just read mm_struct's counters.
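
For concreteness, the first idea could look roughly like this (untested,
and the helper names are illustrative):
==
/* update side: single-threaded mms keep using the atomic counter */
static inline void add_rss(struct mm_struct *mm, long value)
{
	if (mm->usage)
		this_cpu_add(mm->usage->rss, value);
	else
		atomic_long_add(value, &mm->rss);
}

/* read side: base value plus any per-cpu deltas */
static inline unsigned long get_rss(struct mm_struct *mm)
{
	unsigned long val = atomic_long_read(&mm->rss);
	int cpu;

	if (mm->usage)
		for_each_possible_cpu(cpu)
			val += per_cpu_ptr(mm->usage, cpu)->rss;
	return val;
}
==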

Thanks,
-Kame

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:03               ` KAMEZAWA Hiroyuki
@ 2009-11-06 19:13                 ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-06 19:13 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> And allocate mm->usage only when the first CLONE_THREAD is specified.

Ok.

> if (mm->usage)
>     access per cpu
> else
>     atomic_long_xxx

If we just have one thread: Do we need atomic access at all?

> and read operation will be
>
>     val = atomic_read(mm->rss);
>     if (mm->usage)
>         for_each_possible_cpu()....

or
   val = mm->rss
   for_each_cpu(cpu) val += percpu ...


> ==
> Does the "if" seem too costly?

The above method would avoid the if.

> If this idea is bad, another idea is to move the mm_counter from mm_struct
> to task_struct and do a slow sync instead of percpu.

Yeah, then the access is effectively percpu as long as preemption is disabled.

But then for the mmap_writer_lock we would need to traverse a doubly
linked list to add up the counters. Bad caching on that one and we would
have to lock the list too. Sigh.

> kswapd and reclaim routine can update mm_struct's counter, directly.
> Readers just read mm_struct's counter.

That would work for the rss counters, but not for avoiding the rw semaphore, I guess.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:13                 ` Christoph Lameter
@ 2009-11-06 19:20                   ` KAMEZAWA Hiroyuki
  -1 siblings, 0 replies; 66+ messages in thread
From: KAMEZAWA Hiroyuki @ 2009-11-06 19:20 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, akpm, Tejun Heo

Christoph Lameter wrote:
> On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:
>
>> And allocate mm->usage only when the first CLONE_THREAD is specified.
>
> Ok.
>
>> if (mm->usage)
>>     access per cpu
>> else
>>     atomic_long_xxx
>
> If we just have one thread: Do we need atomic access at all?
>
Unfortunately, kswapd/vmscan touch this.

Thanks,
-Kame



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06 19:20                   ` KAMEZAWA Hiroyuki
@ 2009-11-06 19:47                     ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-06 19:47 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Dave Jones, hugh.dickins, linux-mm, linux-kernel, akpm, Tejun Heo

On Sat, 7 Nov 2009, KAMEZAWA Hiroyuki wrote:

> > If we just have one thread: Do we need atomic access at all?
> >
> Unfortunately, kswapd/vmscan touch this.

Right. And those can also occur from another processor that the process
has never run on before. Argh.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-06  1:11         ` KAMEZAWA Hiroyuki
@ 2009-11-10 22:44           ` Andrew Morton
  -1 siblings, 0 replies; 66+ messages in thread
From: Andrew Morton @ 2009-11-10 22:44 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: Christoph Lameter, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, Tejun Heo

On Fri, 6 Nov 2009 10:11:06 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> wrote:

> On Thu, 5 Nov 2009 10:36:06 -0500 (EST)
> Christoph Lameter <cl@linux-foundation.org> wrote:
> 
> > From: Christoph Lameter <cl@linux-foundation.org>
> > Subject: Make mm counters per cpu V2
> > 
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> > 
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> > 
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> > 
> > V1->V2
> > - Remove useless and buggy per cpu counter initialization.
> >   alloc_percpu already zeros the values.
> > 
> > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> > 
> Thanks. My small concern is read-side.

Me too.

For example, with 1000 possible CPUs (possible, not present and not
online), and 1000 processes, ps(1) will have to wallow through a
million cachelines in task_statm().

And then we have get_mm_rss(), which now will hit 1000 cachelines.  And
get_mm_rss() is called (via
account_user_time()->acct_update_integrals()) from the clock tick.

Adding a thousand cache misses to the timer interrupt is the sort of
thing which makes people unhappy?


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic V2
  2009-11-10 22:44           ` Andrew Morton
@ 2009-11-10 23:20             ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-10 23:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: KAMEZAWA Hiroyuki, Dave Jones, hugh.dickins, linux-mm,
	linux-kernel, Tejun Heo

On Tue, 10 Nov 2009, Andrew Morton wrote:

> Adding a thousand cache misses to the timer interrupt is the sort of
> thing which makes people unhappy?

Obviously I was hoping for new ideas instead of just restatements of the
problem.


^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-04 19:14 ` Christoph Lameter
@ 2009-11-17  6:48   ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-17  6:48 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> From: Christoph Lameter <cl@linux-foundation.org>
> Subject: Make mm counters per cpu
> 
> Changing the mm counters to per cpu counters is possible after the introduction
> of the generic per cpu operations (currently in percpu and -next).
> 
> With that the contention on the counters in mm_struct can be avoided. The
> USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> need to perform atomic updates to mm counters anymore. Various code paths
> can be simplified since per cpu counter updates are fast and batching
> of counter updates is no longer needed.
> 
> One price to pay for these improvements is the need to scan over all percpu
> counters when the actual count values are needed.
> 
> Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> 
> ---
>  fs/proc/task_mmu.c       |   14 +++++++++-
>  include/linux/mm_types.h |   16 ++++--------
>  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
>  kernel/fork.c            |   25 ++++++++++++++-----
>  mm/filemap_xip.c         |    2 -
>  mm/fremap.c              |    2 -
>  mm/init-mm.c             |    3 ++
>  mm/memory.c              |   20 +++++++--------
>  mm/rmap.c                |   10 +++----
>  mm/swapfile.c            |    2 -
>  10 files changed, 84 insertions(+), 71 deletions(-)
> 
> Index: linux-2.6/include/linux/mm_types.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> @@ -24,11 +24,10 @@ struct address_space;

> Index: linux-2.6/kernel/fork.c
> ===================================================================
> --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> 
>  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
>  {
> +	int cpu;
> +
>  	atomic_set(&mm->mm_users, 1);
>  	atomic_set(&mm->mm_count, 1);
>  	init_rwsem(&mm->mmap_sem);
> @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
>  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
>  	mm->core_state = NULL;
>  	mm->nr_ptes = 0;
> -	set_mm_counter(mm, file_rss, 0);
> -	set_mm_counter(mm, anon_rss, 0);
> +	for_each_possible_cpu(cpu) {
> +		struct mm_counter *m;
> +
> +		memset(m, sizeof(struct mm_counter), 0);
The above memset is wrong:
1) m isn't initialized;
2) the 2nd and the 3rd parameters should be interchanged.



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  6:48   ` Zhang, Yanmin
@ 2009-11-17  7:31     ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-17  7:31 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> > From: Christoph Lameter <cl@linux-foundation.org>
> > Subject: Make mm counters per cpu
> > 
> > Changing the mm counters to per cpu counters is possible after the introduction
> > of the generic per cpu operations (currently in percpu and -next).
> > 
> > With that the contention on the counters in mm_struct can be avoided. The
> > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > need to perform atomic updates to mm counters anymore. Various code paths
> > can be simplified since per cpu counter updates are fast and batching
> > of counter updates is no longer needed.
> > 
> > One price to pay for these improvements is the need to scan over all percpu
> > counters when the actual count values are needed.
> > 
> > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> > 
> > ---
> >  fs/proc/task_mmu.c       |   14 +++++++++-
> >  include/linux/mm_types.h |   16 ++++--------
> >  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
> >  kernel/fork.c            |   25 ++++++++++++++-----
> >  mm/filemap_xip.c         |    2 -
> >  mm/fremap.c              |    2 -
> >  mm/init-mm.c             |    3 ++
> >  mm/memory.c              |   20 +++++++--------
> >  mm/rmap.c                |   10 +++----
> >  mm/swapfile.c            |    2 -
> >  10 files changed, 84 insertions(+), 71 deletions(-)
> > 
> > Index: linux-2.6/include/linux/mm_types.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> > +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> > @@ -24,11 +24,10 @@ struct address_space;
> 
> > Index: linux-2.6/kernel/fork.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> > +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> > @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> > 
> >  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> >  {
> > +	int cpu;
> > +
> >  	atomic_set(&mm->mm_users, 1);
> >  	atomic_set(&mm->mm_count, 1);
> >  	init_rwsem(&mm->mmap_sem);
> > @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> >  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> >  	mm->core_state = NULL;
> >  	mm->nr_ptes = 0;
> > -	set_mm_counter(mm, file_rss, 0);
> > -	set_mm_counter(mm, anon_rss, 0);
> > +	for_each_possible_cpu(cpu) {
> > +		struct mm_counter *m;
> > +
> > +		memset(m, sizeof(struct mm_counter), 0);
> The above memset is wrong:
> 1) m isn't initialized;
> 2) the 2nd and the 3rd parameters should be interchanged.
Changing it to the code below fixes the command hang issue.

        for_each_possible_cpu(cpu) {
                struct mm_counter *m = per_cpu(mm->rss->readers, cpu);

                memset(m, 0, sizeof(struct mm_counter));
        }
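
Note: with the V2 posting quoted earlier in this thread, this initialization
loop goes away entirely, since alloc_percpu() already returns zeroed memory,
leaving mm_init() with something like the sketch below (error handling
abbreviated and illustrative only):

	mm->rss = alloc_percpu(struct mm_counter);
	if (!mm->rss)
		goto fail;	/* mm_init() would unwind its earlier setup */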



^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  7:31     ` Zhang, Yanmin
@ 2009-11-17  9:34       ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-17  9:34 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 15:31 +0800, Zhang, Yanmin wrote:
> On Tue, 2009-11-17 at 14:48 +0800, Zhang, Yanmin wrote:
> > On Wed, 2009-11-04 at 14:14 -0500, Christoph Lameter wrote:
> > > From: Christoph Lameter <cl@linux-foundation.org>
> > > Subject: Make mm counters per cpu
> > > 
> > > Changing the mm counters to per cpu counters is possible after the introduction
> > > of the generic per cpu operations (currently in percpu and -next).
> > > 
> > > With that the contention on the counters in mm_struct can be avoided. The
> > > USE_SPLIT_PTLOCKS case distinction can go away. Larger SMP systems do not
> > > need to perform atomic updates to mm counters anymore. Various code paths
> > > can be simplified since per cpu counter updates are fast and batching
> > > of counter updates is no longer needed.
> > > 
> > > One price to pay for these improvements is the need to scan over all percpu
> > > counters when the actual count values are needed.
> > > 
> > > Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
> > > 
> > > ---
> > >  fs/proc/task_mmu.c       |   14 +++++++++-
> > >  include/linux/mm_types.h |   16 ++++--------
> > >  include/linux/sched.h    |   61 ++++++++++++++++++++---------------------------
> > >  kernel/fork.c            |   25 ++++++++++++++-----
> > >  mm/filemap_xip.c         |    2 -
> > >  mm/fremap.c              |    2 -
> > >  mm/init-mm.c             |    3 ++
> > >  mm/memory.c              |   20 +++++++--------
> > >  mm/rmap.c                |   10 +++----
> > >  mm/swapfile.c            |    2 -
> > >  10 files changed, 84 insertions(+), 71 deletions(-)
> > > 
> > > Index: linux-2.6/include/linux/mm_types.h
> > > ===================================================================
> > > --- linux-2.6.orig/include/linux/mm_types.h	2009-11-04 13:08:33.000000000 -0600
> > > +++ linux-2.6/include/linux/mm_types.h	2009-11-04 13:13:42.000000000 -0600
> > > @@ -24,11 +24,10 @@ struct address_space;
> > 
> > > Index: linux-2.6/kernel/fork.c
> > > ===================================================================
> > > --- linux-2.6.orig/kernel/fork.c	2009-11-04 13:08:33.000000000 -0600
> > > +++ linux-2.6/kernel/fork.c	2009-11-04 13:14:19.000000000 -0600
> > > @@ -444,6 +444,8 @@ static void mm_init_aio(struct mm_struct
> > > 
> > >  static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
> > >  {
> > > +	int cpu;
> > > +
> > >  	atomic_set(&mm->mm_users, 1);
> > >  	atomic_set(&mm->mm_count, 1);
> > >  	init_rwsem(&mm->mmap_sem);
> > > @@ -452,8 +454,11 @@ static struct mm_struct * mm_init(struct
> > >  		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
> > >  	mm->core_state = NULL;
> > >  	mm->nr_ptes = 0;
> > > -	set_mm_counter(mm, file_rss, 0);
> > > -	set_mm_counter(mm, anon_rss, 0);
> > > +	for_each_possible_cpu(cpu) {
> > > +		struct mm_counter *m;
> > > +
> > > +		memset(m, sizeof(struct mm_counter), 0);
> > Above memset is wrong.
> > 1) m isn't initialized;
> > 2) It seems the 2nd and the 3rd parameters should be interchanged.
> Changing it to the following fixes the command hang issue.
> 
>         for_each_possible_cpu(cpu) {
>                 struct mm_counter *m = per_cpu(mm->rss->readers, cpu);
> 
>                 memset(m, 0, sizeof(struct mm_counter));
>         }

Sorry, I was too optimistic; I had booted another kernel by mistake.
The right change above should be:
 struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

The compiler doesn't report an error or warning no matter which member I use there.
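
So the initialization loop in mm_init() becomes (the snippet from my
previous mail, with per_cpu() replaced by per_cpu_ptr()):

        for_each_possible_cpu(cpu) {
                struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

                memset(m, 0, sizeof(struct mm_counter));
        }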

With the change, the command 'make oldconfig' and a boot command still
hang.

Yanmin

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17  9:34       ` Zhang, Yanmin
@ 2009-11-17 17:25         ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-17 17:25 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 17 Nov 2009, Zhang, Yanmin wrote:

> The right change above should be:
>  struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);

Right.

> With the change, the command 'make oldconfig' and a boot command still
> hang.

Not sure if it's worth spending more time on this, but if you want I will
consolidate the fixes so far and put out another patchset.

Where does it hang during boot?



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17 17:25         ` Christoph Lameter
@ 2009-11-19  0:48           ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-19  0:48 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> On Tue, 17 Nov 2009, Zhang, Yanmin wrote:
> 
> > The right change above should be:
> >  struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);
> 
> Right.
> 
> > With the change, the command 'make oldconfig' and a boot command still
> > hang.
> 
> Not sure if it's worth spending more time on this, but if you want I will
> consolidate the fixes so far and put out another patchset.
> 
> Where does it hang during boot?
> 
1) An init boot script calls pidof, and pidof hangs in
access_process_vm => (mutex_lock <=> mutex_unlock), so actually in
mm_reader_lock.
2) 'make oldconfig' hangs in sys_mmap => msleep, actually in mm_writer_lock.

I will check it today.

* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-17 17:25         ` Christoph Lameter
@ 2009-11-23  8:51           ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-23  8:51 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-17 at 12:25 -0500, Christoph Lameter wrote:
> On Tue, 17 Nov 2009, Zhang, Yanmin wrote:
> 
> > The right change above should be:
> >  struct mm_counter *m = per_cpu_ptr(mm->rss, cpu);
> 
> Right.
> 
> > With the change, the command 'make oldconfig' and a boot command still
> > hang.
> 
> Not sure if it's worth spending more time on this, but if you want I will
> consolidate the fixes so far and put out another patchset.
> 
> Where does it hang during boot?
Definitely embarrassing.

1) In function exec_mmap: in the 2nd 'if (old_mm) {' block, mm_reader_unlock
should be used; your patch uses mm_reader_lock. I found it when reviewing your
patch, but forgot to fix it when testing.
2) In function madvise: the last unlock should be mm_reader_unlock; your patch
uses mm_writer_unlock. Both fixes are sketched below.
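
As a sketch (context lines omitted; the exact arguments are whatever your
patch passes, which isn't quoted here):

        /* 1) exec_mmap(), 2nd 'if (old_mm) {' block */
        -       mm_reader_lock(old_mm);
        +       mm_reader_unlock(old_mm);

        /* 2) madvise(), final unlock */
        -       mm_writer_unlock(mm);
        +       mm_reader_unlock(mm);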

It's easy to hit the issues with normal testing. I'm surprised you didn't
hit them.

Another theoretical issue is the scenario below:
Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
it's scheduled back to cpu 0 and repeats the step. Eventually, the per-cpu
reader counter will overflow. With multiple threads, it might overflow even
faster than we imagine. When it overflows, processes will hang there.
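
Roughly, as an illustration (hypothetical field name; the real one is
whatever your patch uses for the per-cpu reader count):

        /* mm_reader_lock() always runs on cpu 0 ... */
        this_cpu_inc(mm->lock_readers);   /* cpu 0 count: 1, 2, 3, ... */
        /* ... while mm_reader_unlock() always runs on cpu 2 */
        this_cpu_dec(mm->lock_readers);   /* cpu 2 count: -1, -2, -3, ... */

The sum over all cpus stays zero, but each per-cpu counter drifts
monotonically and eventually wraps.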




* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-23  8:51           ` Zhang, Yanmin
@ 2009-11-23 14:31             ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-23 14:31 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Mon, 23 Nov 2009, Zhang, Yanmin wrote:

> Another theoretical issue is the scenario below:
> Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
> it's scheduled back to cpu 0 and repeats the step. Eventually, the per-cpu
> reader counter will overflow. With multiple threads, it might overflow even
> faster than we imagine. When it overflows, processes will hang there.

True... We need to find some alternative to per-cpu data to scale
mmap_sem then.


* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-23 14:31             ` Christoph Lameter
@ 2009-11-24  8:02               ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-24  8:02 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Mon, 2009-11-23 at 08:31 -0600, Christoph Lameter wrote:
> On Mon, 23 Nov 2009, Zhang, Yanmin wrote:
> 
> > Another theoretical issue is the scenario below:
> > Process A gets the read lock on cpu 0 and is scheduled to cpu 2 to unlock. Then
> > it's scheduled back to cpu 0 and repeats the step. Eventually, the per-cpu
> > reader counter will overflow. With multiple threads, it might overflow even
> > faster than we imagine. When it overflows, processes will hang there.
> 
> True... We need to find some alternative to per-cpu data to scale
> mmap_sem then.
I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone/
sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a
4*8*2 logical cpu machine, and didn't find a significant difference in results
with and without your patch.





* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-24  8:02               ` Zhang, Yanmin
@ 2009-11-24 15:17                 ` Christoph Lameter
  -1 siblings, 0 replies; 66+ messages in thread
From: Christoph Lameter @ 2009-11-24 15:17 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 24 Nov 2009, Zhang, Yanmin wrote:

> > True... We need to find some alternative to per-cpu data to scale
> > mmap_sem then.
> I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone/
> sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a
> 4*8*2 logical cpu machine, and didn't find a significant difference in results
> with and without your patch.

This affects loads that heavily use mmap_sem. You won't find many issues in
tests that don't run processes with a large thread count and cause lots of
faults or uses of get_user_pages(). The tests you list are not of that
nature.



* Re: [MM] Make mm counters per cpu instead of atomic
  2009-11-24 15:17                 ` Christoph Lameter
@ 2009-11-25  1:23                   ` Zhang, Yanmin
  -1 siblings, 0 replies; 66+ messages in thread
From: Zhang, Yanmin @ 2009-11-25  1:23 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: KAMEZAWA Hiroyuki, hugh.dickins, linux-mm, linux-kernel, akpm,
	Tejun Heo, Andi Kleen

On Tue, 2009-11-24 at 09:17 -0600, Christoph Lameter wrote:
> On Tue, 24 Nov 2009, Zhang, Yanmin wrote:
> 
> > > True... We need to find some alternative to per-cpu data to scale
> > > mmap_sem then.
> > I ran lots of benchmarks, such as specjbb2005/hackbench/tbench/dbench/iozone/
> > sysbench_oltp(mysql)/aim7, against the percpu tree (based on 2.6.32-rc7) on a
> > 4*8*2 logical cpu machine, and didn't find a significant difference in results
> > with and without your patch.
> 
> This affects loads that heavily use mmap_sem. You won't find many issues in
> tests that don't run processes with a large thread count and cause lots of
> faults or uses of get_user_pages(). The tests you list are not of that
> nature.
sysbench_oltp(mysql) is that kind of workload. Both sysbench and mysql are
multi-threaded. Two years ago, I investigated a scalability issue with this
workload and found that mysql causes frequent down_write(mm->mmap_sem); Nick
changed it to down_read to fix it.

But this workload doesn't scale well beyond 64 threads, because mysql has some
unreasonably big locks in userspace (implemented as a conditional spinlock in
userspace).

Yanmin



