* [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Tim Chen @ 2013-05-03 10:10 UTC
  To: Andrew Morton, Tejun Heo, Christoph Lameter, Al Viro
  Cc: Tim Chen, Eric Dumazet, Ric Mason, Simon Jeons, Dave Hansen,
	Andi Kleen, linux-kernel, linux-mm

Currently, there is a single global variable (percpu_counter_batch) that
controls the batch size for every 'struct percpu_counter' on the system.
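
For context: the batch is the per-cpu slack a counter may accumulate before
taking the shared lock.  A sketch of __percpu_counter_add(), paraphrased from
lib/percpu_counter.c of this era (treat the details as approximate):

void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
{
	s64 count;

	preempt_disable();
	count = __this_cpu_read(*fbc->counters) + amount;
	if (count >= batch || count <= -batch) {
		/* per-cpu delta reached the batch: fold into the global count */
		raw_spin_lock(&fbc->lock);
		fbc->count += count;
		__this_cpu_write(*fbc->counters, 0);
		raw_spin_unlock(&fbc->lock);
	} else {
		/* common case: only the local per-cpu counter is touched */
		__this_cpu_write(*fbc->counters, count);
	}
	preempt_enable();
}

A larger batch means fewer trips to the shared lock at the cost of a less
precise global count, which is the trade-off this series tunes.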

However, for some applications, e.g. memory accounting, it is more
appropriate to scale the batch size according to the memory size.
This patch adds the infrastructure to change the batch size for each
individual instance of 'struct percpu_counter'.
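
As a usage sketch (hypothetical my_* names; only
percpu_counter_and_batch_init() and the hotplug recompute behavior come
from this patch):

static struct percpu_counter my_counter;

/* example policy: scale the batch with memory per cpu, as patch 2/2 does */
static int my_compute_batch(void)
{
	return (int) (totalram_pages / num_online_cpus()) / 256;
}

static int __init my_counter_init(void)
{
	/* the hotcpu callback below re-runs my_compute_batch() on cpu hotplug */
	return percpu_counter_and_batch_init(&my_counter, 0, &my_compute_batch);
}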

v2:
1. Change batch from pointer to static value.
2. Move the list of percpu_counters out of CONFIG_HOTPLUG_CPU, so the
batch size can be updated.

Thanks for the feedback on version 1 of this patch.

Tim

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/percpu_counter.h | 25 +++++++++++++++---
 lib/percpu_counter.c           | 57 +++++++++++++++++++++++++++---------------
 2 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d5dd465..e9f77cd 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -18,10 +18,10 @@
 struct percpu_counter {
 	raw_spinlock_t lock;
 	s64 count;
-#ifdef CONFIG_HOTPLUG_CPU
 	struct list_head list;	/* All percpu_counters are on a list */
-#endif
 	s32 __percpu *counters;
+	int batch;
+	int (*compute_batch)(void);
 };
 
 extern int percpu_counter_batch;
@@ -40,11 +40,24 @@ void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
+void __percpu_counter_batch_resize(struct percpu_counter *fbc);
 int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs);
 
+static inline int percpu_counter_and_batch_init(struct percpu_counter *fbc,
+			s64 amount, int (*compute_batch)(void))
+{
+	int ret = percpu_counter_init(fbc, amount);
+
+	if (compute_batch && !ret) {
+		fbc->compute_batch = compute_batch;
+		__percpu_counter_batch_resize(fbc);
+	}
+	return ret;
+}
+
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
-	__percpu_counter_add(fbc, amount, percpu_counter_batch);
+	__percpu_counter_add(fbc, amount, fbc->batch);
 }
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
@@ -95,6 +108,12 @@ static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount)
 	return 0;
 }
 
+static inline int percpu_counter_and_batch_init(struct percpu_counter *fbc,
+			s64 amount, int (*compute_batch)(void))
+{
+	return percpu_counter_init(fbc, amount);
+}
+
 static inline void percpu_counter_destroy(struct percpu_counter *fbc)
 {
 }
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index ba6085d..4ad9e18 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -10,10 +10,8 @@
 #include <linux/module.h>
 #include <linux/debugobjects.h>
 
-#ifdef CONFIG_HOTPLUG_CPU
 static LIST_HEAD(percpu_counters);
 static DEFINE_SPINLOCK(percpu_counters_lock);
-#endif
 
 #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER
 
@@ -116,17 +114,17 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
 	lockdep_set_class(&fbc->lock, key);
 	fbc->count = amount;
 	fbc->counters = alloc_percpu(s32);
+	fbc->batch = percpu_counter_batch;
+	fbc->compute_batch = NULL;
 	if (!fbc->counters)
 		return -ENOMEM;
 
 	debug_percpu_counter_activate(fbc);
 
-#ifdef CONFIG_HOTPLUG_CPU
 	INIT_LIST_HEAD(&fbc->list);
 	spin_lock(&percpu_counters_lock);
 	list_add(&fbc->list, &percpu_counters);
 	spin_unlock(&percpu_counters_lock);
-#endif
 	return 0;
 }
 EXPORT_SYMBOL(__percpu_counter_init);
@@ -138,11 +136,9 @@ void percpu_counter_destroy(struct percpu_counter *fbc)
 
 	debug_percpu_counter_deactivate(fbc);
 
-#ifdef CONFIG_HOTPLUG_CPU
 	spin_lock(&percpu_counters_lock);
 	list_del(&fbc->list);
 	spin_unlock(&percpu_counters_lock);
-#endif
 	free_percpu(fbc->counters);
 	fbc->counters = NULL;
 }
@@ -158,31 +154,45 @@ static void compute_batch_value(void)
 	percpu_counter_batch = max(32, nr*2);
 }
 
+void __percpu_counter_batch_resize(struct percpu_counter *fbc)
+{
+	unsigned long flags;
+	int new_batch;
+
+	if (fbc->compute_batch)
+		new_batch = max(fbc->compute_batch(), percpu_counter_batch);
+	else
+		new_batch = percpu_counter_batch;
+
+	raw_spin_lock_irqsave(&fbc->lock, flags);
+	fbc->batch = new_batch;
+	raw_spin_unlock_irqrestore(&fbc->lock, flags);
+}
+EXPORT_SYMBOL(__percpu_counter_batch_resize);
+
 static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
 					unsigned long action, void *hcpu)
 {
-#ifdef CONFIG_HOTPLUG_CPU
 	unsigned int cpu;
 	struct percpu_counter *fbc;
 
 	compute_batch_value();
-	if (action != CPU_DEAD)
-		return NOTIFY_OK;
-
 	cpu = (unsigned long)hcpu;
 	spin_lock(&percpu_counters_lock);
 	list_for_each_entry(fbc, &percpu_counters, list) {
-		s32 *pcount;
-		unsigned long flags;
-
-		raw_spin_lock_irqsave(&fbc->lock, flags);
-		pcount = per_cpu_ptr(fbc->counters, cpu);
-		fbc->count += *pcount;
-		*pcount = 0;
-		raw_spin_unlock_irqrestore(&fbc->lock, flags);
+		__percpu_counter_batch_resize(fbc);
+		if (action == CPU_DEAD) {
+			s32 *pcount;
+			unsigned long flags;
+
+			raw_spin_lock_irqsave(&fbc->lock, flags);
+			pcount = per_cpu_ptr(fbc->counters, cpu);
+			fbc->count += *pcount;
+			*pcount = 0;
+			raw_spin_unlock_irqrestore(&fbc->lock, flags);
+		}
 	}
 	spin_unlock(&percpu_counters_lock);
-#endif
 	return NOTIFY_OK;
 }
 
@@ -196,7 +206,7 @@ int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 
 	count = percpu_counter_read(fbc);
 	/* Check to see if rough count will be sufficient for comparison */
-	if (abs(count - rhs) > (percpu_counter_batch*num_online_cpus())) {
+	if (abs(count - rhs) > (fbc->batch*num_online_cpus())) {
 		if (count > rhs)
 			return 1;
 		else
@@ -215,7 +225,14 @@ EXPORT_SYMBOL(percpu_counter_compare);
 
 static int __init percpu_counter_startup(void)
 {
+	struct percpu_counter *fbc;
+
 	compute_batch_value();
+	spin_lock(&percpu_counters_lock);
+	list_for_each_entry(fbc, &percpu_counters, list) {
+		__percpu_counter_batch_resize(fbc);
+	}
+	spin_unlock(&percpu_counters_lock);
 	hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
 	return 0;
 }
-- 
1.7.11.7


* [PATCH v2 2/2] Make batch size for memory accounting configured according to size of memory
From: Tim Chen @ 2013-05-03 10:10 UTC
  To: Andrew Morton, Tejun Heo, Christoph Lameter, Al Viro
  Cc: Tim Chen, Eric Dumazet, Ric Mason, Simon Jeons, Dave Hansen,
	Andi Kleen, linux-kernel, linux-mm

Currently the per cpu counter's batch size for memory accounting is
configured as twice the number of cpus in the system.  However,
for systems with very large memory, it is more appropriate to make it
proportional to the memory size per cpu in the system.

For example, for an x86_64 system with 64 cpus and 128 GB of memory,
the batch size is only 2*64 pages (0.5 MB).  So any memory accounting
change of more than 0.5 MB will overflow the per cpu counter into
the global counter.  With the new scheme, the batch size is instead
configured to be 0.4% of the memory per cpu = 8 MB (128 GB / 64 / 256),
which is more in line with the memory size.
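
Spelled out, assuming 4 KB pages:

128 GB / 4 KB per page     = 33,554,432 pages
33,554,432 pages / 64 cpus =    524,288 pages per cpu
524,288 pages / 256        =      2,048 pages = 8 MB per-cpu batch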

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 mm/mmap.c  | 11 ++++++++++-
 mm/nommu.c | 11 ++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 0db0de1..6c1fcd09 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3090,10 +3090,19 @@ void mm_drop_all_locks(struct mm_struct *mm)
 /*
  * initialise the VMA slab
  */
+static int mm_compute_batch(void)
+{
+	int nr = num_online_cpus();
+
+	/* batch size set to 0.4% of (total memory/#cpus) */
+	return (int) (totalram_pages/nr) / 256;
+}
+
 void __init mmap_init(void)
 {
 	int ret;
 
-	ret = percpu_counter_init(&vm_committed_as, 0);
+	ret = percpu_counter_and_batch_init(&vm_committed_as, 0,
+						&mm_compute_batch);
 	VM_BUG_ON(ret);
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 2f3ea74..2a250d3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -526,11 +526,20 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 /*
  * initialise the VMA and region record slabs
  */
+static int mm_compute_batch(void)
+{
+	int nr = num_online_cpus();
+
+	/* batch size set to 0.4% of (total memory/#cpus) */
+	return (int) (totalram_pages/nr) / 256;
+}
+
 void __init mmap_init(void)
 {
 	int ret;
 
-	ret = percpu_counter_init(&vm_committed_as, 0);
+	ret = percpu_counter_and_batch_init(&vm_committed_as, 0,
+				&mm_compute_batch);
 	VM_BUG_ON(ret);
 	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
-- 
1.7.11.7


* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Andrew Morton @ 2013-05-21 20:41 UTC
  To: Tim Chen
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Fri,  3 May 2013 03:10:52 -0700 Tim Chen <tim.c.chen@linux.intel.com> wrote:

> Currently, there is a single global variable (percpu_counter_batch) that
> controls the batch size for every 'struct percpu_counter' on the system.
> 
> However, for some applications, e.g. memory accounting, it is more
> appropriate to scale the batch size according to the memory size.
> This patch adds the infrastructure to change the batch size for each
> individual instance of 'struct percpu_counter'.
> 

This patch seems to add rather a lot of unnecessary code.

- The increase in the size of percpu_counter is regrettable.

- The change to percpu_counter_startup() is unneeded - no
  percpu_counters should exist at this time.  (We may have screwed this
  up - percpu_counter_startup() should probably be explicitly called
  from start_kernel()).

- Once the percpu_counter_startup() change is removed, all that code
  which got moved out of CONFIG_HOTPLUG_CPU can be put back.

And probably other stuff.


If you want to use a larger batch size for vm_committed_as, why not
just use the existing __percpu_counter_add(..., batch)?  Easy.
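
For reference, a minimal sketch of that alternative (hypothetical my_*
names; __percpu_counter_add() taking an explicit batch is the API already
declared in include/linux/percpu_counter.h):

/* the caller owns its batch value; struct percpu_counter is unchanged */
static int my_batch = 32;	/* recompute whenever the caller likes */

static inline void my_acct(struct percpu_counter *fbc, s64 amount)
{
	__percpu_counter_add(fbc, amount, my_batch);
}

Tim's follow-up below applies exactly this shape to vm_committed_as.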



* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Tim Chen @ 2013-05-21 23:27 UTC
  To: Andrew Morton
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Tue, 2013-05-21 at 13:41 -0700, Andrew Morton wrote:

> This patch seems to add rather a lot of unnecessary code.
> 
> - The increase in the size of percu_counter is regrettable.
> 
> - The change to percpu_counter_startup() is unneeded - no
>   percpu_counters should exist at this time.  (We may have screwed this
>   up - percpu_counter_startup() shuold probably be explicitly called
>   from start_kernel()).
> 
> - Once the percpu_counter_startup() change is removed, all that code
>   which got moved out of CONFIG_HOTPLUG_CPU can be put back.
> 
> And probably other stuff.
> 
> 
> If you want to use a larger batch size for vm_committed_as, why not
> just use the existing __percpu_counter_add(..., batch)?  Easy.
> 

Andrew,

Thanks for your comments and reviews.
Will something like the following work if we get rid of the percpu
counter changes and use __percpu_counter_add(..., batch)?  In a
benchmark with a lot of memory changes via brk, this makes quite
a difference when we go to a bigger batch size.

Tim

Change batch size for memory accounting to be proportional to memory available.

Currently the per cpu counter's batch size for memory accounting is
configured as twice the number of cpus in the system.  However,
for systems with very large memory, it is more appropriate to make it
proportional to the memory size per cpu in the system.

For example, for an x86_64 system with 64 cpus and 128 GB of memory,
the batch size is only 2*64 pages (0.5 MB).  So any memory accounting
change of more than 0.5 MB will overflow the per cpu counter into
the global counter.  With the new scheme, the batch size is instead
configured to be 0.4% of the memory per cpu = 8 MB (128 GB / 64 / 256),
which is more in line with the memory size.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/mman.h |  5 +++++
 mm/mmap.c            | 14 ++++++++++++++
 mm/nommu.c           | 14 ++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9aa863d..11d5ce9 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -10,12 +10,17 @@
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern struct percpu_counter vm_committed_as;
+extern int vm_committed_as_batch;
 
 unsigned long vm_memory_committed(void);
 
 static inline void vm_acct_memory(long pages)
 {
+#ifdef CONFIG_SMP
+	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
+#else
 	percpu_counter_add(&vm_committed_as, pages);
+#endif
 }
 
 static inline void vm_unacct_memory(long pages)
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..0eef503 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3145,11 +3145,25 @@ void mm_drop_all_locks(struct mm_struct *mm)
 /*
  * initialise the VMA slab
  */
+
+int vm_committed_as_batch;
+EXPORT_SYMBOL(vm_committed_as_batch);
+
+static int mm_compute_batch(void)
+{
+	int nr = num_present_cpus();
+	int batch = max(32, nr*2);
+
+	/* batch size set to 0.4% of (total memory/#cpus) */
+	return max((int) (totalram_pages/nr) / 256, batch);
+}
+
 void __init mmap_init(void)
 {
 	int ret;
 
 	ret = percpu_counter_init(&vm_committed_as, 0);
+	vm_committed_as_batch = mm_compute_batch();
 	VM_BUG_ON(ret);
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884d..1b7008a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -527,11 +527,25 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 /*
  * initialise the VMA and region record slabs
  */
+
+int vm_committed_as_batch;
+EXPORT_SYMBOL(vm_committed_as_batch);
+
+static int mm_compute_batch(void)
+{
+	int nr = num_present_cpus();
+	int batch = max(32, nr*2);
+
+	/* batch size set to 0.4% of (total memory/#cpus) */
+	return max((int) (totalram_pages/nr) / 256, batch);
+}
+
 void __init mmap_init(void)
 {
 	int ret;
 
 	ret = percpu_counter_init(&vm_committed_as, 0);
+	vm_committed_as_batch = mm_compute_batch();
 	VM_BUG_ON(ret);
 	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
-- 
1.7.11.7





* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Andrew Morton @ 2013-05-21 23:41 UTC
  To: Tim Chen
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Tue, 21 May 2013 16:27:29 -0700 Tim Chen <tim.c.chen@linux.intel.com> wrote:

> Will something like the following work if we get rid of the percpu
> counter changes and use __percpu_counter_add(..., batch)?  In a
> benchmark with a lot of memory changes via brk, this makes quite
> a difference when we go to a bigger batch size.

That looks pretty close.

> Tim
> 
> Change batch size for memory accounting to be proportional to memory available.
> 
> Currently the per cpu counter's batch size for memory accounting is
> configured as twice the number of cpus in the system.  However,
> for systems with very large memory, it is more appropriate to make it
> proportional to the memory size per cpu in the system.
> 
> For example, for an x86_64 system with 64 cpus and 128 GB of memory,
> the batch size is only 2*64 pages (0.5 MB).  So any memory accounting
> change of more than 0.5 MB will overflow the per cpu counter into
> the global counter.  With the new scheme, the batch size is instead
> configured to be 0.4% of the memory per cpu = 8 MB (128 GB / 64 / 256),
> which is more in line with the memory size.
> 
> Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
> ---
>  include/linux/mman.h |  5 +++++
>  mm/mmap.c            | 14 ++++++++++++++
>  mm/nommu.c           | 14 ++++++++++++++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index 9aa863d..11d5ce9 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -10,12 +10,17 @@
>  extern int sysctl_overcommit_memory;
>  extern int sysctl_overcommit_ratio;
>  extern struct percpu_counter vm_committed_as;
> +extern int vm_committed_as_batch;
>  
>  unsigned long vm_memory_committed(void);
>  
>  static inline void vm_acct_memory(long pages)
>  {
> +#ifdef CONFIG_SMP
> +	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
> +#else
>  	percpu_counter_add(&vm_committed_as, pages);
> +#endif
>  }

I think we could use __percpu_counter_add() unconditionally here and
just do

#ifdef CONFIG_SMP
int vm_committed_as_batch;
#else
#define vm_committed_as_batch 0
#endif

The EXPORT_SYMBOL(vm_committed_as_batch) is unneeded.

> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3145,11 +3145,25 @@ void mm_drop_all_locks(struct mm_struct *mm)
>  /*
>   * initialise the VMA slab
>   */
> +
> +int vm_committed_as_batch;
> +EXPORT_SYMBOL(vm_committed_as_batch);
> +
> +static int mm_compute_batch(void)
> +{
> +	int nr = num_present_cpus();
> +	int batch = max(32, nr*2);
> +
> +	/* batch size set to 0.4% of (total memory/#cpus) */
> +	return max((int) (totalram_pages/nr) / 256, batch);
> +}

Change this to do the assignment to vm_committed_as_batch then put this
code inside #ifdef CONFIG_SMP and do

#else	/* CONFIG_SMP */
static inline void mm_compute_batch(void)
{
}
#endif

>  void __init mmap_init(void)
>  {
>  	int ret;
>  
>  	ret = percpu_counter_init(&vm_committed_as, 0);
> +	vm_committed_as_batch = mm_compute_batch();

This becomes just

	mm_compute_batch();

>  	VM_BUG_ON(ret);
>  }
>  
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 298884d..1b7008a 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -527,11 +527,25 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
>  /*
>   * initialise the VMA and region record slabs
>   */
> +
> +int vm_committed_as_batch;
> +EXPORT_SYMBOL(vm_committed_as_batch);
> +
> +static int mm_compute_batch(void)
> +{
> +	int nr = num_present_cpus();
> +	int batch = max(32, nr*2);
> +
> +	/* batch size set to 0.4% of (total memory/#cpus) */
> +	return max((int) (totalram_pages/nr) / 256, batch);
> +}
> +
>  void __init mmap_init(void)
>  {
>  	int ret;
>  
>  	ret = percpu_counter_init(&vm_committed_as, 0);
> +	vm_committed_as_batch = mm_compute_batch();
>  	VM_BUG_ON(ret);
>  	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);

I'm not sure that CONFIG_MMU=n && CONFIG_SMP=y even exists.  Perhaps it
does.  But there's no point in ruling out that option here.

The nommu code becomes identical to the mmu code so we should put it in
a shared file.  I suppose mmap.c would be as good a place as any.

We could make mm_compute_batch() __init and call it from mm_init(). 
But really it should be __meminit and there should be a memory-hotplug
notifier handler which adjusts vm_committed_as_batch's value.




* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Tim Chen @ 2013-05-22  0:43 UTC
  To: Andrew Morton
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Tue, 2013-05-21 at 16:41 -0700, Andrew Morton wrote:

> I think we could use __percpu_counter_add() unconditionally here and
> just do
> 
> #ifdef CONFIG_SMP
> int vm_committed_as_batch;
> #else
> #define vm_committed_as_batch 0
> #endif
> 
> The EXPORT_SYMBOL(vm_committed_as_batch) is unneeded.
> 

Thanks.  I've made the changes suggested.

> >  void __init mmap_init(void)
> >  {
> >  	int ret;
> >  
> >  	ret = percpu_counter_init(&vm_committed_as, 0);
> > +	vm_committed_as_batch = mm_compute_batch();
> >  	VM_BUG_ON(ret);
> >  	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
> 
> I'm not sure that CONFIG_MMU=n && CONFIG_SMP=y even exists.  Perhaps it
> does.  But there's no point in ruling out that option here.
> 
> The nommu code becomes identical to the mmu code so we should put it in
> a shared file.  I suppose mmap.c would be as good a place as any.
> 

I've consolidated things in mman.h.

> We could make mm_compute_batch() __init and call it from mm_init(). 
> But really it should be __meminit and there should be a memory-hotplug
> notifier handler which adjusts vm_committed_as_batch's value.
> 

I'll spin off another version of the patch later to add the
memory-hotplug notifier.  In the meantime, does the following look
good to you?

Thanks.

Tim

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/mman.h | 20 +++++++++++++++++++-
 mm/mmap.c            |  4 ++++
 mm/nommu.c           |  4 ++++
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9aa863d..443bcae 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -10,12 +10,30 @@
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern struct percpu_counter vm_committed_as;
+#ifdef CONFIG_SMP
+extern int vm_committed_as_batch;
+
+static inline void mm_compute_batch(void)
+{
+        int nr = num_present_cpus();
+        int batch = max(32, nr*2);
+
+        /* batch size set to 0.4% of (total memory/#cpus) */
+        vm_committed_as_batch = max((int) (totalram_pages/nr) / 256, batch);
+}
+#else
+#define vm_committed_as_batch 0
+
+static inline void mm_compute_batch(void)
+{
+}
+#endif
 
 unsigned long vm_memory_committed(void);
 
 static inline void vm_acct_memory(long pages)
 {
-	percpu_counter_add(&vm_committed_as, pages);
+	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
 }
 
 static inline void vm_unacct_memory(long pages)
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e18..55c8773 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3145,11 +3145,15 @@ void mm_drop_all_locks(struct mm_struct *mm)
 /*
  * initialise the VMA slab
  */
+
+int vm_committed_as_batch;
+
 void __init mmap_init(void)
 {
 	int ret;
 
 	ret = percpu_counter_init(&vm_committed_as, 0);
+	mm_compute_batch();
 	VM_BUG_ON(ret);
 }
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884d..9ad16ba 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -527,11 +527,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 /*
  * initialise the VMA and region record slabs
  */
+
+int vm_committed_as_batch;
+
 void __init mmap_init(void)
 {
 	int ret;
 
 	ret = percpu_counter_init(&vm_committed_as, 0);
+	mm_compute_batch();
 	VM_BUG_ON(ret);
 	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
-- 
1.7.11.7





* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Andrew Morton @ 2013-05-22  7:20 UTC
  To: Tim Chen
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Tue, 21 May 2013 17:43:10 -0700 Tim Chen <tim.c.chen@linux.intel.com> wrote:

> 
> I'll spin off another version of the patch later to add the
> memory-hotplug notifier.  In the mean time, does the following looks
> good to you?
> 
> ...
>
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -10,12 +10,30 @@
>  extern int sysctl_overcommit_memory;
>  extern int sysctl_overcommit_ratio;
>  extern struct percpu_counter vm_committed_as;
> +#ifdef CONFIG_SMP
> +extern int vm_committed_as_batch;
> +
> +static inline void mm_compute_batch(void)
> +{
> +        int nr = num_present_cpus();
> +        int batch = max(32, nr*2);
> +
> +        /* batch size set to 0.4% of (total memory/#cpus) */
> +        vm_committed_as_batch = max((int) (totalram_pages/nr) / 256, batch);

Use max_t() here.

That expression will overflow when the machine has two exabytes of RAM ;)

> +}
> +#else
> +#define vm_committed_as_batch 0
> +
> +static inline void mm_compute_batch(void)
> +{
> +}
> +#endif

I think it would be better if all the above was not inlined.  There's
no particular reason to inline it, and putting it here requires that
mman.h include a bunch more header files (which the patch forgot to
do).

>  unsigned long vm_memory_committed(void);
>  
>  static inline void vm_acct_memory(long pages)
>  {
> -	percpu_counter_add(&vm_committed_as, pages);
> +	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
>  }
>  
>  static inline void vm_unacct_memory(long pages)
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f681e18..55c8773 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3145,11 +3145,15 @@ void mm_drop_all_locks(struct mm_struct *mm)
>  /*
>   * initialise the VMA slab
>   */
> +
> +int vm_committed_as_batch;
> +
>  void __init mmap_init(void)
>  {
>  	int ret;
>  
>  	ret = percpu_counter_init(&vm_committed_as, 0);
> +	mm_compute_batch();
>  	VM_BUG_ON(ret);
>  }
>  
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 298884d..9ad16ba 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -527,11 +527,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
>  /*
>   * initialise the VMA and region record slabs
>   */
> +
> +int vm_committed_as_batch;

This definition duplicates the one in mmap.c?

>  void __init mmap_init(void)
>  {
>  	int ret;
>  
>  	ret = percpu_counter_init(&vm_committed_as, 0);
> +	mm_compute_batch();
>  	VM_BUG_ON(ret);
>  	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
>  }



* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
From: Tim Chen @ 2013-05-22 23:37 UTC
  To: Andrew Morton
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Wed, 2013-05-22 at 00:20 -0700, Andrew Morton wrote:

> > +#ifdef CONFIG_SMP
> > +extern int vm_committed_as_batch;
> > +
> > +static inline void mm_compute_batch(void)
> > +{
> > +        int nr = num_present_cpus();
> > +        int batch = max(32, nr*2);
> > +
> > +        /* batch size set to 0.4% of (total memory/#cpus) */
> > +        vm_committed_as_batch = max((int) (totalram_pages/nr) / 256, batch);
> 
> Use max_t() here.
> That expression will overflow when the machine has two exabytes of RAM ;)
> 

I've updated the computation to use max_t and also added a check for
overflow.

> > +}
> > +#else
> > +#define vm_committed_as_batch 0
> > +
> > +static inline void mm_compute_batch(void)
> > +{
> > +}
> > +#endif
> 
> I think it would be better if all the above was not inlined.  There's
> no particular reason to inline it, and putting it here requires that
> mman.h include a bunch more header files (which the patch forgot to
> do).
> 

Now mm_compute_batch() has been moved to mm_init.c and is no longer
inlined.  Also, a memory hotplug notifier is registered to recompute the
batch size when the memory size changes.

> > diff --git a/mm/nommu.c b/mm/nommu.c
> > index 298884d..9ad16ba 100644
> > --- a/mm/nommu.c
> > +++ b/mm/nommu.c
> > @@ -527,11 +527,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
> >  /*
> >   * initialise the VMA and region record slabs
> >   */
> > +
> > +int vm_committed_as_batch;
> 
> This definition duplicates the one in mmap.c?

Only one copy defined in mm_init.c now.

Thanks

Tim

Currently the per cpu counter's batch size for memory accounting is
configured as twice the number of cpus in the system.  However,
for systems with very large memory, it is more appropriate to make it
proportional to the memory size per cpu in the system.

For example, for an x86_64 system with 64 cpus and 128 GB of memory,
the batch size is only 2*64 pages (0.5 MB).  So any memory accounting
change of more than 0.5 MB will overflow the per cpu counter into
the global counter.  Instead, under the new scheme, the batch size
is configured to be about 0.4% of the memory per cpu = 8 MB
(128 GB / 64 / 256), which is more in line with the memory size.
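
For illustration, here is a minimal userspace sketch of the sizing
rule above (hypothetical demo code with made-up inputs, not part of
the patch):

	/* Demo of: batch = max(2*ncpus, 32, (pages/ncpus)/256),
	 * with the memory-sized term clamped to the s32 range. */
	#include <stdio.h>
	#include <stdint.h>

	static int32_t compute_batch(uint64_t totalram_pages, int ncpus)
	{
		uint64_t cpu_floor = 2 * ncpus > 32 ? 2 * (uint64_t)ncpus : 32;
		uint64_t memsized = (totalram_pages / ncpus) / 256;

		if (memsized > 0x7fffffff)	/* overflow clamp */
			memsized = 0x7fffffff;
		if (memsized < cpu_floor)
			memsized = cpu_floor;
		return (int32_t)memsized;
	}

	int main(void)
	{
		/* 128 GB of 4KB pages on 64 cpus -> 2048 pages = 8 MB */
		printf("%d pages\n", compute_batch(128ULL << 18, 64));
		return 0;
	}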

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/mman.h |  8 +++++++-
 mm/mm_init.c         | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9aa863d..92dc257 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -11,11 +11,17 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern struct percpu_counter vm_committed_as;
 
+#ifdef CONFIG_SMP
+extern s32 vm_committed_as_batch;
+#else
+#define vm_committed_as_batch 0
+#endif
+
 unsigned long vm_memory_committed(void);
 
 static inline void vm_acct_memory(long pages)
 {
-	percpu_counter_add(&vm_committed_as, pages);
+	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
 }
 
 static inline void vm_unacct_memory(long pages)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02..bfb9034 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
 #include <linux/init.h>
 #include <linux/kobject.h>
 #include <linux/export.h>
+#include <linux/memory.h>
+#include <linux/notifier.h>
 #include "internal.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
 struct kobject *mm_kobj;
 EXPORT_SYMBOL_GPL(mm_kobj);
 
+#ifdef CONFIG_SMP
+s32 vm_committed_as_batch = 32;
+
+static void __meminit mm_compute_batch(void)
+{
+	u64 memsized_batch;
+	s32 nr = num_present_cpus();
+	s32 batch = max_t(s32, nr*2, 32);
+
+	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
+	memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+
+	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
+}
+
+static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
+					unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		mm_compute_batch();
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block compute_batch_nb = {
+	.notifier_call = mm_compute_batch_notifier,
+	.priority = IPC_CALLBACK_PRI, /* use lowest priority */
+};
+
+static int __init mm_compute_batch_init(void)
+{
+	mm_compute_batch();
+	register_hotmemory_notifier(&compute_batch_nb);
+
+	return 0;
+}
+
+__initcall(mm_compute_batch_init);
+
+#endif
+
 static int __init mm_sysfs_init(void)
 {
 	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
-- 
1.7.11.7




^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
  2013-05-22 23:37             ` Tim Chen
@ 2013-05-29 19:26               ` Andrew Morton
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrew Morton @ 2013-05-29 19:26 UTC (permalink / raw)
  To: Tim Chen
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Wed, 22 May 2013 16:37:18 -0700 Tim Chen <tim.c.chen@linux.intel.com> wrote:

> Currently the per cpu counter's batch size for memory accounting is
> configured as twice the number of cpus in the system.  However,
> for systems with very large memory, it is more appropriate to make it
> proportional to the memory size per cpu in the system.
> 
> For example, for an x86_64 system with 64 cpus and 128 GB of memory,
> the batch size is only 2*64 pages (0.5 MB).  So any memory accounting
> change of more than 0.5 MB will overflow the per cpu counter into
> the global counter.  Instead, under the new scheme, the batch size
> is configured to be about 0.4% of the memory per cpu = 8 MB
> (128 GB / 64 / 256), which is more in line with the memory size.

I renamed the patch to "mm: tune vm_committed_as percpu_counter
batching size".

Do we have any performance testing results?  They're pretty important
for a performance-improvement patch ;)


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
  2013-05-29 19:26               ` Andrew Morton
@ 2013-05-29 21:20                 ` Tim Chen
  -1 siblings, 0 replies; 22+ messages in thread
From: Tim Chen @ 2013-05-29 21:20 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Wed, 2013-05-29 at 12:26 -0700, Andrew Morton wrote:
> On Wed, 22 May 2013 16:37:18 -0700 Tim Chen <tim.c.chen@linux.intel.com> wrote:
> 
> > Currently the per cpu counter's batch size for memory accounting is
> > configured as twice the number of cpus in the system.  However,
> > for systems with very large memory, it is more appropriate to make it
> > proportional to the memory size per cpu in the system.
> > 
> > For example, for an x86_64 system with 64 cpus and 128 GB of memory,
> > the batch size is only 2*64 pages (0.5 MB).  So any memory accounting
> > change of more than 0.5 MB will overflow the per cpu counter into
> > the global counter.  Instead, under the new scheme, the batch size
> > is configured to be about 0.4% of the memory per cpu = 8 MB
> > (128 GB / 64 / 256), which is more in line with the memory size.
> 
> I renamed the patch to "mm: tune vm_committed_as percpu_counter
> batching size".
> 
> Do we have any performance testing results?  They're pretty important
> for a performance-improvement patch ;)
> 

I've done a repeated 800KB brk test (from the will-it-scale test
suite) with 80 concurrent processes on a 4-socket Westmere machine
with a total of 40 cores.  Without the patch, about 80% of cpu time
is spent on spinlock contention within the vm_committed_as counter.
With the patch, there's a 73x speedup on the benchmark and the lock
contention drops off almost entirely.
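
For context, here is a minimal sketch of the kind of loop such a brk
test runs (an assumed shape of the benchmark, not the actual
will-it-scale source):

	/* Each iteration grows and shrinks the heap by 800KB (200 4KB
	 * pages), exercising vm_acct_memory()/vm_unacct_memory() and
	 * hence the vm_committed_as percpu_counter on every pass. */
	#include <stdint.h>
	#include <unistd.h>

	int main(void)
	{
		const intptr_t delta = 800 * 1024;	/* 800KB */

		for (;;) {
			sbrk(delta);	/* account +200 pages */
			sbrk(-delta);	/* unaccount 200 pages */
		}
	}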

Tim


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] Make the batch size of the percpu_counter configurable
  2013-05-29 21:20                 ` Tim Chen
@ 2013-05-29 21:34                   ` Andrew Morton
  -1 siblings, 0 replies; 22+ messages in thread
From: Andrew Morton @ 2013-05-29 21:34 UTC (permalink / raw)
  To: Tim Chen
  Cc: Tejun Heo, Christoph Lameter, Al Viro, Eric Dumazet, Ric Mason,
	Simon Jeons, Dave Hansen, Andi Kleen, linux-kernel, linux-mm

On Wed, 29 May 2013 14:20:12 -0700 Tim Chen <tim.c.chen@linux.intel.com> wrote:

> > Do we have any performance testing results?  They're pretty important
> > for a performance-improvement patch ;)
> > 
> 
> I've done a repeated 800KB brk test (from the will-it-scale test
> suite) with 80 concurrent processes on a 4-socket Westmere machine
> with a total of 40 cores.  Without the patch, about 80% of cpu time
> is spent on spinlock contention within the vm_committed_as counter.
> With the patch, there's a 73x speedup on the benchmark and the lock
> contention drops off almost entirely.

Only a 73x speedup?  I dunno what they pay you for ;)

How serious is this performance problem in real-world work?  For
something of this magnitude we might want to backport the patch into
earlier kernels (because most everyone who uses those kernels will be
doing this anyway).  However such an act would require a pretty clear
explanation of the benefit which end-users will see.



^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread

Thread overview:
2013-05-03 10:10 [PATCH v2 1/2] Make the batch size of the percpu_counter configurable Tim Chen
2013-05-03 10:10 ` [PATCH v2 2/2] Make batch size for memory accounting configured according to size of memory Tim Chen
2013-05-21 20:41 ` [PATCH v2 1/2] Make the batch size of the percpu_counter configurable Andrew Morton
2013-05-21 23:27   ` Tim Chen
2013-05-21 23:41     ` Andrew Morton
2013-05-22  0:43       ` Tim Chen
2013-05-22  7:20         ` Andrew Morton
2013-05-22 23:37           ` Tim Chen
2013-05-29 19:26             ` Andrew Morton
2013-05-29 21:20               ` Tim Chen
2013-05-29 21:34                 ` Andrew Morton
