* [PATCH] mm: make lru_add_drain_all() selective
@ 2013-08-06 20:22 ` Chris Metcalf
  0 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-06 20:22 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker

This change makes lru_add_drain_all() selectively interrupt only
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode, where calling mlockall(), for
example, would otherwise interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
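For context (illustrative only, not part of the patch): the mlockall()
case mentioned in the commit message can be exercised from userspace
with a minimal program like the sketch below; the mlockall() syscall
path calls lru_add_drain_all(), so without this change the single call
below ends up interrupting every core:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/*
	 * Needs CAP_IPC_LOCK or a sufficient RLIMIT_MEMLOCK limit;
	 * locking the current mappings triggers lru_add_drain_all()
	 * in the kernel.
	 */
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
		perror("mlockall");
		return 1;
	}
	return 0;
}
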
 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 35 ++++++++++++++++++++++++++---------
 mm/swap.c                 | 38 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..71a3fe7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,8 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 
+struct cpumask;
+
 struct workqueue_struct;
 
 struct work_struct;
@@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
+extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask);
 extern int schedule_on_each_cpu(work_func_t func);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..a6d1809 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * schedule_on_cpu_mask - execute a function synchronously on each listed CPU
  * @func: the function to call
+ * @mask: the cpumask to invoke the function on
  *
- * schedule_on_each_cpu() executes @func on each online CPU using the
+ * schedule_on_cpu_mask() executes @func on each listed CPU using the
  * system workqueue and blocks until all CPUs have completed.
- * schedule_on_each_cpu() is very slow.
+ * schedule_on_cpu_mask() is very slow.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
 {
 	int cpu;
 	struct work_struct __percpu *works;
@@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu(cpu, mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
-	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
 
 /**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	get_online_cpus();
+	schedule_on_cpu_mask(func, cpu_online_mask);
+	put_online_cpus();
+	return 0;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..981b1d9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -683,7 +683,43 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
+		struct pagevec *pvec = &per_cpu(lru_rotate_pvecs, cpu);
+		int count = pagevec_count(pvec);
+		int lru;
+
+		if (!count) {
+			for_each_lru(lru) {
+				pvec = &pvecs[lru - LRU_BASE];
+				count = pagevec_count(pvec);
+				if (count)
+					break;
+			}
+		}
+
+		if (count)
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1

* [PATCH v2] mm: make lru_add_drain_all() selective
  2013-08-06 20:22 ` Chris Metcalf
@ 2013-08-06 20:22   ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-06 20:22 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker

This change makes lru_add_drain_all() selectively interrupt only
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode, where calling mlockall(), for
example, would otherwise interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
Oops! In the previous version of this change I had just blindly patched
it forward from a slightly older version of mm/swap.c.  This version is
now properly diffed against a version of mm/swap.c that includes all the
latest changes to lru_add_drain_all().

 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 35 ++++++++++++++++++++++++++---------
 mm/swap.c                 | 37 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..71a3fe7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,8 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 
+struct cpumask;
+
 struct workqueue_struct;
 
 struct work_struct;
@@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
+extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask);
 extern int schedule_on_each_cpu(work_func_t func);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..a6d1809 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * schedule_on_cpu_mask - execute a function synchronously on each listed CPU
  * @func: the function to call
+ * @mask: the cpumask to invoke the function on
  *
- * schedule_on_each_cpu() executes @func on each online CPU using the
+ * schedule_on_cpu_mask() executes @func on each listed CPU using the
  * system workqueue and blocks until all CPUs have completed.
- * schedule_on_each_cpu() is very slow.
+ * schedule_on_cpu_mask() is very slow.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
 {
 	int cpu;
 	struct work_struct __percpu *works;
@@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu(cpu, mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
-	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
 
 /**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	get_online_cpus();
+	schedule_on_cpu_mask(func, cpu_online_mask);
+	put_online_cpus();
+	return 0;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..d4a862b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu))
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1

* Re: [PATCH v2] mm: make lru_add_drain_all() selective
  2013-08-06 20:22   ` Chris Metcalf
@ 2013-08-07 20:45     ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-07 20:45 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker

On Tue, Aug 06, 2013 at 04:22:39PM -0400, Chris Metcalf wrote:
> This change makes lru_add_drain_all() selectively interrupt only
> the cpus that have per-cpu free pages that can be drained.
> 
> This is important in nohz mode, where calling mlockall(), for
> example, would otherwise interrupt every core unnecessarily.

Can you please split off the workqueue part into a separate patch with
a proper description?  I don't mind the change eventually going through
-mm, but it's nasty to bury a workqueue API update in another change
without explanation or justification.

Thanks.

-- 
tejun

* [PATCH v3 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-07 20:45     ` Tejun Heo
@ 2013-08-07 20:49       ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker

This primitive allows scheduling work to run on a particular set of
cpus described by a "struct cpumask".  This can be useful, for example,
if you have a per-cpu variable that requires code execution only when
it holds a certain value (such as a non-empty list).

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.
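
As a purely hypothetical usage sketch (not part of this patch), a
caller with some per-cpu state could use the new primitive roughly as
below, mirroring what the mm/swap.c patch does; the per-cpu variable
and function names here are made up for illustration:

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

static DEFINE_PER_CPU(int, pending_flush);

static void flush_one_cpu(struct work_struct *unused)
{
	/* Runs on the target cpu via the system workqueue. */
	this_cpu_write(pending_flush, 0);
}

static int flush_pending_cpus(void)
{
	cpumask_var_t mask;
	int cpu, rc;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_clear(mask);

	/* Only interrupt cpus that actually have something pending. */
	for_each_online_cpu(cpu)
		if (per_cpu(pending_flush, cpu))
			cpumask_set_cpu(cpu, mask);

	rc = schedule_on_cpu_mask(flush_one_cpu, mask);
	free_cpumask_var(mask);
	return rc;
}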

 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 35 ++++++++++++++++++++++++++---------
 2 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..71a3fe7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,8 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 
+struct cpumask;
+
 struct workqueue_struct;
 
 struct work_struct;
@@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
+extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask);
 extern int schedule_on_each_cpu(work_func_t func);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..a6d1809 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * schedule_on_cpu_mask - execute a function synchronously on each listed CPU
  * @func: the function to call
+ * @mask: the cpumask to invoke the function on
  *
- * schedule_on_each_cpu() executes @func on each online CPU using the
+ * schedule_on_cpu_mask() executes @func on each listed CPU using the
  * system workqueue and blocks until all CPUs have completed.
- * schedule_on_each_cpu() is very slow.
+ * schedule_on_cpu_mask() is very slow.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
 {
 	int cpu;
 	struct work_struct __percpu *works;
@@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu(cpu, mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
-	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
 
 /**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	get_online_cpus();
+	schedule_on_cpu_mask(func, cpu_online_mask);
+	put_online_cpus();
+	return 0;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
-- 
1.8.3.1

* [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-07 22:48     ` Cody P Schafer
@ 2013-08-07 20:49       ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This primitive allows scheduling work to run on a particular set of
cpus described by a "struct cpumask".  This can be useful, for example,
if you have a per-cpu variable that requires code execution only when
it holds a certain value (such as a non-empty list).

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 36 +++++++++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..71a3fe7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,8 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 
+struct cpumask;
+
 struct workqueue_struct;
 
 struct work_struct;
@@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
+extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask);
 extern int schedule_on_each_cpu(work_func_t func);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..1dacb09 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * schedule_on_cpu_mask - execute a function synchronously on each listed CPU
  * @func: the function to call
+ * @mask: the cpumask to invoke the function on
  *
- * schedule_on_each_cpu() executes @func on each online CPU using the
+ * schedule_on_cpu_mask() executes @func on each listed CPU using the
  * system workqueue and blocks until all CPUs have completed.
- * schedule_on_each_cpu() is very slow.
+ * schedule_on_cpu_mask() is very slow.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
 {
 	int cpu;
 	struct work_struct __percpu *works;
@@ -2981,24 +2982,41 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu(cpu, mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
-	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
 
 /**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	int ret;
+	get_online_cpus();
+	ret = schedule_on_cpu_mask(func, cpu_online_mask);
+	put_online_cpus();
+	return ret;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
-- 
1.8.3.1

* [PATCH v5 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-09 16:30             ` Tejun Heo
@ 2013-08-07 20:49               ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This primitive allows scheduling work to run on a particular set of
cpus described by a "struct cpumask".  This can be useful, for example,
if you have a per-cpu variable that requires code execution only when
it holds a certain value (such as a non-empty list).

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v5: provide validity checking on the cpumask for schedule_on_cpu_mask.
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.
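
To illustrate the all-or-nothing -EINVAL behavior added in v5 (a
hypothetical caller sketch only, not part of this patch): a mask that
includes a cpu which has never been online fails up front, with no
work scheduled anywhere, so a caller can fall back or fix its mask:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/workqueue.h>

static int run_on_possible_cpus(work_func_t func)
{
	int rc;

	/*
	 * cpu_possible_mask may contain cpus that have never been
	 * brought online; in that case the v5 check returns -EINVAL
	 * before scheduling any work.
	 */
	rc = schedule_on_cpu_mask(func, cpu_possible_mask);
	if (rc == -EINVAL) {
		/* Retry with only the cpus that are currently online. */
		get_online_cpus();
		rc = schedule_on_cpu_mask(func, cpu_online_mask);
		put_online_cpus();
	}
	return rc;
}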

 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 51 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..71a3fe7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,8 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 
+struct cpumask;
+
 struct workqueue_struct;
 
 struct work_struct;
@@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
+extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask);
 extern int schedule_on_each_cpu(work_func_t func);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..63d504a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -292,6 +292,9 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PL: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+/* set of cpus that are valid for per-cpu workqueue scheduling */
+static struct cpumask wq_valid_cpus;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -2962,43 +2965,66 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * schedule_on_cpu_mask - execute a function synchronously on each listed CPU
  * @func: the function to call
+ * @mask: the cpumask to invoke the function on
  *
- * schedule_on_each_cpu() executes @func on each online CPU using the
+ * schedule_on_cpu_mask() executes @func on each listed CPU using the
  * system workqueue and blocks until all CPUs have completed.
- * schedule_on_each_cpu() is very slow.
+ * schedule_on_cpu_mask() is very slow.  You may only specify CPUs
+ * that are online or have previously been online; specifying an
+ * invalid CPU mask will return -EINVAL without scheduling any work.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
 {
 	int cpu;
 	struct work_struct __percpu *works;
 
+	if (!cpumask_subset(mask, &wq_valid_cpus))
+		return -EINVAL;
+
 	works = alloc_percpu(struct work_struct);
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu(cpu, mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
-	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
 
 /**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	int ret;
+	get_online_cpus();
+	ret = schedule_on_cpu_mask(func, cpu_online_mask);
+	put_online_cpus();
+	return ret;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
@@ -4687,6 +4713,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 		list_for_each_entry(wq, &workqueues, list)
 			wq_update_unbound_numa(wq, cpu, true);
 
+		/* track the set of cpus that have ever been online */
+		cpumask_set_cpu(cpu, &wq_valid_cpus);
+
 		mutex_unlock(&wq_pool_mutex);
 		break;
 	}
@@ -5011,6 +5040,10 @@ static int __init init_workqueues(void)
 	       !system_unbound_wq || !system_freezable_wq ||
 	       !system_power_efficient_wq ||
 	       !system_freezable_power_efficient_wq);
+
+	/* mark startup cpu as valid */
+	cpumask_set_cpu(smp_processor_id(), &wq_valid_cpus);
+
 	return 0;
 }
 early_initcall(init_workqueues);
-- 
1.8.3.1

* [PATCH v3 2/2] mm: make lru_add_drain_all() selective
  2013-08-07 20:45     ` Tejun Heo
@ 2013-08-07 20:52       ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker

This change makes lru_add_drain_all() selectively interrupt only
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode, where calling mlockall(), for
example, would otherwise interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..d4a862b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu))
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1

* [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-07 22:48     ` Cody P Schafer
@ 2013-08-07 20:52       ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This change makes lru_add_drain_all() selectively interrupt only
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode, where calling mlockall(), for
example, would otherwise interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..d4a862b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu))
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1

* [PATCH v5 2/2] mm: make lru_add_drain_all() selective
  2013-08-09 16:30             ` Tejun Heo
@ 2013-08-07 20:52               ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This change makes lru_add_drain_all() only selectively interrupt
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode where calling mlockall(), for
example, otherwise will interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v5: provide validity checking on the cpumask for schedule_on_cpu_mask
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..d4a862b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu))
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* [PATCH v3 2/2] mm: make lru_add_drain_all() selective
@ 2013-08-07 20:52       ` Chris Metcalf
  0 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker

This change makes lru_add_drain_all() only selectively interrupt
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode where calling mlockall(), for
example, otherwise will interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..d4a862b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu))
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* Re: [PATCH v2] mm: make lru_add_drain_all() selective
  2013-08-06 20:22   ` Chris Metcalf
@ 2013-08-07 22:48     ` Cody P Schafer
  -1 siblings, 0 replies; 104+ messages in thread
From: Cody P Schafer @ 2013-08-07 22:48 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker

On 08/06/2013 01:22 PM, Chris Metcalf wrote:
[...]

>
>   /**
> + * schedule_on_each_cpu - execute a function synchronously on each online CPU
> + * @func: the function to call
> + *
> + * schedule_on_each_cpu() executes @func on each online CPU using the
> + * system workqueue and blocks until all CPUs have completed.
> + * schedule_on_each_cpu() is very slow.
> + *
> + * RETURNS:
> + * 0 on success, -errno on failure.
> + */
> +int schedule_on_each_cpu(work_func_t func)
> +{
> +	get_online_cpus();
> +	schedule_on_cpu_mask(func, cpu_online_mask);

Looks like the return value gets lost here.

> +	put_online_cpus();
> +	return 0;
> +}
> +
> +/**
>    * flush_scheduled_work - ensure that any scheduled work has run to completion.
>    *
>    * Forces execution of the kernel-global workqueue and blocks until its

[...]
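
A minimal fix is to just capture and propagate the return value (this
is what the later revisions in the thread end up doing), e.g.:

int schedule_on_each_cpu(work_func_t func)
{
	int ret;

	get_online_cpus();
	ret = schedule_on_cpu_mask(func, cpu_online_mask);
	put_online_cpus();
	return ret;
}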


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-07 20:49       ` Chris Metcalf
@ 2013-08-09 15:02         ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-09 15:02 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker,
	Cody P Schafer

Hello, Chris.

On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote:
> This primitive allows scheduling work to run on a particular set of
> cpus described by a "struct cpumask".  This can be useful, for example,
> if you have a per-cpu variable that requires code execution only if the
> per-cpu variable has a certain value (for example, is a non-empty list).

So, this allows scheduling work items on !online CPUs.  Workqueue does
allow scheduling per-cpu work items on offline CPUs if the CPU has
ever been online, but the behavior when scheduling work items on a cpu
which has never been online is undefined.  I think the interface at
least needs to verify that the target cpus have been online, and
otherwise trigger a warning and mask off the invalid CPUs.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-09 15:02         ` Tejun Heo
@ 2013-08-09 16:12           ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-09 16:12 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker,
	Cody P Schafer

On 8/9/2013 11:02 AM, Tejun Heo wrote:
> Hello, Chris.
>
> On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote:
>> This primitive allows scheduling work to run on a particular set of
>> cpus described by a "struct cpumask".  This can be useful, for example,
>> if you have a per-cpu variable that requires code execution only if the
>> per-cpu variable has a certain value (for example, is a non-empty list).
> So, this allows scheduling work items on !online CPUs.  Workqueue does
> allow scheduling per-cpu work items on offline CPUs if the CPU has
> ever been online, but the behavior when scheduling work items on cpu
> which has never been online is undefined.  I think the interface at
> least needs to verify that the the target cpus have been online,
> trigger warning and mask off invalid CPUs otherwise.

I could certainly make schedule_on_cpu_mask() do sanity checking, perhaps via a WARN_ON_ONCE() if offline cpus were specified, and otherwise just have it create a local struct cpumask that it and's with cpu_online_mask, suitably wrapping with get_online_cpus()/put_online_cpus().  (I'm not sure how to test if a cpu has ever been online, vs whether it's online right now.)  I don't want to unnecessarily slow down the existing schedule_on_each_cpu(), so perhaps the implementation should have a static schedule_on_cpu_mask_internal() function that is the same as my previous schedule_on_cpu_mask(), allowing schedule_on_each_cpu() to call it directly to bypass the checking.
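
Purely as a sketch, that alternative might look like the following
(schedule_on_cpu_mask_internal() here stands in for the unchecked loop
from the earlier patch, and the exact placement of the check is just
illustrative):

int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
{
	cpumask_var_t valid;
	int ret;

	if (!alloc_cpumask_var(&valid, GFP_KERNEL))
		return -ENOMEM;

	get_online_cpus();
	/* warn once, then quietly drop any cpus that aren't online */
	WARN_ON_ONCE(!cpumask_subset(mask, cpu_online_mask));
	cpumask_and(valid, mask, cpu_online_mask);
	ret = schedule_on_cpu_mask_internal(func, valid);
	put_online_cpus();

	free_cpumask_var(valid);
	return ret;
}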

That said... I wonder if it might make sense to treat this API the same as other APIs that already take a cpu?  schedule_work_on(), schedule_delayed_work_on(), and queue_delayed_work_on() all take a cpu parameter without API comment or validity checking; queue_work_on() just says "the caller must ensure [the cpu] can't go away".  Does it make sense to just add a similar comment to schedule_on_cpu_mask() rather than make this API the first to actually do cpu validity checking?

Let me know; I'm happy to respin it either way.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-09 16:12           ` Chris Metcalf
@ 2013-08-09 16:30             ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-09 16:30 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker,
	Cody P Schafer

Hello, Chris.

On Fri, Aug 09, 2013 at 12:12:43PM -0400, Chris Metcalf wrote:
> I could certainly make schedule_on_cpu_mask() do sanity checking,
> perhaps via a WARN_ON_ONCE() if offline cpus were specified, and
> otherwise just have it create a local struct cpumask that it and's
> with cpu_online_mask, suitably wrapping with
> get_online_cpus()/put_online_cpus().  (I'm not sure how to test if a
> cpu has ever been online, vs whether it's online right now.)  I

I think you'll have to collect it from CPU_ONLINE of
workqueue_cpu_up_callback(), and it probably wouldn't be a bad idea to
allow scheduling on CPUs which have been up but aren't currently
online, as that's the existing rule for other interfaces anyway.

> don't want to unnecessarily slow down the existing
> schedule_on_each_cpu(), so perhaps the implementation should have a
> static schedule_on_cpu_mask_internal() function that is the same as
> my previous schedule_on_cpu_mask(), allowing schedule_on_each_cpu()
> to call it directly to bypass the checking.

Hmmm.... it's unlikely to make a noticeable difference given that it's
gonna be bouncing multiple cachelines across all online CPUs.

> That said... I wonder if it might make sense to treat this API the
> same as other APIs that already take a cpu?  schedule_work_on(),
> schedule_delayed_work_on(), and queue_delayed_work_on() all take a
> cpu parameter without API comment or validity checking;
> queue_work_on() just says "the caller must ensure [the cpu] can't go
> away".  Does it make sense to just add a similar comment to
> schedule_on_cpu_mask() rather than make this API the first to
> actually do cpu validity checking?

Yeah, we've been lazy with the sanity check and I think it's a good
opportunity to add it.  Let's worry about other paths later.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v5 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-07 20:49               ` Chris Metcalf
@ 2013-08-09 17:40                 ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-09 17:40 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker,
	Cody P Schafer

On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote:
> This primitive allows scheduling work to run on a particular set of
> cpus described by a "struct cpumask".  This can be useful, for example,
> if you have a per-cpu variable that requires code execution only if the
> per-cpu variable has a certain value (for example, is a non-empty list).
> 
> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>

 Acked-by: Tejun Heo <tj@kernel.org>

Please feel free to route with the second patch.

Thanks!

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* [PATCH v6 1/2] workqueue: add new schedule_on_cpu_mask() API
  2013-08-09 17:40                 ` Tejun Heo
@ 2013-08-09 17:49                   ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-09 17:49 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This primitive allows scheduling work to run on a particular set of
cpus described by a "struct cpumask".  This can be useful, for example,
if you have a per-cpu variable that requires code execution only if the
per-cpu variable has a certain value (for example, is a non-empty list).

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v6: add Tejun's Acked-by, and add missing get/put_online_cpus() to
lru_add_drain_all().

v5: provide validity checking on the cpumask for schedule_on_cpu_mask.
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 51 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..71a3fe7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,8 @@
 #include <linux/atomic.h>
 #include <linux/cpumask.h>
 
+struct cpumask;
+
 struct workqueue_struct;
 
 struct work_struct;
@@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq);
 extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
+extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask);
 extern int schedule_on_each_cpu(work_func_t func);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..63d504a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -292,6 +292,9 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PL: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+/* set of cpus that are valid for per-cpu workqueue scheduling */
+static struct cpumask wq_valid_cpus;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -2962,43 +2965,66 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
 /**
- * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * schedule_on_cpu_mask - execute a function synchronously on each listed CPU
  * @func: the function to call
+ * @mask: the cpumask to invoke the function on
  *
- * schedule_on_each_cpu() executes @func on each online CPU using the
+ * schedule_on_cpu_mask() executes @func on each listed CPU using the
  * system workqueue and blocks until all CPUs have completed.
- * schedule_on_each_cpu() is very slow.
+ * schedule_on_cpu_mask() is very slow.  You may only specify CPUs
+ * that are online or have previously been online; specifying an
+ * invalid CPU mask will return -EINVAL without scheduling any work.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int schedule_on_each_cpu(work_func_t func)
+int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask)
 {
 	int cpu;
 	struct work_struct __percpu *works;
 
+	if (!cpumask_subset(mask, &wq_valid_cpus))
+		return -EINVAL;
+
 	works = alloc_percpu(struct work_struct);
 	if (!works)
 		return -ENOMEM;
 
-	get_online_cpus();
-
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
 
-	for_each_online_cpu(cpu)
+	for_each_cpu(cpu, mask)
 		flush_work(per_cpu_ptr(works, cpu));
 
-	put_online_cpus();
 	free_percpu(works);
 	return 0;
 }
 
 /**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	int ret;
+	get_online_cpus();
+	ret = schedule_on_cpu_mask(func, cpu_online_mask);
+	put_online_cpus();
+	return ret;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
@@ -4687,6 +4713,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 		list_for_each_entry(wq, &workqueues, list)
 			wq_update_unbound_numa(wq, cpu, true);
 
+		/* track the set of cpus that have ever been online */
+		cpumask_set_cpu(cpu, &wq_valid_cpus);
+
 		mutex_unlock(&wq_pool_mutex);
 		break;
 	}
@@ -5011,6 +5040,10 @@ static int __init init_workqueues(void)
 	       !system_unbound_wq || !system_freezable_wq ||
 	       !system_power_efficient_wq ||
 	       !system_freezable_power_efficient_wq);
+
+	/* mark startup cpu as valid */
+	cpumask_set_cpu(smp_processor_id(), &wq_valid_cpus);
+
 	return 0;
 }
 early_initcall(init_workqueues);
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* [PATCH v6 2/2] mm: make lru_add_drain_all() selective
  2013-08-09 17:40                 ` Tejun Heo
@ 2013-08-09 17:52                   ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-09 17:52 UTC (permalink / raw)
  To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This change makes lru_add_drain_all() only selectively interrupt
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode where calling mlockall(), for
example, otherwise will interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v6: add Tejun's Acked-by, and add missing get/put_online_cpus() to
lru_add_drain_all().

v5: provide validity checking on the cpumask for schedule_on_cpu_mask.
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 mm/swap.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..f42561a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -683,7 +693,34 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	cpumask_var_t mask;
+	int cpu, rc;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+	cpumask_clear(mask);
+	get_online_cpus();
+
+	/*
+	 * Figure out which cpus need flushing.  It's OK if we race
+	 * with changes to the per-cpu lru pvecs, since it's no worse
+	 * than if we flushed all cpus, since a cpu could still end
+	 * up putting pages back on its pvec before we returned.
+	 * And this avoids interrupting other cpus unnecessarily.
+	 */
+	for_each_online_cpu(cpu) {
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu))
+			cpumask_set_cpu(cpu, mask);
+	}
+
+	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
+
+	put_online_cpus();
+	free_cpumask_var(mask);
+	return rc;
 }
 
 /*
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-07 20:52       ` Chris Metcalf
@ 2013-08-12 21:05         ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-12 21:05 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Wed, 7 Aug 2013 16:52:22 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> This change makes lru_add_drain_all() only selectively interrupt
> the cpus that have per-cpu free pages that can be drained.
> 
> This is important in nohz mode where calling mlockall(), for
> example, otherwise will interrupt every core unnecessarily.
> 
> ...
>
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
>  		pagevec_lru_move_fn(pvec, __activate_page, NULL);
>  }
>  
> +static bool need_activate_page_drain(int cpu)
> +{
> +	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
> +}
> +
>  void activate_page(struct page *page)
>  {
>  	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
> @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
>  {
>  }
>  
> +static bool need_activate_page_drain(int cpu)
> +{
> +	return false;
> +}
> +
>  void activate_page(struct page *page)
>  {
>  	struct zone *zone = page_zone(page);
> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
>   */
>  int lru_add_drain_all(void)
>  {
> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
> +	cpumask_var_t mask;
> +	int cpu, rc;
> +
> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> +		return -ENOMEM;

Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all()
is dangerous and undesirable.  I took a quick look at all the callsites
and didn't immediately see a bug, but it's hard because they're
splattered all over the place.  It would be far better if we were to
not do this.

Rather than tossing this hang-grenade in there we should at a reluctant
minimum change lru_add_drain_all() to take a gfp_t argument and then
carefully review and update the callers.

> +	cpumask_clear(mask);
> +
> +	/*
> +	 * Figure out which cpus need flushing.  It's OK if we race
> +	 * with changes to the per-cpu lru pvecs, since it's no worse
> +	 * than if we flushed all cpus, since a cpu could still end
> +	 * up putting pages back on its pvec before we returned.
> +	 * And this avoids interrupting other cpus unnecessarily.
> +	 */
> +	for_each_online_cpu(cpu) {
> +		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
> +		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
> +		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
> +		    need_activate_page_drain(cpu))
> +			cpumask_set_cpu(cpu, mask);
> +	}
> +
> +	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);

And it seems pretty easy to avoid the allocation.  Create a single
cpumask at boot (or, preferably, at compile-time) and whenever we add a
page to a drainable pagevec, do

	cpumask_set_cpu(smp_processor_id(), global_cpumask);

and to avoid needlessly dirtying a cacheline,

	if (!cpu_isset(smp_processor_id(), global_cpumask))
		cpumask_set_cpu(smp_processor_id(), global_cpumask);


This means that lru_add_drain_all() will need to clear the mask at some
point and atomicity issues arise.  It would be better to do the
clearing within schedule_on_cpu_mask() itself, using
cpumask_test_and_clear_cpu().
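
Roughly, and just as a sketch (the mask name and the test-and-clear
variant of the scheduling helper are made up here):

/* compile-time mask: no allocation needed in lru_add_drain_all() */
static struct cpumask lru_drain_cpumask;

/* call this wherever a page goes onto a drainable per-cpu pagevec */
static void lru_drain_mark_cpu(int cpu)
{
	/* avoid dirtying the shared cacheline if the bit is already set */
	if (!cpumask_test_cpu(cpu, &lru_drain_cpumask))
		cpumask_set_cpu(cpu, &lru_drain_cpumask);
}

int lru_add_drain_all(void)
{
	/*
	 * A schedule_on_cpu_mask() variant that consumes the mask with
	 * cpumask_test_and_clear_cpu() as it queues work on each cpu;
	 * bits set after that point are picked up by the next drain.
	 */
	return schedule_on_cpu_mask_and_clear(lru_add_drain_per_cpu,
					      &lru_drain_cpumask);
}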



Also, what's up with the get_online_cpus() handling? 
schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and
the schedule_on_cpu_mask() documentation forgot to mention it.



^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-12 21:05         ` Andrew Morton
@ 2013-08-13  1:53           ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13  1:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/12/2013 5:05 PM, Andrew Morton wrote:
> On Wed, 7 Aug 2013 16:52:22 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
>
>> This change makes lru_add_drain_all() only selectively interrupt
>> the cpus that have per-cpu free pages that can be drained.
>>
>> This is important in nohz mode where calling mlockall(), for
>> example, otherwise will interrupt every core unnecessarily.
>>
>> ...
>>
>> --- a/mm/swap.c
>> +++ b/mm/swap.c
>> @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
>>  		pagevec_lru_move_fn(pvec, __activate_page, NULL);
>>  }
>>  
>> +static bool need_activate_page_drain(int cpu)
>> +{
>> +	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
>> +}
>> +
>>  void activate_page(struct page *page)
>>  {
>>  	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
>> @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
>>  {
>>  }
>>  
>> +static bool need_activate_page_drain(int cpu)
>> +{
>> +	return false;
>> +}
>> +
>>  void activate_page(struct page *page)
>>  {
>>  	struct zone *zone = page_zone(page);
>> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
>>   */
>>  int lru_add_drain_all(void)
>>  {
>> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
>> +	cpumask_var_t mask;
>> +	int cpu, rc;
>> +
>> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
>> +		return -ENOMEM;
> Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all()
> is dangerous and undesirable.  I took a quick look at all the callsites
> and didn't immediately see a bug, but it's hard because they're
> splattered all over the place.  It would be far better if we were to
> not do this.

I think it should be safe, given that we already did alloc_percpu() to do
schedule_on_each_cpu(), and that is documented as doing GFP_KERNEL allocation
(pcpu_create_chunk will call alloc_pages with GFP_KERNEL).

> Rather than tossing this hang-grenade in there we should at a reluctant
> minimum change lru_add_drain_all() to take a gfp_t argument and then
> carefully review and update the callers.
>
>> +	cpumask_clear(mask);
>> +
>> +	/*
>> +	 * Figure out which cpus need flushing.  It's OK if we race
>> +	 * with changes to the per-cpu lru pvecs, since it's no worse
>> +	 * than if we flushed all cpus, since a cpu could still end
>> +	 * up putting pages back on its pvec before we returned.
>> +	 * And this avoids interrupting other cpus unnecessarily.
>> +	 */
>> +	for_each_online_cpu(cpu) {
>> +		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
>> +		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
>> +		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
>> +		    need_activate_page_drain(cpu))
>> +			cpumask_set_cpu(cpu, mask);
>> +	}
>> +
>> +	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
> And it seems pretty easy to avoid the allocation.  Create a single
> cpumask at boot (or, preferably, at compile-time) and whenever we add a
> page to a drainable pagevec, do
>
> 	cpumask_set_cpu(smp_processor_id(), global_cpumask);
>
> and to avoid needlessly dirtying a cacheline,
>
> 	if (!cpu_isset(smp_processor_id(), global_cpumask))
> 		cpumask_set_cpu(smp_processor_id(), global_cpumask);
>
>
> This means that lru_add_drain_all() will need to clear the mask at some
> point and atomicity issues arise.  It would be better to do the
> clearing within schedule_on_cpu_mask() itself, using
> cpumask_test_and_clear_cpu().

The atomicity issue isn't that big a deal (given that the drain is
racy anyway, you just need to make sure to do the cpumask_set_cpu after
the pagevec_add), but you do need to clear the cpumask before doing
the actual drain, and that either means inflicting those semantics
on schedule_on_cpu_mask(), which seems a little unnatural, or else
making a copy of the mask, which gets us back to where we started
with GFP_KERNEL allocations.

Alternately, you could imagine a workqueue API that just took a function
pointer that returned for each cpu whether or not to schedule work on
that cpu:

  typedef bool (*check_work_func_t)(void *data, int cpu);
  schedule_on_some_cpus(work_func_t func, check_work_func_t checker, void *data);

For the lru stuff we wouldn't need to use a "data" pointer but I'd include
it since it's cheap, pretty standard, and makes the API more general.
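
With something like that, lru_add_drain_all() would reduce to roughly
the following (a sketch only, reusing the hypothetical
schedule_on_some_cpus() above):

	static bool lru_drain_needed(void *data, int cpu)
	{
		return pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		       pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		       pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
		       need_activate_page_drain(cpu);
	}

	int lru_add_drain_all(void)
	{
		return schedule_on_some_cpus(lru_add_drain_per_cpu,
					     lru_drain_needed, NULL);
	}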

Or, I suppose, one other possibility is just a compile-time struct
cpumask that we guard with a lock.  It seems a bit like overkill for
the very common case of not specifying CPUMASK_OFFSTACK.

All that said, I still tend to like the simple cpumask data-driven approach,
assuming you're comfortable with the possible GFP_KERNEL allocation.

> Also, what's up with the get_online_cpus() handling? 
> schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and
> the schedule_on_cpu_mask() documentation forgot to mention it.

The missing get_online_cpus() for lru_add_drain_all() is in v6 of the
patch from Aug 9 (v5 incorporated Tejun's feedback to do validity-checking
on the schedule_on_cpu_mask() mask argument, and v6 added his Ack
and the missing get/put_online_cpus).

schedule_on_each_cpu() obviously uses get/put_online_cpus and needs it;
I would argue that there's no need to mention it in the docs for
schedule_on_cpu_mask() since if you're going to pass the cpu_online_mask
you'd better know that you should get/put_online_cpus().

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13  1:53           ` Chris Metcalf
@ 2013-08-13 19:35             ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 19:35 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Mon, 12 Aug 2013 21:53:11 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> On 8/12/2013 5:05 PM, Andrew Morton wrote:
> >> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
> >>   */
> >>  int lru_add_drain_all(void)
> >>  {
> >> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
> >> +	cpumask_var_t mask;
> >> +	int cpu, rc;
> >> +
> >> +	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
> >> +		return -ENOMEM;
> > Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all()
> > is dangerous and undesirable.  I took a quick look at all the callsites
> > and didn't immediately see a bug, but it's hard because they're
> > splattered all over the place.  It would be far better if we were to
> > not do this.
> 
> I think it should be safe, given that we already did alloc_percpu() to do
> schedule_on_each_cpu(), and that is documented as doing GFP_KERNEL allocation
> (pcpu_create_chunk will call alloc_pages with GFP_KERNEL).

urgh, OK.

The allocation of the `struct work's is unavoidable, I guess.

However, it's a bit sad to allocate one per CPU when only a small number
(even zero) of the CPUs actually need it.  It might be better to
kmalloc these things individually.  That will often reduce memory
allocations and because the work can be freed by the handler it opens
the possibility of an asynchronous schedule_on_cpus(), should that be
desired.

I don't know how lots-of-kmallocs compares with alloc_percpu()
performance-wise.
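
For illustration, the kmalloc-per-needed-cpu variant might look roughly
like this (untested sketch; cpu_needs_drain() is a hypothetical helper
standing in for the pagevec checks):

	struct drain_work {
		struct work_struct work;
		struct list_head node;
	};
	LIST_HEAD(queued);
	struct drain_work *dw, *tmp;
	int cpu;

	for_each_online_cpu(cpu) {
		if (!cpu_needs_drain(cpu))	/* hypothetical helper */
			continue;
		dw = kmalloc(sizeof(*dw), GFP_KERNEL);
		if (!dw)
			break;	/* real code would unwind or fall back here */
		INIT_WORK(&dw->work, lru_add_drain_per_cpu);
		list_add_tail(&dw->node, &queued);
		schedule_work_on(cpu, &dw->work);
	}

	/* flush and free only the works we actually queued */
	list_for_each_entry_safe(dw, tmp, &queued, node) {
		flush_work(&dw->work);
		kfree(dw);
	}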


That being said, the `cpumask_var_t mask' which was added to
lru_add_drain_all() is unneeded - it's just a temporary storage which
can be eliminated by creating a schedule_on_each_cpu_cond() or whatever
which is passed a function pointer of type `bool (*call_needed)(int
cpu, void *data)'.

For lru_add_drain_all(), the callback function tests the
pagevec_counts.  For schedule_on_each_cpu(), `data' points at
cpu_online_mask.
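
The shape would be something like this (an illustrative sketch; the
exact prototype and the cpu_in_mask() helper are made up here):

	int schedule_on_each_cpu_cond(work_func_t func,
				      bool (*call_needed)(int cpu, void *data),
				      void *data);

	/* schedule_on_each_cpu() would then be a trivial wrapper: */
	static bool cpu_in_mask(int cpu, void *data)
	{
		const struct cpumask *mask = data;

		return cpumask_test_cpu(cpu, mask);
	}

	int schedule_on_each_cpu(work_func_t func)
	{
		return schedule_on_each_cpu_cond(func, cpu_in_mask,
						 (void *)cpu_online_mask);
	}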

> > Rather than tossing this hang-grenade in there we should at a reluctant
> > minimum change lru_add_drain_all() to take a gfp_t argument and then
> > carefully review and update the callers.
> >
> >> +	cpumask_clear(mask);
> >> +
> >> +	/*
> >> +	 * Figure out which cpus need flushing.  It's OK if we race
> >> +	 * with changes to the per-cpu lru pvecs, since it's no worse
> >> +	 * than if we flushed all cpus, since a cpu could still end
> >> +	 * up putting pages back on its pvec before we returned.
> >> +	 * And this avoids interrupting other cpus unnecessarily.
> >> +	 */
> >> +	for_each_online_cpu(cpu) {
> >> +		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
> >> +		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
> >> +		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
> >> +		    need_activate_page_drain(cpu))
> >> +			cpumask_set_cpu(cpu, mask);
> >> +	}
> >> +
> >> +	rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask);
> > And it seems pretty easy to avoid the allocation.  Create a single
> > cpumask at boot (or, preferably, at compile-time) and whenever we add a
> > page to a drainable pagevec, do
> >
> > 	cpumask_set_cpu(smp_processor_id(), global_cpumask);
> >
> > and to avoid needlessly dirtying a cacheline,
> >
> > 	if (!cpu_isset(smp_processor_id(), global_cpumask))
> > 		cpumask_set_cpu(smp_processor_id(), global_cpumask);
> >
> >
> > This means that lru_add_drain_all() will need to clear the mask at some
> > point and atomicity issues arise.  It would be better to do the
> > clearing within schedule_on_cpu_mask() itself, using
> > cpumask_test_and_clear_cpu().
> 
> The atomicity issue isn't that big a deal (given that the drain is
> racy anyway, you just need to make sure to do cpumask_set_cpu after
> the pagevec_add), but you do need to clear the cpumask before doing
> the actual drain, and that either means inflicting that semantics
> on schedule_on_cpu_mask(), which seems a little unnatural, or else
> doing a copy of the mask, which gets us back to where we started
> with GFP_KERNEL allocations.

The drain is kinda racy, but there are approaches we can use...

The semantics we should aim for are "all pages which were in pagevecs
when lru_add_drain_all() was called will be drained".

And "any pages which are added to pagevecs after the call to
lru_add_drain_all() should be drained by either this call or by the
next call".  ie: lru_add_drain_all() should not cause any
concurrently-added pages to just be lost to the next lru_add_drain_all()

> Alternately, you could imagine a workqueue API that just took a function
> pointer that returned for each cpu whether or not to schedule work on
> that cpu:
> 
>   typedef bool (*check_work_func_t)(void *data, int cpu);
>   schedule_on_some_cpus(work_func_t func, check_work_func_t checker, void *data);
> 
> For the lru stuff we wouldn't need to use a "data" pointer but I'd include
> it since it's cheap, pretty standard, and makes the API more general.

uh, yes, that.  I do think this implementation and interface is better
than adding the temporary cpumask.

> Or, I suppose, one other possibility is just a compile-time struct
> cpumask that we guard with a lock.  It seems a bit like overkill for
> the very common case of not specifying CPUMASK_OFFSTACK.
> 
> All that said, I still tend to like the simple cpumask data-driven approach,
> assuming you're comfortable with the possible GFP_KERNEL allocation.
> 
> > Also, what's up with the get_online_cpus() handling? 
> > schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and
> > the schedule_on_cpu_mask() documentation forgot to mention it.
> 
> The missing get_online_cpus() for lru_add_drain_all() is in v6 of the
> patch from Aug 9 (v5 had Tejun's feedback for doing validity-checking
> on the schedule_on_cpu_mask() mask argument, and v6 added his Ack
> and the missing get/put_online_cpus).

Well yes, I noticed this after sending the earlier email.  He may be
old and wrinkly, but I do suggest that the guy who wrote and maintains
that code could have got a cc.  Also, I'm unclear why Tejun went and
committed the code to -next without cc'ing me.

> schedule_on_each_cpu() obviously uses get/put_online_cpus and needs it;
> I would argue that there's no need to mention it in the docs for
> schedule_on_cpu_mask() since if you're going to pass the cpu_online_mask
> you'd better know that you should get/put_online_cpus().


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 19:35             ` Andrew Morton
@ 2013-08-13 20:19               ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 20:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello,

On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote:
> I don't know how lots-of-kmallocs compares with alloc_percpu()
> performance-wise.

If this is actually performance sensitive, the logical thing to do
would be pre-allocating per-cpu buffers instead of depending on
dynamic allocation.  Do the invocations need to be stackable?

> That being said, the `cpumask_var_t mask' which was added to
> lru_add_drain_all() is unneeded - it's just a temporary storage which
> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever
> which is passed a function pointer of type `bool (*call_needed)(int
> cpu, void *data)'.

I'd really like to avoid that.  Decision callbacks tend to get abused
quite often and it's rather sad to do that because cpumask cannot be
prepared and passed around.  Can't it just preallocate all necessary
resources?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 20:19               ` Tejun Heo
@ 2013-08-13 20:31                 ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 20:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, 13 Aug 2013 16:19:58 -0400 Tejun Heo <tj@kernel.org> wrote:

> Hello,
> 
> On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote:
> > I don't know how lots-of-kmallocs compares with alloc_percpu()
> > performance-wise.
> 
> If this is actually performance sensitive,

I've always assumed that it isn't performance-sensitive. 
schedule_on_each_cpu() has to be slow as a dog.

Then again, why does this patchset exist?  It's a performance
optimisation so presumably someone cares.  But not enough to perform
actual measurements :(

> the logical thing to do
> would be pre-allocating per-cpu buffers instead of depending on
> dynamic allocation.  Do the invocations need to be stackable?

schedule_on_each_cpu() calls should of course happen concurrently, and
there's the question of whether we wish to permit async
schedule_on_each_cpu().  Leaving the calling CPU twiddling thumbs until
everyone has finished is pretty sad if the caller doesn't want that.

> > That being said, the `cpumask_var_t mask' which was added to
> > lru_add_drain_all() is unneeded - it's just a temporary storage which
> > can be eliminated by creating a schedule_on_each_cpu_cond() or whatever
> > which is passed a function pointer of type `bool (*call_needed)(int
> > cpu, void *data)'.
> 
> I'd really like to avoid that.  Decision callbacks tend to get abused
> quite often and it's rather sad to do that because cpumask cannot be
> prepared and passed around.  Can't it just preallocate all necessary
> resources?

I don't recall seeing such abuse.  It's a very common and powerful
tool, and not implementing it because some dummy may abuse it weakens
the API for all non-dummies.  That allocation is simply unneeded.

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 20:31                 ` Andrew Morton
@ 2013-08-13 20:59                   ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 20:59 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/13/2013 3:35 PM, Andrew Morton wrote:
> He may be
> old and wrinkly, but I do suggest that the guy who wrote and maintains
> that code could have got a cc. 

Sorry about that - I just went by what MAINTAINERS shows.  There's no
specific maintainer listed for the swap code.  I probably should have
looked at the final Signed-off-by's on recent commits.

On 8/13/2013 4:31 PM, Andrew Morton wrote:
> On Tue, 13 Aug 2013 16:19:58 -0400 Tejun Heo <tj@kernel.org> wrote:
>
>> Hello,
>>
>> On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote:
>>> I don't know how lots-of-kmallocs compares with alloc_percpu()
>>> performance-wise.
>> If this is actually performance sensitive,
> I've always assumed that it isn't performance-sensitive. 
> schedule_on_each_cpu() has to be slow as a dog.
>
> Then again, why does this patchset exist?  It's a performance
> optimisation so presumably someone cares.  But not enough to perform
> actual measurements :(

The patchset exists because of the difference between zero overhead on
cpus that don't have drainable lrus, and non-zero overhead.  This turns
out to be important on workloads where nohz cores are handling 10 Gb
traffic in userspace and really, really don't want to be interrupted,
or they drop packets on the floor.

>> the logical thing to do
>> would be pre-allocating per-cpu buffers instead of depending on
>> dynamic allocation.  Do the invocations need to be stackable?
> schedule_on_each_cpu() calls should if course happen concurrently, and
> there's the question of whether we wish to permit async
> schedule_on_each_cpu().  Leaving the calling CPU twiddling thumbs until
> everyone has finished is pretty sad if the caller doesn't want that.
>
>>> That being said, the `cpumask_var_t mask' which was added to
>>> lru_add_drain_all() is unneeded - it's just a temporary storage which
>>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever
>>> which is passed a function pointer of type `bool (*call_needed)(int
>>> cpu, void *data)'.
>> I'd really like to avoid that.  Decision callbacks tend to get abused
>> quite often and it's rather sad to do that because cpumask cannot be
>> prepared and passed around.  Can't it just preallocate all necessary
>> resources?
> I don't recall seeing such abuse.  It's a very common and powerful
> tool, and not implementing it because some dummy may abuse it weakens
> the API for all non-dummies.  That allocation is simply unneeded.

The problem with a callback version is that it's not clear that
it helps with Andrew's original concern about allocation.  In
schedule_on_each_cpu() we need to track which cpus we scheduled work
on so that we can flush_work() after all the work has been scheduled.
Even with a callback approach, we'd still end up wanting to record
the results of the callback in the first pass so that we could
properly flush_work() on the second pass.  Given that, having the
caller just create the cpumask in the first place makes more sense.

As Andrew suggests, we could also just have an asynchronous version
of schedule_on_each_cpu(), but I don't know if that's beneficial
enough to the swap code to make it worthwhile, or if it's tricky
enough on the workqueue side to make it not worthwhile; it does seem
like we would need to rethink the work_struct allocation, and
e.g. avoid re-issuing the flush to a cpu that hadn't finished the
previous flush, etc.  Potentially tricky, particularly if
lru_add_drain_all() doesn't care about performance in the first place.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 20:31                 ` Andrew Morton
@ 2013-08-13 21:07                   ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 21:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello,

On Tue, Aug 13, 2013 at 01:31:35PM -0700, Andrew Morton wrote:
> > the logical thing to do
> > would be pre-allocating per-cpu buffers instead of depending on
> > dynamic allocation.  Do the invocations need to be stackable?
> 
> schedule_on_each_cpu() calls should if course happen concurrently, and
> there's the question of whether we wish to permit async
> schedule_on_each_cpu().  Leaving the calling CPU twiddling thumbs until
> everyone has finished is pretty sad if the caller doesn't want that.

Oh, I meant the caller-side, not schedule_on_each_cpu().  So, if this
particular caller is performance sensitive for some reason, it makes
sense to pre-allocate resources on the caller side if the caller
doesn't need to be reentrant or called concurrently.

> I don't recall seeing such abuse.  It's a very common and powerful
> tool, and not implementing it because some dummy may abuse it weakens
> the API for all non-dummies.  That allocation is simply unneeded.

More powerful and flexible doesn't always equal better, and I think
being simple and less prone to abuse are important characteristics
that APIs should have.  It feels a bit silly to me to push the API
that way when doing so doesn't even solve the allocation problem.  It
doesn't really buy us much while making the interface more complex.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 20:59                   ` Chris Metcalf
@ 2013-08-13 21:13                     ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 21:13 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> >
> > Then again, why does this patchset exist?  It's a performance
> > optimisation so presumably someone cares.  But not enough to perform
> > actual measurements :(
> 
> The patchset exists because of the difference between zero overhead on
> cpus that don't have drainable lrus, and non-zero overhead.  This turns
> out to be important on workloads where nohz cores are handling 10 Gb
> traffic in userspace and really, really don't want to be interrupted,
> or they drop packets on the floor.

But what is the effect of the patchset?  Has it been tested against the
problematic workload(s)?

> >> the logical thing to do
> >> would be pre-allocating per-cpu buffers instead of depending on
> >> dynamic allocation.  Do the invocations need to be stackable?
> > schedule_on_each_cpu() calls should if course happen concurrently, and
> > there's the question of whether we wish to permit async
> > schedule_on_each_cpu().  Leaving the calling CPU twiddling thumbs until
> > everyone has finished is pretty sad if the caller doesn't want that.
> >
> >>> That being said, the `cpumask_var_t mask' which was added to
> >>> lru_add_drain_all() is unneeded - it's just a temporary storage which
> >>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever
> >>> which is passed a function pointer of type `bool (*call_needed)(int
> >>> cpu, void *data)'.
> >> I'd really like to avoid that.  Decision callbacks tend to get abused
> >> quite often and it's rather sad to do that because cpumask cannot be
> >> prepared and passed around.  Can't it just preallocate all necessary
> >> resources?
> > I don't recall seeing such abuse.  It's a very common and powerful
> > tool, and not implementing it because some dummy may abuse it weakens
> > the API for all non-dummies.  That allocation is simply unneeded.
> 
> The problem with a callback version is that it's not clear that
> it helps with Andrew's original concern about allocation.  In
> schedule_on_each_cpu() we need to track which cpus we scheduled work
> on so that we can flush_work() after all the work has been scheduled.
> Even with a callback approach, we'd still end up wanting to record
> the results of the callback in the first pass so that we could
> properly flush_work() on the second pass.  Given that, having the
> caller just create the cpumask in the first place makes more sense.

Nope.  schedule_on_each_cpu() can just continue to do

	for_each_cpu(cpu, mask)
		flush_work(per_cpu_ptr(works, cpu));

lru_add_drain_all() can do that as well.  An optimisation would be to
tag the unused works as not-needing-flush.  Set work.entry.next to
NULL, for example.
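
In other words, something like this (rough sketch; call_needed() is the
hypothetical decision callback, and it relies on alloc_percpu() handing
back zeroed memory so an unqueued work has a NULL entry.next):

	/* first pass: only queue work where it's needed */
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		if (!call_needed(cpu, data))
			continue;		/* work stays zeroed */
		INIT_WORK(work, func);
		schedule_work_on(cpu, work);
	}

	/* second pass: flush only the works we actually queued */
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		if (work->entry.next)
			flush_work(work);
	}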

If we were to switch from alloc_percpu() to a bunch of kmallocs then
they'd need to be assembled into a list, which is pretty trivial.

> As Andrew suggests, we could also just have an asynchronous version
> of schedule_on_each_cpu(), but I don't know if that's beneficial
> enough to the swap code to make it worthwhile, or if it's tricky
> enough on the workqueue side to make it not worthwhile; it does seem
> like we would need to rethink the work_struct allocation, and
> e.g. avoid re-issuing the flush to a cpu that hadn't finished the
> previous flush, etc.  Potentially tricky, particularly if
> lru_add_drain_all() doesn't care about performance in the first place.

lru_add_drain_all() wants synchronous behavior.  I don't know how much
call there would be for an async schedule_on_each_cpu_cond().


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 21:07                   ` Tejun Heo
@ 2013-08-13 21:16                     ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 21:16 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, 13 Aug 2013 17:07:19 -0400 Tejun Heo <tj@kernel.org> wrote:

> > I don't recall seeing such abuse.  It's a very common and powerful
> > tool, and not implementing it because some dummy may abuse it weakens
> > the API for all non-dummies.  That allocation is simply unneeded.
> 
> More powerful and flexible doesn't always equal better and I think
> being simple and less prone to abuses are important characteristics
> that APIs should have.

I've yet to see any evidence that callback APIs have been abused and
I've yet to see any reasoning which makes me believe that this one will
be abused.

>  It feels a bit silly to me to push the API
> that way when doing so doesn't even solve the allocation problem.

It removes the need to perform a cpumask allocation in
lru_add_drain_all().

>  It doesn't really buy us much while making the interface more complex.

It's a superior interface.

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 21:16                     ` Andrew Morton
@ 2013-08-13 22:07                       ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 22:07 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello,

On Tue, Aug 13, 2013 at 02:16:21PM -0700, Andrew Morton wrote:
> I've yet to see any evidence that callback APIs have been abused and
> I've yet to see any reasoning which makes me believe that this one will
> be abused.

Well, off the top of my head.

* In general, it's clunkier.  Callbacks become artificial boundaries
  across which context has to be carried over explicitly.  It often
  involves packing data into a temporary struct.  The artificial
  barrier also generally makes the logic more difficult to follow.
  This is a pretty general problem with callback-based interfaces and
  is why many programming languages / conventions prefer iterator-style
  interfaces over callback-based ones.  It makes the code a lot easier
  to organize around the looping construct.  Here, it isn't as
  pronounced because the thing naturally requires a callback anyway.

* From the API itself, it often isn't clear what restrictions apply to
  the context the callback is called under.  That is partly a
  documentation problem, but it's pretty easy to get wrong inadvertently
  as the code evolves and can be difficult to spot because the context
  isn't apparent.

Moving away from callbacks started with higher-level languages, but the
kernel is on board too where possible.  This one is muddier as the
interface is async in nature, but it's still at least partially
applicable.

> >  It feels a bit silly to me to push the API
> > that way when doing so doesn't even solve the allocation problem.
> 
> It removes the need to perform a cpumask allocation in
> lru_add_drain_all().

But that doesn't really solve anything, does it?

> >  It doesn't really buy us much while making the interface more complex.
> 
> It's a superior interface.

It is more flexible but at the same time clunkier.  I wouldn't call it
superior and the flexibility doesn't buy us much here.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 21:13                     ` Andrew Morton
@ 2013-08-13 22:13                       ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 22:13 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/13/2013 5:13 PM, Andrew Morton wrote:
> On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
>
>>> Then again, why does this patchset exist?  It's a performance
>>> optimisation so presumably someone cares.  But not enough to perform
>>> actual measurements :(
>> The patchset exists because of the difference between zero overhead on
>> cpus that don't have drainable lrus, and non-zero overhead.  This turns
>> out to be important on workloads where nohz cores are handling 10 Gb
>> traffic in userspace and really, really don't want to be interrupted,
>> or they drop packets on the floor.
> But what is the effect of the patchset?  Has it been tested against the
> problematic workload(s)?

Yes.  The result is that syscalls such as mlockall(), which otherwise interrupt
every core, don't interrupt the cores that are running purely in userspace.
Since they are purely in userspace they don't have any drainable pagevecs,
so the patchset means they don't get interrupted and don't drop packets.
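
For reference, the kind of call that triggers the problem looks roughly
like this in the application (illustrative snippet only, not actual
customer code); this is the path that otherwise ends up IPI'ing every
core via lru_add_drain_all():

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

int main(void)
{
	/* Pin all current and future pages before entering the
	 * latency-sensitive packet-processing loop. */
	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
		perror("mlockall");
		return EXIT_FAILURE;
	}

	/* ... packet-processing loop runs on the nohz cores ... */

	return EXIT_SUCCESS;
}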

I implemented this against Linux 2.6.38 and our home-grown version of nohz
cpusets back in July 2012, and we have been shipping it to customers since then.

>>>> the logical thing to do
>>>> would be pre-allocating per-cpu buffers instead of depending on
>>>> dynamic allocation.  Do the invocations need to be stackable?
>>> schedule_on_each_cpu() calls should of course happen concurrently, and
>>> there's the question of whether we wish to permit async
>>> schedule_on_each_cpu().  Leaving the calling CPU twiddling thumbs until
>>> everyone has finished is pretty sad if the caller doesn't want that.
>>>
>>>>> That being said, the `cpumask_var_t mask' which was added to
>>>>> lru_add_drain_all() is unneeded - it's just a temporary storage which
>>>>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever
>>>>> which is passed a function pointer of type `bool (*call_needed)(int
>>>>> cpu, void *data)'.
>>>> I'd really like to avoid that.  Decision callbacks tend to get abused
>>>> quite often and it's rather sad to do that because cpumask cannot be
>>>> prepared and passed around.  Can't it just preallocate all necessary
>>>> resources?
>>> I don't recall seeing such abuse.  It's a very common and powerful
>>> tool, and not implementing it because some dummy may abuse it weakens
>>> the API for all non-dummies.  That allocation is simply unneeded.
>> The problem with a callback version is that it's not clear that
>> it helps with Andrew's original concern about allocation.  In
>> schedule_on_each_cpu() we need to track which cpus we scheduled work
>> on so that we can flush_work() after all the work has been scheduled.
>> Even with a callback approach, we'd still end up wanting to record
>> the results of the callback in the first pass so that we could
>> properly flush_work() on the second pass.  Given that, having the
>> caller just create the cpumask in the first place makes more sense.
> Nope.  schedule_on_each_cpu() can just continue to do
>
> 	for_each_cpu(cpu, mask)
> 		flush_work(per_cpu_ptr(works, cpu));
>
> lru_add_drain_all() can do that as well.  An optimisation would be to
> tag the unused works as not-needing-flush.  Set work.entry.next to
> NULL, for example.

Good point - we already have a per-cpu data structure sitting there
handy, so we might as well use it rather than allocating another cpumask.
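
Concretely, I'm thinking of something along these lines (untested
sketch, not the actual patch; cpu_needs_drain() is a stand-in for the
real pagevec checks):

#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

static int schedule_on_needed_cpus(work_func_t func,
				   bool (*cpu_needs_drain)(int cpu))
{
	int cpu;
	struct work_struct __percpu *works;

	works = alloc_percpu(struct work_struct);
	if (!works)
		return -ENOMEM;

	get_online_cpus();

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		if (cpu_needs_drain(cpu)) {
			INIT_WORK(work, func);
			schedule_work_on(cpu, work);
		} else {
			/* tag it so the flush loop can skip it */
			work->entry.next = NULL;
		}
	}

	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		if (work->entry.next)
			flush_work(work);
	}

	put_online_cpus();
	free_percpu(works);
	return 0;
}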

>> As Andrew suggests, we could also just have an asynchronous version
>> of schedule_on_each_cpu(), but I don't know if that's beneficial
>> enough to the swap code to make it worthwhile, or if it's tricky
>> enough on the workqueue side to make it not worthwhile; it does seem
>> like we would need to rethink the work_struct allocation, and
>> e.g. avoid re-issuing the flush to a cpu that hadn't finished the
>> previous flush, etc.  Potentially tricky, particularly if
>> lru_add_drain_all() doesn't care about performance in the first place.
> lru_add_drain_all() wants synchronous behavior.  I don't know how much
> call there would be for an async schedule_on_each_cpu_cond().

Let me work up a patchset using callbacks, and we can look at that as
an example and see how it feels for both workqueue.c and swap.c. 

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:07                       ` Tejun Heo
@ 2013-08-13 22:18                         ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 22:18 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, 13 Aug 2013 18:07:00 -0400 Tejun Heo <tj@kernel.org> wrote:

> Hello,
> 
> On Tue, Aug 13, 2013 at 02:16:21PM -0700, Andrew Morton wrote:
> > I've yet to see any evidence that callback APIs have been abused and
> > I've yet to see any reasoning which makes me believe that this one will
> > be abused.
> 
> Well, off the top of my head.
> 
> * In general, it's clunkier.  Callbacks become artificial boundaries
>   across which context has to be carried over explicitly.  That often
>   means packing data into a temporary struct, and the artificial
>   barrier generally makes the logic more difficult to follow.  This is
>   a pretty general problem with callback-based interfaces, and it is
>   why many programming languages / conventions prefer iterator-style
>   interfaces over callback-based ones: the code is a lot easier to
>   organize around the looping construct.  Here it isn't as pronounced
>   because the thing naturally requires a callback anyway.
> 
> * From the API itself, it often isn't clear what restrictions apply to
>   the context the callback is called under.  That is partly a
>   documentation problem, but it is easy to get wrong inadvertently as
>   the code evolves and can be difficult to spot because the context
>   isn't apparent.
> 
> Moving away from callbacks started with higher-level languages, but the
> kernel is on board too where possible.  This one is muddier since the
> interface is async in nature, but the point is still at least partially
> applicable.

I don't buy it.  The callback simply determines whether "we need to
schedule work on this cpu".  It's utterly simple.  Nobody will have
trouble understanding or using such a thing.

> > >  It feels a bit silly to me to push the API
> > > that way when doing so doesn't even solve the allocation problem.
> > 
> > It removes the need to perform a cpumask allocation in
> > lru_add_drain_all().
> 
> But that doesn't really solve anything, does it?

It removes one memory allocation and initialisation per call.  It
removes an entire for_each_online_cpu() loop.

> > >  It doesn't really buy us much while making the interface more complex.
> > 
> > It's a superior interface.
> 
> It is more flexible but at the same time clunkier.

The callback predicate is a quite natural thing in this case.

>  I wouldn't call it
> superior and the flexibility doesn't buy us much here.

It buys quite a lot and demonstrates why a callback interface is better.


I really don't understand what's going on here.  You're advocating for
a weaker kernel interface and for inferior kernel runtime behaviour. 
Forcing callers to communicate their needs via a large,
dynamically-allocated temporary rather than directly.  And what do we
get in return for all this?  Some stuff about callbacks which frankly
has me scratching my head.


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:13                       ` Chris Metcalf
@ 2013-08-13 22:26                         ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 22:26 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer, KOSAKI Motohiro

On Tue, 13 Aug 2013 18:13:48 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> On 8/13/2013 5:13 PM, Andrew Morton wrote:
> > On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
> >
> >>> Then again, why does this patchset exist?  It's a performance
> >>> optimisation so presumably someone cares.  But not enough to perform
> >>> actual measurements :(
> >> The patchset exists because of the difference between zero overhead on
> >> cpus that don't have drainable lrus, and non-zero overhead.  This turns
> >> out to be important on workloads where nohz cores are handling 10 Gb
> >> traffic in userspace and really, really don't want to be interrupted,
> >> or they drop packets on the floor.
> > But what is the effect of the patchset?  Has it been tested against the
> > problematic workload(s)?
> 
> Yes.  The result is that syscalls such as mlockall(), which otherwise interrupt
> every core, don't interrupt the cores that are running purely in userspace.
> Since they are purely in userspace they don't have any drainable pagevecs,
> so the patchset means they don't get interrupted and don't drop packets.
> 
> I implemented this against Linux 2.6.38 and our home-grown version of nohz
> cpusets back in July 2012, and we have been shipping it to customers since then.

argh.

Those per-cpu LRU pagevecs were a nasty but very effective locking
amortization hack back in, umm, 2002.  They have caused quite a lot of
weird corner-case behaviour, resulting in all the lru_add_drain_all()
calls sprinkled around the place.  I'd like to nuke the whole thing,
but that would require a fundamental rethink/rework of all the LRU list
locking.

According to the 8891d6da17db0f changelog, the lru_add_drain_all() in
sys_mlock() isn't really required: "it isn't must.  but it reduce the
failure of moving to unevictable list.  its failure can rescue in
vmscan later.  but reducing is better."

I suspect we could just kill it.

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:18                         ` Andrew Morton
@ 2013-08-13 22:33                           ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 22:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello, Andrew.

On Tue, Aug 13, 2013 at 03:18:05PM -0700, Andrew Morton wrote:
> I don't buy it.  The callback simply determines whether "we need to
> schedule work on this cpu".  It's utterly simple.  Nobody will have
> trouble understanding or using such a thing.

Well, I don't buy that either.  A callback-based interface has its
issues.  The difference we're talking about here is pretty minute but
then again the improvement brought on by the callback is pretty minute
too.

> It removes one memory allocation and initialisation per call.  It
> removes an entire for_each_online_cpu() loop.

But that doesn't solve the original problem at all and while it
removes the loop, it also adds a separate function.

> I really don't understand what's going on here.  You're advocating for
> a weaker kernel interface and for inferior kernel runtime behaviour. 
> Forcing callers to communicate their needs via a large,
> dynamically-allocated temporary rather than directly.  And what do we
> get in return for all this?  Some stuff about callbacks which frankly
> has me scratching my head.

Well, it is a fairly heavy path and you're pushing for an optimization
which won't make any noticeable difference at all.  And, yes, I do
think we need to stick to simpler APIs wherever possible.  Sure, the
difference is minute here, but the addition of a test callback doesn't
buy us anything either, so what's the point?  The allocation doesn't
even exist for the vast majority of configurations.  If kmalloc from
that site is problematic, the right thing to do is to pre-allocate
resources on the caller side, isn't it?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:33                           ` Tejun Heo
@ 2013-08-13 22:47                             ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-13 22:47 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, 13 Aug 2013 18:33:04 -0400 Tejun Heo <tj@kernel.org> wrote:

> Hello, Andrew.
> 
> On Tue, Aug 13, 2013 at 03:18:05PM -0700, Andrew Morton wrote:
> > I don't buy it.  The callback simply determines whether "we need to
> > schedule work on this cpu".  It's utterly simple.  Nobody will have
> > trouble understanding or using such a thing.
> 
> Well, I don't buy that either.  A callback-based interface has its
> issues.

No it hasn't.  It's a common and simple technique which we all understand.

>  The difference we're talking about here is pretty minute but
> then again the improvement brought on by the callback is pretty minute
> too.

It's a relatively small improvement in the lru_add_drain_all() case. 
Other callsites can gain improvements as well.

> > It removes one memory allocation and initialisation per call.  It
> > removes an entire for_each_online_cpu() loop.
> 
> But that doesn't solve the original problem at all and while it
> removes the loop, it also adds a separate function.

It results in superior runtime code.  At this and potentially other
callsites.

> > I really don't understand what's going on here.  You're advocating for
> > a weaker kernel interface and for inferior kernel runtime behaviour. 
> > Forcing callers to communicate their needs via a large,
> > dynamically-allocated temporary rather than directly.  And what do we
> > get in return for all this?  Some stuff about callbacks which frankly
> > has me scratching my head.
> 
> Well, it is a fairly heavy path and you're pushing for an optimization
> which won't make any noticeable difference at all.  And, yes, I do
> think we need to stick to simpler APIs wherever possible.  Sure, the
> difference is minute here, but the addition of a test callback doesn't
> buy us anything either, so what's the point?

It does buy us things, as I've repeatedly described.  You keep on
saying things which demonstrably aren't so.  I think I'll give up now.



^ permalink raw reply	[flat|nested] 104+ messages in thread

* [PATCH v7 1/2] workqueue: add schedule_on_each_cpu_cond
  2013-08-13 22:13                       ` Chris Metcalf
@ 2013-08-13 22:51                         ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 22:51 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This API supports running work on a subset of the online
cpus determined by a callback function.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v7: try a version with callbacks instead of cpu masks.
Either this or v6 seem like reasonable solutions.

v6: add Tejun's Acked-by, and add missing get/put_cpu_online to
lru_add_drain_all().

v5: provide validity checking on the cpumask for schedule_on_cpu_mask.
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a0ed78a..c5ee29f 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -17,6 +17,7 @@ struct workqueue_struct;
 
 struct work_struct;
 typedef void (*work_func_t)(struct work_struct *work);
+typedef bool (*work_cond_func_t)(void *data, int cpu);
 void delayed_work_timer_fn(unsigned long __data);
 
 /*
@@ -471,6 +472,8 @@ extern void drain_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
 
 extern int schedule_on_each_cpu(work_func_t func);
+extern int schedule_on_each_cpu_cond(work_func_t func, work_cond_func_t cond,
+				     void *data);
 
 int execute_in_process_context(work_func_t fn, struct execute_work *);
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4..5c5b534 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2999,6 +2999,60 @@ int schedule_on_each_cpu(work_func_t func)
 }
 
 /**
+ * schedule_on_each_cpu_cond - execute a function synchronously on each
+ *   online CPU if requested by a condition callback.
+ * @func: the function to call
+ * @cond: the callback function to determine whether to schedule the work
+ * @data: opaque data passed to the callback function
+ *
+ * schedule_on_each_cpu_cond() calls @cond for each online cpu (in the
+ * context of the current cpu), and for each cpu for which @cond returns
+ * true, it executes @func using the system workqueue.  The function
+ * blocks until all CPUs on which work was scheduled have completed.
+ * schedule_on_each_cpu_cond() is very slow.
+ *
+ * The @cond callback is called in the same context as the original
+ * call to schedule_on_each_cpu_cond().
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu_cond(work_func_t func,
+			      work_cond_func_t cond, void *data)
+{
+	int cpu;
+	struct work_struct __percpu *works;
+
+	works = alloc_percpu(struct work_struct);
+	if (!works)
+		return -ENOMEM;
+
+	get_online_cpus();
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		if (cond(data, cpu)) {
+			INIT_WORK(work, func);
+			schedule_work_on(cpu, work);
+		} else {
+			work->entry.next = NULL;
+		}
+	}
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		if (work->entry.next)
+			flush_work(work);
+	}
+
+	put_online_cpus();
+	free_percpu(works);
+	return 0;
+}
+
+/**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
  *
  * Forces execution of the kernel-global workqueue and blocks until its
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:13                       ` Chris Metcalf
@ 2013-08-13 22:53                         ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 22:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This change makes lru_add_drain_all() only selectively interrupt
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode where calling mlockall(), for
example, otherwise will interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
v7: try a version with callbacks instead of cpu masks.
Either this or v6 seem like reasonable solutions.

v6: add Tejun's Acked-by, and add missing get/put_cpu_online to
lru_add_drain_all().

v5: provide validity checking on the cpumask for schedule_on_cpu_mask.
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 mm/swap.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..fe3a488 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -673,6 +683,14 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+static bool lru_add_drain_cond(void *data, int cpu)
+{
+	return pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		need_activate_page_drain(cpu);
+}
+
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
 	lru_add_drain();
@@ -683,7 +701,8 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 int lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	return schedule_on_each_cpu_cond(lru_add_drain_per_cpu,
+					 lru_add_drain_cond, NULL);
 }
 
 /*
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:47                             ` Andrew Morton
@ 2013-08-13 23:03                               ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 23:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello, Andrew.

On Tue, Aug 13, 2013 at 03:47:40PM -0700, Andrew Morton wrote:
> > Well, I don't buy that either.  A callback-based interface has its
> > issues.
> 
> No it hasn't.  It's a common and simple technique which we all understand.

It sure has its uses, but it has ceded some of its former use cases
to better constructs which are easier to use and maintain.  I'm not
saying it's black and white here, as the thing is callback-based
anyway, but I was trying to point out the general disadvantages of a
callback-based interface.  If you're saying a callback-based interface
isn't clunkier than constructs which can be embedded on the caller
side, this discussion probably won't be very fruitful.

> It's a relatively small improvement in the lru_add_drain_all() case. 
> Other callsites can gain improvements as well.

Well, if we're talking about minute performance differences, then for
non-huge configurations it'll actually be a small performance
degradation, as there will be more instructions and the control will be
jumping back and forth.

> It results in superior runtime code.  At this and potentially other
> callsites.

It's actually inferior in the majority of cases.

> It does buy us things, as I've repeatedly described.  You keep on
> saying things which demonstrably aren't so.  I think I'll give up now.

I just don't think it's something clear cut and it doesn't even matter
for the problem at hand.  Let's please talk about how to solve the
actual problem.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:26                         ` Andrew Morton
@ 2013-08-13 23:04                           ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 23:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer, KOSAKI Motohiro

On 8/13/2013 6:26 PM, Andrew Morton wrote:
> On Tue, 13 Aug 2013 18:13:48 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
>
>> On 8/13/2013 5:13 PM, Andrew Morton wrote:
>>> On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
>>>
>>>>> Then again, why does this patchset exist?  It's a performance
>>>>> optimisation so presumably someone cares.  But not enough to perform
>>>>> actual measurements :(
>>>> The patchset exists because of the difference between zero overhead on
>>>> cpus that don't have drainable lrus, and non-zero overhead.  This turns
>>>> out to be important on workloads where nohz cores are handling 10 Gb
>>>> traffic in userspace and really, really don't want to be interrupted,
>>>> or they drop packets on the floor.
>>> But what is the effect of the patchset?  Has it been tested against the
>>> problematic workload(s)?
>> Yes.  The result is that syscalls such as mlockall(), which otherwise interrupt
>> every core, don't interrupt the cores that are running purely in userspace.
>> Since they are purely in userspace they don't have any drainable pagevecs,
>> so the patchset means they don't get interrupted and don't drop packets.
>>
>> I implemented this against Linux 2.6.38 and our home-grown version of nohz
>> cpusets back in July 2012, and we have been shipping it to customers since then.
> argh.
>
> Those per-cpu LRU pagevecs were a nasty but very effective locking
> amortization hack back in, umm, 2002.  They have caused quite a lot of
> weird corner-case behaviour, resulting in all the lru_add_drain_all()
> calls sprinkled around the place.  I'd like to nuke the whole thing,
> but that would require a fundamental rethink/rework of all the LRU list
> locking.
>
> According to the 8891d6da17db0f changelog, the lru_add_drain_all() in
> sys_mlock() isn't really required: "it isn't must.  but it reduce the
> failure of moving to unevictable list.  its failure can rescue in
> vmscan later.  but reducing is better."
>
> I suspect we could just kill it.

That's probably true, but I suspect this change is still worthwhile for
nohz environments.  There are other calls to lru_add_drain_all(), and
you just don't want anything in the kernel that interrupts every core
when only a subset could be interrupted.  If the kernel can avoid
generating unnecessary interrupts to uninvolved cores, you can make
guarantees about jitter on cores that are running dedicated userspace code.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 22:53                         ` Chris Metcalf
@ 2013-08-13 23:29                           ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 23:29 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello,

On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote:
>  int lru_add_drain_all(void)
>  {
> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
> +	return schedule_on_each_cpu_cond(lru_add_drain_per_cpu,
> +					 lru_add_drain_cond, NULL);

It won't nest and doing it simultaneously won't buy anything, right?
Wouldn't it be better to protect it with a mutex and define all
necessary resources statically (yeah, cpumask is pain in the ass and I
think we should un-deprecate cpumask_t for static use cases)?  Then,
there'd be no allocation to worry about on the path.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 23:29                           ` Tejun Heo
@ 2013-08-13 23:32                             ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 23:32 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/13/2013 7:29 PM, Tejun Heo wrote:
> Hello,
>
> On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote:
>>  int lru_add_drain_all(void)
>>  {
>> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
>> +	return schedule_on_each_cpu_cond(lru_add_drain_per_cpu,
>> +					 lru_add_drain_cond, NULL);
> It won't nest and doing it simultaneously won't buy anything, right?

Correct on both counts, I think.

> Wouldn't it be better to protect it with a mutex and define all
> necessary resources statically (yeah, cpumask is pain in the ass and I
> think we should un-deprecate cpumask_t for static use cases)?  Then,
> there'd be no allocation to worry about on the path.

If allocation is a real problem on this path, I think this is probably
OK, though I don't want to speak for Andrew.  You could just guard it
with a trylock and any caller that tried to start it while it was
locked could just return happy that it was going on.

I'll put out a version that does that and see how that looks
for comparison's sake.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 23:29                           ` Tejun Heo
@ 2013-08-13 23:44                             ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-13 23:44 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/13/2013 7:29 PM, Tejun Heo wrote:
> It won't nest and doing it simultaneously won't buy anything, right?
> Wouldn't it be better to protect it with a mutex and define all
> necessary resources statically (yeah, cpumask is pain in the ass and I
> think we should un-deprecate cpumask_t for static use cases)?  Then,
> there'd be no allocation to worry about on the path.

Here's what lru_add_drain_all() looks like with a guarding mutex.
Pretty much the same code complexity as when we have to allocate the
cpumask, and there really aren't any issues from locking, since we can assume
all is well and return immediately if we fail to get the lock.

int lru_add_drain_all(void)
{
        static struct cpumask mask;
        static DEFINE_MUTEX(lock);
        int cpu, rc;

        if (!mutex_trylock(&lock))
                return 0;  /* already ongoing elsewhere */

        cpumask_clear(&mask);
        get_online_cpus();

        /*
         * Figure out which cpus need flushing.  It's OK if we race
         * with changes to the per-cpu lru pvecs, since it's no worse
         * than if we flushed all cpus, since a cpu could still end
         * up putting pages back on its pvec before we returned.
         * And this avoids interrupting other cpus unnecessarily.
         */
        for_each_online_cpu(cpu) {
                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
                    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu))
                        cpumask_set_cpu(cpu, &mask);
        }

        rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, &mask);

        put_online_cpus();
        mutex_unlock(&lock);
        return rc;
}

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 23:44                             ` Chris Metcalf
@ 2013-08-13 23:51                               ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-13 23:51 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, Aug 13, 2013 at 07:44:55PM -0400, Chris Metcalf wrote:
> int lru_add_drain_all(void)
> {
>         static struct cpumask mask;

Instead of cpumask,

>         static DEFINE_MUTEX(lock);

you can DEFINE_PER_CPU(struct work_struct, ...).

>         for_each_online_cpu(cpu) {
>                 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
>                     pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
>                     pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
>                     need_activate_page_drain(cpu))
>                         cpumask_set_cpu(cpu, &mask);

and schedule the work items directly.

>         }
> 
>         rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, &mask);

Open coding flushing can be a bit bothersome but you can create a
per-cpu workqueue and schedule work items on it and then flush the
workqueue instead too.
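
For example, roughly like the below (sketch only; the workqueue and work
item names here are made up, not existing symbols):

static struct workqueue_struct *lru_drain_wq;	/* alloc_workqueue() at init */
static DEFINE_PER_CPU(struct work_struct, lru_drain_works);

static void drain_cpus(const struct cpumask *mask, work_func_t fn)
{
	int cpu;

	for_each_cpu(cpu, mask) {
		struct work_struct *work = &per_cpu(lru_drain_works, cpu);

		INIT_WORK(work, fn);
		queue_work_on(cpu, lru_drain_wq, work);
	}

	/* a single call waits for everything queued above */
	flush_workqueue(lru_drain_wq);
}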

No matter how flushing is implemented, the path wouldn't have any
memory allocation, which I thought was the topic of the thread, no?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-13 23:32                             ` Chris Metcalf
@ 2013-08-14  6:46                               ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-14  6:46 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Tue, 13 Aug 2013 19:32:37 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> On 8/13/2013 7:29 PM, Tejun Heo wrote:
> > Hello,
> >
> > On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote:
> >>  int lru_add_drain_all(void)
> >>  {
> >> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
> >> +	return schedule_on_each_cpu_cond(lru_add_drain_per_cpu,
> >> +					 lru_add_drain_cond, NULL);

This version looks nice to me.  It's missing the conversion of
schedule_on_each_cpu(), but I suppose that will be pretty simple.

> > It won't nest and doing it simultaneously won't buy anything, right?
> 
> Correct on both counts, I think.

I'm glad you understood the question :(

What does "nest" mean?  lru_add_drain_all() calls itself recursively,
presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL)
route?  If that ever happens then we'd certainly want to know about it.
Hopefully PF_MEMALLOC would prevent infinite recursion.

If "nest" means something else then please enlighten me!

As for "doing it simultaneously", I assume we're referring to
concurrent execution from separate threads.  If so, why would that "buy
us anything"?  Confused.  As long as each thread sees "all pages which
were in pagevecs at the time I called lru_add_drain_all() get spilled
onto the LRU" then we're good.  afaict the implementation will do this.

> > Wouldn't it be better to protect it with a mutex and define all
> > necessary resources statically (yeah, cpumask is pain in the ass and I
> > think we should un-deprecate cpumask_t for static use cases)?  Then,
> > there'd be no allocation to worry about on the path.
> 
> If allocation is a real problem on this path, I think this is probably

Well as you pointed out, alloc_percpu() can already do a GFP_KERNEL
allocation, so adding another GFP_KERNEL allocation won't cause great
problems.  But the patchset demonstrates that the additional allocation
isn't needed.

> OK, though I don't want to speak for Andrew.  You could just guard it
> with a trylock and any caller that tried to start it while it was
> locked could just return happy that it was going on.
> 
> I'll put out a version that does that and see how that looks
> for comparison's sake.

That one's no good.  If thread A is holding the mutex, thread B will
bail out and we'd break lru_add_drain_all()'s contract ("all pages
which ...", above).


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-14  6:46                               ` Andrew Morton
@ 2013-08-14 13:05                                 ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-14 13:05 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello,

On Tue, Aug 13, 2013 at 11:46:29PM -0700, Andrew Morton wrote:
> What does "nest" mean?  lru_add_drain_all() calls itself recursively,
> presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL)
> route?  If that ever happens then we'd certainly want to know about it.
> Hopefully PF_MEMALLOC would prevent infinite recursion.
> 
> If "nest" means something else then please enlighten me!
> 
> As for "doing it simultaneously", I assume we're referring to
> concurrent execution from separate threads.  If so, why would that "buy
> us anything"?  Confused.  As long as each thread sees "all pages which
> were in pagevecs at the time I called lru_add_drain_all() get spilled
> onto the LRU" then we're good.  afaict the implementation will do this.

I was wondering whether we can avoid all allocations by just
pre-allocating all the resources.  If it can't call itself once we get
rid of all allocations, and running multiple instances of it doesn't
buy us anything, the best solution would be to allocate the work items
statically and synchronize their use with a mutex.  That way the whole
thing wouldn't need any allocation.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-14  6:46                               ` Andrew Morton
@ 2013-08-14 16:03                                 ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-14 16:03 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/14/2013 2:46 AM, Andrew Morton wrote:
> On Tue, 13 Aug 2013 19:32:37 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
>
>> On 8/13/2013 7:29 PM, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote:
>>>>  int lru_add_drain_all(void)
>>>>  {
>>>> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
>>>> +	return schedule_on_each_cpu_cond(lru_add_drain_per_cpu,
>>>> +					 lru_add_drain_cond, NULL);
> This version looks nice to me.  It's missing the conversion of
> schedule_on_each_cpu(), but I suppose that will be pretty simple.

I assume you were thinking of something like a NULL "cond" function pointer
meaning "no condition" in the implementation, and then schedule_on_each_cpu
just calls schedule_on_each_cpu_cond(work, NULL, NULL)?
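
Something like this, presumably (just a sketch against the proposed v7
schedule_on_each_cpu_cond() signature, not compile-tested):

int schedule_on_each_cpu(work_func_t func)
{
	/* a NULL @cond (and NULL @data) would mean "every online cpu" */
	return schedule_on_each_cpu_cond(func, NULL, NULL);
}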

>>> It won't nest and doing it simultaneously won't buy anything, right?
>> Correct on both counts, I think.
> I'm glad you understood the question :(
>
> What does "nest" mean?  lru_add_drain_all() calls itself recursively,
> presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL)
> route?  If that ever happens then we'd certainly want to know about it.
> Hopefully PF_MEMALLOC would prevent infinite recursion.

Well, I think "it won't nest" is true in exactly the way you say - or
maybe we want to say "it shouldn't nest".  Catching the condition would
obviously be a good thing.

> If "nest" means something else then please enlighten me!
>
> As for "doing it simultaneously", I assume we're referring to
> concurrent execution from separate threads.  If so, why would that "buy
> us anything"?  Confused.  As long as each thread sees "all pages which
> were in pagevecs at the time I called lru_add_drain_all() get spilled
> onto the LRU" then we're good.  afaict the implementation will do this.

If we can avoid doing it simultaneously, we avoid having to do any
allocation at all, which seems like a win.  Suppose we simply take a
mutex_lock() around the routine, and also test which cpus actually need
to be drained before issuing the drain.  Then, if two cpus try to drain
simultaneously, one succeeds and drains everything; the second blocks
on the mutex until the first is done, then quickly enters the code,
finds that no cpus require draining, and returns.  That is faster than
the current code, where the second caller would have fired drain
requests at every cpu concurrently with the first caller's requests.
Frankly, this seems like a pretty reasonable solution.  We can
implement it purely in swap.c using existing workqueue APIs.

Note that lru_add_drain_all() always returns success this way, so I've
changed the API to make it a void function.  Considering none of the
callers currently check the error return value anyway, that seemed like
a straightforward thing to do.  :-)

Unfortunately DEFINE_PER_CPU doesn't work at function scope, so
I have to make it file-static.

Assuming the existence of my "need_activate_page_drain()" helper from
the earlier versions of the patch set, lru_add_drain_all() looks
like the following:


static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
	static DEFINE_MUTEX(lock);
	int cpu;

	mutex_lock(&lock);
	get_online_cpus();

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
		    need_activate_page_drain(cpu)) {
			INIT_WORK(work, lru_add_drain_per_cpu);
			schedule_work_on(cpu, work);
		} else {
			work->entry.next = NULL;
		}
	}

	for_each_online_cpu(cpu) {
		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

		if (work->entry.next)
			flush_work(work);
	}

	put_online_cpus();
	mutex_unlock(&lock);
}

Tejun, I don't know if you have a better idea for how to mark a
work_struct as being "not used" so we can set and test it here.
Is setting entry.next to NULL good?  Should we offer it as an API
in the workqueue header?
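
If we did offer it, I'd imagine something like the following (hypothetical
names, just to make the question concrete -- this is not an existing API):

static inline void work_mark_unused(struct work_struct *work)
{
	work->entry.next = NULL;
}

static inline bool work_is_unused(struct work_struct *work)
{
	return work->entry.next == NULL;
}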

We could wrap the whole thing in a new workqueue API too, of course
(schedule_on_each_cpu_cond_sequential??) but it seems better at this
point to wait until we find another caller with similar needs, and only
then factor the code into a new workqueue API.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-14 16:03                                 ` Chris Metcalf
@ 2013-08-14 16:57                                   ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-14 16:57 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello, Chris.

On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote:
> Tejun, I don't know if you have a better idea for how to mark a
> work_struct as being "not used" so we can set and test it here.
> Is setting entry.next to NULL good?  Should we offer it as an API
> in the workqueue header?

Maybe simply defining a static cpumask would be cleaner?

> We could wrap the whole thing in a new workqueue API too, of course
> (schedule_on_each_cpu_cond_sequential??) but it seems better at this
> point to wait until we find another caller with similar needs, and only
> then factor the code into a new workqueue API.

We can have e.g. __schedule_on_cpu(fn, pcpu_works) but yeah it seems a
bit excessive at this point.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-14 16:57                                   ` Tejun Heo
@ 2013-08-14 17:18                                     ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-14 17:18 UTC (permalink / raw)
  To: Tejun Heo, Andrew Morton
  Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker,
	Cody P Schafer

On 8/14/2013 12:57 PM, Tejun Heo wrote:
> Hello, Chris.
>
> On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote:
>> Tejun, I don't know if you have a better idea for how to mark a
>> work_struct as being "not used" so we can set and test it here.
>> Is setting entry.next to NULL good?  Should we offer it as an API
>> in the workqueue header?
> Maybe simply defining a static cpumask would be cleaner?

I think you're right, actually.  Andrew, Tejun, how does this look?


static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

void lru_add_drain_all(void)
{
        static DEFINE_MUTEX(lock);
        static struct cpumask has_work;
        int cpu;

        mutex_lock(&lock);
        get_online_cpus();
        cpumask_clear(&has_work);

        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
                    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
                    need_activate_page_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        schedule_work_on(cpu, work);
                        cpumask_set_cpu(cpu, &has_work);
                }
        }

        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

        put_online_cpus();
        mutex_unlock(&lock);
}

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective
  2013-08-14 17:18                                     ` Chris Metcalf
@ 2013-08-14 20:07                                       ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-14 20:07 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello, Chris.

On Wed, Aug 14, 2013 at 01:18:31PM -0400, Chris Metcalf wrote:
> On 8/14/2013 12:57 PM, Tejun Heo wrote:
> > Hello, Chris.
> >
> > On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote:
> >> Tejun, I don't know if you have a better idea for how to mark a
> >> work_struct as being "not used" so we can set and test it here.
> >> Is setting entry.next to NULL good?  Should we offer it as an API
> >> in the workqueue header?
> > Maybe simply defining a static cpumask would be cleaner?
> 
> I think you're right, actually.  Andrew, Tejun, how does this look?

Looks good to me.  Please feel free to add

 Reviewed-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 104+ messages in thread

* [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 20:07                                       ` Tejun Heo
@ 2013-08-14 20:22                                         ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-14 20:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

This change makes lru_add_drain_all() only selectively interrupt
the cpus that have per-cpu free pages that can be drained.

This is important in nohz mode where calling mlockall(), for
example, otherwise will interrupt every core unnecessarily.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
---
v8: Here's an actual git patch after Tejun's review.
Pull everything inline into lru_add_drain_all(), and use
a mutex plus build-time allocated work_structs to avoid allocation.

v7: try a version with callbacks instead of cpu masks.
Either this or v6 seem like reasonable solutions.

v6: add Tejun's Acked-by, and add missing get/put_cpu_online to
lru_add_drain_all().

v5: provide validity checking on the cpumask for schedule_on_cpu_mask.
By providing an all-or-nothing EINVAL check, we impose the requirement
that the calling code actually know clearly what it's trying to do.
(Note: no change to the mm/swap.c commit)

v4: don't lose possible -ENOMEM in schedule_on_each_cpu()
(Note: no change to the mm/swap.c commit)

v3: split commit into two, one for workqueue and one for mm, though both
should probably be taken through -mm.

 include/linux/swap.h |  2 +-
 mm/swap.c            | 44 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index d95cde5..acea8e0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -247,7 +247,7 @@ extern void activate_page(struct page *);
 extern void mark_page_accessed(struct page *);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
-extern int lru_add_drain_all(void);
+extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void swap_setup(void);
diff --git a/mm/swap.c b/mm/swap.c
index 4a1d0d2..8d19543 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+}
+
 void activate_page(struct page *page)
 {
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
+static bool need_activate_page_drain(int cpu)
+{
+	return false;
+}
+
 void activate_page(struct page *page)
 {
 	struct zone *zone = page_zone(page);
@@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 	lru_add_drain();
 }
 
-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
+void lru_add_drain_all(void)
 {
-	return schedule_on_each_cpu(lru_add_drain_per_cpu);
+	static DEFINE_MUTEX(lock);
+	static struct cpumask has_work;
+	int cpu;
+
+	mutex_lock(&lock);
+	get_online_cpus();
+	cpumask_clear(&has_work);
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
+		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    need_activate_page_drain(cpu)) {
+			INIT_WORK(work, lru_add_drain_per_cpu);
+			schedule_work_on(cpu, work);
+			cpumask_set_cpu(cpu, &has_work);
+		}
+	}
+
+	for_each_cpu(cpu, &has_work)
+		flush_work(&per_cpu(lru_add_drain_work, cpu));
+
+	put_online_cpus();
+	mutex_unlock(&lock);
 }
 
 /*
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 104+ messages in thread

* Re: [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 20:22                                         ` Chris Metcalf
@ 2013-08-14 20:44                                           ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-14 20:44 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> This change makes lru_add_drain_all() only selectively interrupt
> the cpus that have per-cpu free pages that can be drained.
> 
> This is important in nohz mode where calling mlockall(), for
> example, otherwise will interrupt every core unnecessarily.
> 

I think the patch will work, but it's a bit sad to no longer gain the
general ability to do schedule_on_some_cpus().  Oh well.

> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -247,7 +247,7 @@ extern void activate_page(struct page *);
>  extern void mark_page_accessed(struct page *);
>  extern void lru_add_drain(void);
>  extern void lru_add_drain_cpu(int cpu);
> -extern int lru_add_drain_all(void);
> +extern void lru_add_drain_all(void);
>  extern void rotate_reclaimable_page(struct page *page);
>  extern void deactivate_page(struct page *page);
>  extern void swap_setup(void);
> diff --git a/mm/swap.c b/mm/swap.c
> index 4a1d0d2..8d19543 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
>  		pagevec_lru_move_fn(pvec, __activate_page, NULL);
>  }
>  
> +static bool need_activate_page_drain(int cpu)
> +{
> +	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
> +}

static int need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu));
}

would be shorter and faster.  bool rather sucks that way.  It's a
performance-vs-niceness thing.  I guess one has to look at the call
frequency when deciding.

>  void activate_page(struct page *page)
>  {
>  	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
> @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
>  {
>  }
>  
> +static bool need_activate_page_drain(int cpu)
> +{
> +	return false;
> +}
> +
>  void activate_page(struct page *page)
>  {
>  	struct zone *zone = page_zone(page);
> @@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
>  	lru_add_drain();
>  }
>  
> -/*
> - * Returns 0 for success
> - */
> -int lru_add_drain_all(void)
> +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
> +
> +void lru_add_drain_all(void)
>  {
> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
> +	static DEFINE_MUTEX(lock);
> +	static struct cpumask has_work;
> +	int cpu;
> +
> +	mutex_lock(&lock);

This is a bit scary but I expect it will be OK - later threads will
just twiddle thumbs while some other thread does all or most of their
work for them.

> +	get_online_cpus();
> +	cpumask_clear(&has_work);
> +
> +	for_each_online_cpu(cpu) {
> +		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
> +
> +		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
> +		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
> +		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
> +		    need_activate_page_drain(cpu)) {
> +			INIT_WORK(work, lru_add_drain_per_cpu);

This initialization is only needed once per boot but I don't see a
convenient way of doing this.
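
(The only thing that comes to mind is a one-time initcall, which hardly
seems worth it.  Sketch only, hypothetical function name:

	static int __init lru_add_drain_work_init(void)
	{
		int cpu;

		for_each_possible_cpu(cpu)
			INIT_WORK(&per_cpu(lru_add_drain_work, cpu),
				  lru_add_drain_per_cpu);
		return 0;
	}
	early_initcall(lru_add_drain_work_init);

so I wouldn't bother.)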

> +			schedule_work_on(cpu, work);
> +			cpumask_set_cpu(cpu, &has_work);
> +		}
> +	}
> +
> +	for_each_cpu(cpu, &has_work)

for_each_online_cpu()?

> +		flush_work(&per_cpu(lru_add_drain_work, cpu));
> +
> +	put_online_cpus();
> +	mutex_unlock(&lock);
>  }


^ permalink raw reply	[flat|nested] 104+ messages in thread

* Re: [PATCH v8] mm: make lru_add_drain_all() selective
@ 2013-08-14 20:44                                           ` Andrew Morton
  0 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-14 20:44 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> This change makes lru_add_drain_all() only selectively interrupt
> the cpus that have per-cpu free pages that can be drained.
> 
> This is important in nohz mode where calling mlockall(), for
> example, otherwise will interrupt every core unnecessarily.
> 

I think the patch will work, but it's a bit sad to no longer gain the
general ability to do schedule_on_some_cpus().  Oh well.

> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -247,7 +247,7 @@ extern void activate_page(struct page *);
>  extern void mark_page_accessed(struct page *);
>  extern void lru_add_drain(void);
>  extern void lru_add_drain_cpu(int cpu);
> -extern int lru_add_drain_all(void);
> +extern void lru_add_drain_all(void);
>  extern void rotate_reclaimable_page(struct page *page);
>  extern void deactivate_page(struct page *page);
>  extern void swap_setup(void);
> diff --git a/mm/swap.c b/mm/swap.c
> index 4a1d0d2..8d19543 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu)
>  		pagevec_lru_move_fn(pvec, __activate_page, NULL);
>  }
>  
> +static bool need_activate_page_drain(int cpu)
> +{
> +	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
> +}

static int need_activate_page_drain(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu));
}

would be shorter and faster.  bool rather sucks that way.  It's a
performance-vs-niceness thing.  I guess one has to look at the call
frequency when deciding.

>  void activate_page(struct page *page)
>  {
>  	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
> @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu)
>  {
>  }
>  
> +static bool need_activate_page_drain(int cpu)
> +{
> +	return false;
> +}
> +
>  void activate_page(struct page *page)
>  {
>  	struct zone *zone = page_zone(page);
> @@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
>  	lru_add_drain();
>  }
>  
> -/*
> - * Returns 0 for success
> - */
> -int lru_add_drain_all(void)
> +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
> +
> +void lru_add_drain_all(void)
>  {
> -	return schedule_on_each_cpu(lru_add_drain_per_cpu);
> +	static DEFINE_MUTEX(lock);
> +	static struct cpumask has_work;
> +	int cpu;
> +
> +	mutex_lock(&lock);

This is a bit scary but I expect it will be OK - later threads will
just twiddle thumbs while some other thread does all or most of their
work for them.

> +	get_online_cpus();
> +	cpumask_clear(&has_work);
> +
> +	for_each_online_cpu(cpu) {
> +		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
> +
> +		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
> +		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
> +		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
> +		    need_activate_page_drain(cpu)) {
> +			INIT_WORK(work, lru_add_drain_per_cpu);

This initialization is only needed once per boot but I don't see a
convenient way of doing this.
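
One way to do the initialization just once (a sketch only; the initcall
and function name here are made up, not part of the patch) would be:

static int __init lru_add_drain_work_init(void)
{
	int cpu;

	/* Set up each per-cpu work item once at boot so that
	 * lru_add_drain_all() only has to queue and flush them. */
	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu(lru_add_drain_work, cpu),
			  lru_add_drain_per_cpu);
	return 0;
}
early_initcall(lru_add_drain_work_init);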

> +			schedule_work_on(cpu, work);
> +			cpumask_set_cpu(cpu, &has_work);
> +		}
> +	}
> +
> +	for_each_cpu(cpu, &has_work)

for_each_online_cpu()?

> +		flush_work(&per_cpu(lru_add_drain_work, cpu));
> +
> +	put_online_cpus();
> +	mutex_unlock(&lock);
>  }


* Re: [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 20:44                                           ` Andrew Morton
@ 2013-08-14 20:50                                             ` Tejun Heo
  -1 siblings, 0 replies; 104+ messages in thread
From: Tejun Heo @ 2013-08-14 20:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

Hello,

On Wed, Aug 14, 2013 at 01:44:30PM -0700, Andrew Morton wrote:
> > +static bool need_activate_page_drain(int cpu)
> > +{
> > +	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
> > +}
> 
> static int need_activate_page_drain(int cpu)
> {
> 	return pagevec_count(&per_cpu(activate_page_pvecs, cpu));
> }
> 
> would be shorter and faster.  bool rather sucks that way.  It's a
> performance-vs-niceness thing.  I guess one has to look at the call
> frequency when deciding.

"!= 0" can be dropped but I'm fairly sure the compiler would be able
to figure out that the type conversion can be skipped.  It's a trivial
optimization.

> > +			schedule_work_on(cpu, work);
> > +			cpumask_set_cpu(cpu, &has_work);
> > +		}
> > +	}
> > +
> > +	for_each_cpu(cpu, &has_work)
> 
> for_each_online_cpu()?

That would lead to flushing work items which aren't used and may not
have been initialized yet, no?
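
Concretely (illustrative, not quoted from the patch), the difference
between the two flush loops:

	/* Only flushes work items that were INIT_WORK()ed and queued above. */
	for_each_cpu(cpu, &has_work)
		flush_work(&per_cpu(lru_add_drain_work, cpu));

	/* Would also call flush_work() on per-cpu work structs that were
	 * never initialized or queued on this pass. */
	for_each_online_cpu(cpu)
		flush_work(&per_cpu(lru_add_drain_work, cpu));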

> > +		flush_work(&per_cpu(lru_add_drain_work, cpu));
> > +
> > +	put_online_cpus();
> > +	mutex_unlock(&lock);
> >  }
> 

Thanks.

-- 
tejun


* Re: [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 20:50                                             ` Tejun Heo
@ 2013-08-14 21:03                                               ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-14 21:03 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Wed, 14 Aug 2013 16:50:29 -0400 Tejun Heo <tj@kernel.org> wrote:

> > > +	for_each_cpu(cpu, &has_work)
> > 
> > for_each_online_cpu()?
> 
> That would lead to flushing work items which aren't used and may not
> have been initialized yet, no?

doh, I confused for_each_cpu with for_each_possible_cpu.  Dumb name.
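
For reference, the three iterators walk different sets (a throwaway
example, not from the patch):

static void show_cpu_iterators(const struct cpumask *has_work)
{
	int cpu;

	/* Only the CPUs set in the given mask. */
	for_each_cpu(cpu, has_work)
		pr_info("cpu %d is in the mask\n", cpu);

	/* Every CPU that is online right now. */
	for_each_online_cpu(cpu)
		pr_info("cpu %d is online\n", cpu);

	/* Every CPU that could ever be brought online. */
	for_each_possible_cpu(cpu)
		pr_info("cpu %d is possible\n", cpu);
}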


* Re: [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 20:50                                             ` Tejun Heo
@ 2013-08-14 21:07                                               ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-14 21:07 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Wed, 14 Aug 2013 16:50:29 -0400 Tejun Heo <tj@kernel.org> wrote:

> On Wed, Aug 14, 2013 at 01:44:30PM -0700, Andrew Morton wrote:
> > > +static bool need_activate_page_drain(int cpu)
> > > +{
> > > +	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
> > > +}
> > 
> > static int need_activate_page_drain(int cpu)
> > {
> > 	return pagevec_count(&per_cpu(activate_page_pvecs, cpu));
> > }
> > 
> > would be shorter and faster.  bool rather sucks that way.  It's a
> > performance-vs-niceness thing.  I guess one has to look at the call
> > frequency when deciding.
> 
> "!= 0" can be dropped but I'm fairly sure the compiler would be able
> to figure out that the type conversion can be skipped.  It's a trivial
> optimization.

The != 0 can surely be removed and that shouldn't make any difference
to generated code.

The compiler will always need to do the int-to-bool conversion and
that's overhead which is added by using bool.

It's possible that the compiler will optimise away the int-to-bool
conversion when inlining this function into a callsite.  I don't know
whether the compiler _does_ do this, and it will be version-dependent.
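
Side by side (the suffixed names are only for illustration; neither is in
the patch):

/* bool: the pagevec count is converted to true/false on return. */
static bool need_activate_page_drain_bool(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
}

/* int: the raw count is returned; callers only test it for non-zero,
 * so no conversion is required. */
static int need_activate_page_drain_int(int cpu)
{
	return pagevec_count(&per_cpu(activate_page_pvecs, cpu));
}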




* Re: [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 20:22                                         ` Chris Metcalf
@ 2013-08-14 21:12                                           ` Andrew Morton
  -1 siblings, 0 replies; 104+ messages in thread
From: Andrew Morton @ 2013-08-14 21:12 UTC (permalink / raw)
  To: Chris Metcalf
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:

> This change makes lru_add_drain_all() only selectively interrupt
> the cpus that have per-cpu free pages that can be drained.
> 
> This is important in nohz mode where calling mlockall(), for
> example, otherwise will interrupt every core unnecessarily.

Changelog isn't very informative.  I added this:

: This is important on workloads where nohz cores are handling 10 Gb traffic
: in userspace.  Those CPUs do not enter the kernel and place pages into LRU
: pagevecs and they really, really don't want to be interrupted, or they
: drop packets on the floor.

to attempt to describe the rationale for the patch.


* Re: [PATCH v8] mm: make lru_add_drain_all() selective
  2013-08-14 21:12                                           ` Andrew Morton
@ 2013-08-14 21:23                                             ` Chris Metcalf
  -1 siblings, 0 replies; 104+ messages in thread
From: Chris Metcalf @ 2013-08-14 21:23 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner,
	Frederic Weisbecker, Cody P Schafer

On 8/14/2013 5:12 PM, Andrew Morton wrote:
> On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote:
>
>> This change makes lru_add_drain_all() only selectively interrupt
>> the cpus that have per-cpu free pages that can be drained.
>>
>> This is important in nohz mode where calling mlockall(), for
>> example, otherwise will interrupt every core unnecessarily.
> Changelog isn't very informative.  I added this:
>
> : This is important on workloads where nohz cores are handling 10 Gb traffic
> : in userspace.  Those CPUs do not enter the kernel and place pages into LRU
> : pagevecs and they really, really don't want to be interrupted, or they
> : drop packets on the floor.
>
> to attempt to describe the rationale for the patch.

Thanks.  More motivational text is always a good thing.

-- 
Chris Metcalf, Tilera Corp.
http://www.tilera.com



Thread overview: 104+ messages
2013-08-06 20:22 [PATCH] mm: make lru_add_drain_all() selective Chris Metcalf
2013-08-06 20:22 ` Chris Metcalf
2013-08-06 20:22 ` [PATCH v2] " Chris Metcalf
2013-08-06 20:22   ` Chris Metcalf
2013-08-07 20:45   ` Tejun Heo
2013-08-07 20:45     ` Tejun Heo
2013-08-07 20:49     ` [PATCH v3 1/2] workqueue: add new schedule_on_cpu_mask() API Chris Metcalf
2013-08-07 20:49       ` Chris Metcalf
2013-08-07 20:52     ` [PATCH v3 2/2] mm: make lru_add_drain_all() selective Chris Metcalf
2013-08-07 20:52       ` Chris Metcalf
2013-08-07 22:48   ` [PATCH v2] " Cody P Schafer
2013-08-07 22:48     ` Cody P Schafer
2013-08-07 20:49     ` [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API Chris Metcalf
2013-08-07 20:49       ` Chris Metcalf
2013-08-09 15:02       ` Tejun Heo
2013-08-09 15:02         ` Tejun Heo
2013-08-09 16:12         ` Chris Metcalf
2013-08-09 16:12           ` Chris Metcalf
2013-08-09 16:30           ` Tejun Heo
2013-08-09 16:30             ` Tejun Heo
2013-08-07 20:49             ` [PATCH v5 " Chris Metcalf
2013-08-07 20:49               ` Chris Metcalf
2013-08-09 17:40               ` Tejun Heo
2013-08-09 17:40                 ` Tejun Heo
2013-08-09 17:49                 ` [PATCH v6 " Chris Metcalf
2013-08-09 17:49                   ` Chris Metcalf
2013-08-09 17:52                 ` [PATCH v6 2/2] mm: make lru_add_drain_all() selective Chris Metcalf
2013-08-09 17:52                   ` Chris Metcalf
2013-08-07 20:52             ` [PATCH v5 " Chris Metcalf
2013-08-07 20:52               ` Chris Metcalf
2013-08-07 20:52     ` [PATCH v4 " Chris Metcalf
2013-08-07 20:52       ` Chris Metcalf
2013-08-12 21:05       ` Andrew Morton
2013-08-12 21:05         ` Andrew Morton
2013-08-13  1:53         ` Chris Metcalf
2013-08-13  1:53           ` Chris Metcalf
2013-08-13 19:35           ` Andrew Morton
2013-08-13 19:35             ` Andrew Morton
2013-08-13 20:19             ` Tejun Heo
2013-08-13 20:19               ` Tejun Heo
2013-08-13 20:31               ` Andrew Morton
2013-08-13 20:31                 ` Andrew Morton
2013-08-13 20:59                 ` Chris Metcalf
2013-08-13 20:59                   ` Chris Metcalf
2013-08-13 21:13                   ` Andrew Morton
2013-08-13 21:13                     ` Andrew Morton
2013-08-13 22:13                     ` Chris Metcalf
2013-08-13 22:13                       ` Chris Metcalf
2013-08-13 22:26                       ` Andrew Morton
2013-08-13 22:26                         ` Andrew Morton
2013-08-13 23:04                         ` Chris Metcalf
2013-08-13 23:04                           ` Chris Metcalf
2013-08-13 22:51                       ` [PATCH v7 1/2] workqueue: add schedule_on_each_cpu_cond Chris Metcalf
2013-08-13 22:51                         ` Chris Metcalf
2013-08-13 22:53                       ` [PATCH v7 2/2] mm: make lru_add_drain_all() selective Chris Metcalf
2013-08-13 22:53                         ` Chris Metcalf
2013-08-13 23:29                         ` Tejun Heo
2013-08-13 23:29                           ` Tejun Heo
2013-08-13 23:32                           ` Chris Metcalf
2013-08-13 23:32                             ` Chris Metcalf
2013-08-14  6:46                             ` Andrew Morton
2013-08-14  6:46                               ` Andrew Morton
2013-08-14 13:05                               ` Tejun Heo
2013-08-14 13:05                                 ` Tejun Heo
2013-08-14 16:03                               ` Chris Metcalf
2013-08-14 16:03                                 ` Chris Metcalf
2013-08-14 16:57                                 ` Tejun Heo
2013-08-14 16:57                                   ` Tejun Heo
2013-08-14 17:18                                   ` Chris Metcalf
2013-08-14 17:18                                     ` Chris Metcalf
2013-08-14 20:07                                     ` Tejun Heo
2013-08-14 20:07                                       ` Tejun Heo
2013-08-14 20:22                                       ` [PATCH v8] " Chris Metcalf
2013-08-14 20:22                                         ` Chris Metcalf
2013-08-14 20:44                                         ` Andrew Morton
2013-08-14 20:44                                           ` Andrew Morton
2013-08-14 20:50                                           ` Tejun Heo
2013-08-14 20:50                                             ` Tejun Heo
2013-08-14 21:03                                             ` Andrew Morton
2013-08-14 21:03                                               ` Andrew Morton
2013-08-14 21:07                                             ` Andrew Morton
2013-08-14 21:07                                               ` Andrew Morton
2013-08-14 21:12                                         ` Andrew Morton
2013-08-14 21:12                                           ` Andrew Morton
2013-08-14 21:23                                           ` Chris Metcalf
2013-08-14 21:23                                             ` Chris Metcalf
2013-08-13 23:44                           ` [PATCH v7 2/2] " Chris Metcalf
2013-08-13 23:44                             ` Chris Metcalf
2013-08-13 23:51                             ` Tejun Heo
2013-08-13 23:51                               ` Tejun Heo
2013-08-13 21:07                 ` [PATCH v4 " Tejun Heo
2013-08-13 21:07                   ` Tejun Heo
2013-08-13 21:16                   ` Andrew Morton
2013-08-13 21:16                     ` Andrew Morton
2013-08-13 22:07                     ` Tejun Heo
2013-08-13 22:07                       ` Tejun Heo
2013-08-13 22:18                       ` Andrew Morton
2013-08-13 22:18                         ` Andrew Morton
2013-08-13 22:33                         ` Tejun Heo
2013-08-13 22:33                           ` Tejun Heo
2013-08-13 22:47                           ` Andrew Morton
2013-08-13 22:47                             ` Andrew Morton
2013-08-13 23:03                             ` Tejun Heo
2013-08-13 23:03                               ` Tejun Heo
