* [PATCH] mm: make lru_add_drain_all() selective @ 2013-08-06 20:22 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-06 20:22 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- mm/swap.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..a6d1809 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online 
CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + get_online_cpus(); + schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..981b1d9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -683,7 +683,43 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. 
It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); + struct pagevec *pvec = &per_cpu(lru_rotate_pvecs, cpu); + int count = pagevec_count(pvec); + int lru; + + if (!count) { + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + count = pagevec_count(pvec); + if (count) + break; + } + } + + if (count) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH] mm: make lru_add_drain_all() selective @ 2013-08-06 20:22 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-06 20:22 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- mm/swap.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 10 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..a6d1809 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online 
CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + get_online_cpus(); + schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..981b1d9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -683,7 +683,43 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. 
It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); + struct pagevec *pvec = &per_cpu(lru_rotate_pvecs, cpu); + int count = pagevec_count(pvec); + int lru; + + if (!count) { + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + count = pagevec_count(pvec); + if (count) + break; + } + } + + if (count) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v2] mm: make lru_add_drain_all() selective 2013-08-06 20:22 ` Chris Metcalf @ 2013-08-06 20:22 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-06 20:22 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- Oops! In the previous version of this change I had just blindly patched it forward from a slightly older version of mm/swap.c. This version is now properly against a version of mm/swap.c that includes all the latest changes to lru_add_drain_all(). include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..a6d1809 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * 
schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + get_online_cpus(); + schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. 
* * Forces execution of the kernel-global workqueue and blocks until its diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v2] mm: make lru_add_drain_all() selective @ 2013-08-06 20:22 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-06 20:22 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- Oops! In the previous version of this change I had just blindly patched it forward from a slightly older version of mm/swap.c. This version is now properly against a version of mm/swap.c that includes all the latest changes to lru_add_drain_all(). include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 10 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..a6d1809 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function 
synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + get_online_cpus(); + schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. 
* * Forces execution of the kernel-global workqueue and blocks until its diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v2] mm: make lru_add_drain_all() selective 2013-08-06 20:22 ` Chris Metcalf @ 2013-08-07 20:45 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-07 20:45 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker On Tue, Aug 06, 2013 at 04:22:39PM -0400, Chris Metcalf wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. Can you please split off workqueue part into a separate patch with proper description? I don't mind the change eventually going through -mm but it's nasty to bury workqueue API update with another change without explanation or justification. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v2] mm: make lru_add_drain_all() selective @ 2013-08-07 20:45 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-07 20:45 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker On Tue, Aug 06, 2013 at 04:22:39PM -0400, Chris Metcalf wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. Can you please split off workqueue part into a separate patch with proper description? I don't mind the change eventually going through -mm but it's nasty to bury workqueue API update with another change without explanation or justification. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* [PATCH v3 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-07 20:45 ` Tejun Heo @ 2013-08-07 20:49 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..a6d1809 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * 
@func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + get_online_cpus(); + schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v3 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-07 20:49 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..a6d1809 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + 
* @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,40 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + get_online_cpus(); + schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v3 2/2] mm: make lru_add_drain_all() selective 2013-08-07 20:45 ` Tejun Heo @ 2013-08-07 20:52 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. 
It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v3 2/2] mm: make lru_add_drain_all() selective @ 2013-08-07 20:52 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. 
+ * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v2] mm: make lru_add_drain_all() selective 2013-08-06 20:22 ` Chris Metcalf @ 2013-08-07 22:48 ` Cody P Schafer -1 siblings, 0 replies; 104+ messages in thread From: Cody P Schafer @ 2013-08-07 22:48 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker On 08/06/2013 01:22 PM, Chris Metcalf wrote: [...] > > /** > + * schedule_on_each_cpu - execute a function synchronously on each online CPU > + * @func: the function to call > + * > + * schedule_on_each_cpu() executes @func on each online CPU using the > + * system workqueue and blocks until all CPUs have completed. > + * schedule_on_each_cpu() is very slow. > + * > + * RETURNS: > + * 0 on success, -errno on failure. > + */ > +int schedule_on_each_cpu(work_func_t func) > +{ > + get_online_cpus(); > + schedule_on_cpu_mask(func, cpu_online_mask); Looks like the return value gets lost here. > + put_online_cpus(); > + return 0; > +} > + > +/** > * flush_scheduled_work - ensure that any scheduled work has run to completion. > * > * Forces execution of the kernel-global workqueue and blocks until its [...] ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v2] mm: make lru_add_drain_all() selective @ 2013-08-07 22:48 ` Cody P Schafer 0 siblings, 0 replies; 104+ messages in thread From: Cody P Schafer @ 2013-08-07 22:48 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker On 08/06/2013 01:22 PM, Chris Metcalf wrote: [...] > > /** > + * schedule_on_each_cpu - execute a function synchronously on each online CPU > + * @func: the function to call > + * > + * schedule_on_each_cpu() executes @func on each online CPU using the > + * system workqueue and blocks until all CPUs have completed. > + * schedule_on_each_cpu() is very slow. > + * > + * RETURNS: > + * 0 on success, -errno on failure. > + */ > +int schedule_on_each_cpu(work_func_t func) > +{ > + get_online_cpus(); > + schedule_on_cpu_mask(func, cpu_online_mask); Looks like the return value gets lost here. > + put_online_cpus(); > + return 0; > +} > + > +/** > * flush_scheduled_work - ensure that any scheduled work has run to completion. > * > * Forces execution of the kernel-global workqueue and blocks until its [...] -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-07 22:48 ` Cody P Schafer @ 2013-08-07 20:49 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..1dacb09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a 
function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,41 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int ret; + get_online_cpus(); + ret = schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return ret; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-07 20:49 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..1dacb09 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2962,17 +2962,18 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each 
online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. - * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; @@ -2981,24 +2982,41 @@ int schedule_on_each_cpu(work_func_t func) if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int ret; + get_online_cpus(); + ret = schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return ret; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-07 20:49 ` Chris Metcalf @ 2013-08-09 15:02 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-09 15:02 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote: > This primitive allows scheduling work to run on a particular set of > cpus described by a "struct cpumask". This can be useful, for example, > if you have a per-cpu variable that requires code execution only if the > per-cpu variable has a certain value (for example, is a non-empty list). So, this allows scheduling work items on !online CPUs. Workqueue does allow scheduling per-cpu work items on offline CPUs if the CPU has ever been online, but the behavior when scheduling work items on cpu which has never been online is undefined. I think the interface at least needs to verify that the target cpus have been online, trigger warning and mask off invalid CPUs otherwise. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-09 15:02 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-09 15:02 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote: > This primitive allows scheduling work to run on a particular set of > cpus described by a "struct cpumask". This can be useful, for example, > if you have a per-cpu variable that requires code execution only if the > per-cpu variable has a certain value (for example, is a non-empty list). So, this allows scheduling work items on !online CPUs. Workqueue does allow scheduling per-cpu work items on offline CPUs if the CPU has ever been online, but the behavior when scheduling work items on cpu which has never been online is undefined. I think the interface at least needs to verify that the target cpus have been online, trigger warning and mask off invalid CPUs otherwise. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-09 15:02 ` Tejun Heo @ 2013-08-09 16:12 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-09 16:12 UTC (permalink / raw) To: Tejun Heo Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/9/2013 11:02 AM, Tejun Heo wrote: > Hello, Chris. > > On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote: >> This primitive allows scheduling work to run on a particular set of >> cpus described by a "struct cpumask". This can be useful, for example, >> if you have a per-cpu variable that requires code execution only if the >> per-cpu variable has a certain value (for example, is a non-empty list). > So, this allows scheduling work items on !online CPUs. Workqueue does > allow scheduling per-cpu work items on offline CPUs if the CPU has > ever been online, but the behavior when scheduling work items on cpu > which has never been online is undefined. I think the interface at > least needs to verify that the the target cpus have been online, > trigger warning and mask off invalid CPUs otherwise. I could certainly make schedule_on_cpu_mask() do sanity checking, perhaps via a WARN_ON_ONCE() if offline cpus were specified, and otherwise just have it create a local struct cpumask that it and's with cpu_online_mask, suitably wrapping with get_online_cpus()/put_online_cpus(). (I'm not sure how to test if a cpu has ever been online, vs whether it's online right now.) I don't want to unnecessarily slow down the existing schedule_on_each_cpu(), so perhaps the implementation should have a static schedule_on_cpu_mask_internal() function that is the same as my previous schedule_on_cpu_mask(), allowing schedule_on_each_cpu() to call it directly to bypass the checking. That said... I wonder if it might make sense to treat this API the same as other APIs that already take a cpu? 
schedule_work_on(), schedule_delayed_work_on(), and queue_delayed_work_on() all take a cpu parameter without API comment or validity checking; queue_work_on() just says "the caller must ensure [the cpu] can't go away". Does it make sense to just add a similar comment to schedule_on_cpu_mask() rather than make this API the first to actually do cpu validity checking? Let me know; I'm happy to respin it either way. -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-09 16:12 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-09 16:12 UTC (permalink / raw) To: Tejun Heo Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/9/2013 11:02 AM, Tejun Heo wrote: > Hello, Chris. > > On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote: >> This primitive allows scheduling work to run on a particular set of >> cpus described by a "struct cpumask". This can be useful, for example, >> if you have a per-cpu variable that requires code execution only if the >> per-cpu variable has a certain value (for example, is a non-empty list). > So, this allows scheduling work items on !online CPUs. Workqueue does > allow scheduling per-cpu work items on offline CPUs if the CPU has > ever been online, but the behavior when scheduling work items on cpu > which has never been online is undefined. I think the interface at > least needs to verify that the the target cpus have been online, > trigger warning and mask off invalid CPUs otherwise. I could certainly make schedule_on_cpu_mask() do sanity checking, perhaps via a WARN_ON_ONCE() if offline cpus were specified, and otherwise just have it create a local struct cpumask that it and's with cpu_online_mask, suitably wrapping with get_online_cpus()/put_online_cpus(). (I'm not sure how to test if a cpu has ever been online, vs whether it's online right now.) I don't want to unnecessarily slow down the existing schedule_on_each_cpu(), so perhaps the implementation should have a static schedule_on_cpu_mask_internal() function that is the same as my previous schedule_on_cpu_mask(), allowing schedule_on_each_cpu() to call it directly to bypass the checking. That said... I wonder if it might make sense to treat this API the same as other APIs that already take a cpu? 
schedule_work_on(), schedule_delayed_work_on(), and queue_delayed_work_on() all take a cpu parameter without API comment or validity checking; queue_work_on() just says "the caller must ensure [the cpu] can't go away". Does it make sense to just add a similar comment to schedule_on_cpu_mask() rather than make this API the first to actually do cpu validity checking? Let me know; I'm happy to respin it either way. -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-09 16:12 ` Chris Metcalf @ 2013-08-09 16:30 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-09 16:30 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Fri, Aug 09, 2013 at 12:12:43PM -0400, Chris Metcalf wrote: > I could certainly make schedule_on_cpu_mask() do sanity checking, > perhaps via a WARN_ON_ONCE() if offline cpus were specified, and > otherwise just have it create a local struct cpumask that it and's > with cpu_online_mask, suitably wrapping with > get_online_cpus()/put_online_cpus(). (I'm not sure how to test if a > cpu has ever been online, vs whether it's online right now.) I I think you'll have to collect it from CPU_ONLINE of workqueue_cpu_up_callback() and I think it probably wouldn't be a bad idea to allow scheduling on CPUs which have been up but aren't currently as that's the current rule for other interfaces anyway. > don't want to unnecessarily slow down the existing > schedule_on_each_cpu(), so perhaps the implementation should have a > static schedule_on_cpu_mask_internal() function that is the same as > my previous schedule_on_cpu_mask(), allowing schedule_on_each_cpu() > to call it directly to bypass the checking. Hmmm.... it's unlikely to make noticeable difference given that it's gonna be bouncing multiple cachelines across all online CPUs. > That said... I wonder if it might make sense to treat this API the > same as other APIs that already take a cpu? schedule_work_on(), > schedule_delayed_work_on(), and queue_delayed_work_on() all take a > cpu parameter without API comment or validity checking; > queue_work_on() just says "the caller must ensure [the cpu] can't go > away". Does it make sense to just add a similar comment to > schedule_on_cpu_mask() rather than make this API the first to > actually do cpu validity checking? 
Yeah, we've been lazy with the sanity check and I think it's a good opportunity to add it. Let's worry about other paths later. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-09 16:30 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-09 16:30 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Fri, Aug 09, 2013 at 12:12:43PM -0400, Chris Metcalf wrote: > I could certainly make schedule_on_cpu_mask() do sanity checking, > perhaps via a WARN_ON_ONCE() if offline cpus were specified, and > otherwise just have it create a local struct cpumask that it and's > with cpu_online_mask, suitably wrapping with > get_online_cpus()/put_online_cpus(). (I'm not sure how to test if a > cpu has ever been online, vs whether it's online right now.) I I think you'll have to collect it from CPU_ONLINE of workqueue_cpu_up_callback() and I think it probably wouldn't be a bad idea to allow scheduling on CPUs which have been up but aren't currently as that's the current rule for other interfaces anyway. > don't want to unnecessarily slow down the existing > schedule_on_each_cpu(), so perhaps the implementation should have a > static schedule_on_cpu_mask_internal() function that is the same as > my previous schedule_on_cpu_mask(), allowing schedule_on_each_cpu() > to call it directly to bypass the checking. Hmmm.... it's unlikely to make noticeable difference given that it's gonna be bouncing multiple cachelines across all online CPUs. > That said... I wonder if it might make sense to treat this API the > same as other APIs that already take a cpu? schedule_work_on(), > schedule_delayed_work_on(), and queue_delayed_work_on() all take a > cpu parameter without API comment or validity checking; > queue_work_on() just says "the caller must ensure [the cpu] can't go > away". Does it make sense to just add a similar comment to > schedule_on_cpu_mask() rather than make this API the first to > actually do cpu validity checking? 
Yeah, we've been lazy with the sanity check and I think it's a good opportunity to add it. Let's worry about other paths later. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* [PATCH v5 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-09 16:30 ` Tejun Heo @ 2013-08-07 20:49 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 51 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..63d504a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -292,6 +292,9 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PL: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* set of cpus that are valid for per-cpu workqueue scheduling */ +static struct cpumask wq_valid_cpus; + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -2962,43 +2965,66 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. 
- * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. You may only specify CPUs + * that are online or have previously been online; specifying an + * invalid CPU mask will return -EINVAL without scheduling any work. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; + if (!cpumask_subset(mask, &wq_valid_cpus)) + return -EINVAL; + works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int ret; + get_online_cpus(); + ret = schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return ret; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. 
* * Forces execution of the kernel-global workqueue and blocks until its @@ -4687,6 +4713,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, list_for_each_entry(wq, &workqueues, list) wq_update_unbound_numa(wq, cpu, true); + /* track the set of cpus that have ever been online */ + cpumask_set_cpu(cpu, &wq_valid_cpus); + mutex_unlock(&wq_pool_mutex); break; } @@ -5011,6 +5040,10 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + /* mark startup cpu as valid */ + cpumask_set_cpu(smp_processor_id(), &wq_valid_cpus); + return 0; } early_initcall(init_workqueues); -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v5 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-07 20:49 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 51 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..63d504a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -292,6 +292,9 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PL: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* set of cpus that are valid for per-cpu workqueue scheduling */ +static struct cpumask wq_valid_cpus; + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -2962,43 +2965,66 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. 
- * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. You may only specify CPUs + * that are online or have previously been online; specifying an + * invalid CPU mask will return -EINVAL without scheduling any work. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; + if (!cpumask_subset(mask, &wq_valid_cpus)) + return -EINVAL; + works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int ret; + get_online_cpus(); + ret = schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return ret; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. 
* * Forces execution of the kernel-global workqueue and blocks until its @@ -4687,6 +4713,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, list_for_each_entry(wq, &workqueues, list) wq_update_unbound_numa(wq, cpu, true); + /* track the set of cpus that have ever been online */ + cpumask_set_cpu(cpu, &wq_valid_cpus); + mutex_unlock(&wq_pool_mutex); break; } @@ -5011,6 +5040,10 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + /* mark startup cpu as valid */ + cpumask_set_cpu(smp_processor_id(), &wq_valid_cpus); + return 0; } early_initcall(init_workqueues); -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v5 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-07 20:49 ` Chris Metcalf @ 2013-08-09 17:40 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-09 17:40 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote: > This primitive allows scheduling work to run on a particular set of > cpus described by a "struct cpumask". This can be useful, for example, > if you have a per-cpu variable that requires code execution only if the > per-cpu variable has a certain value (for example, is a non-empty list). > > Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> Acked-by: Tejun Heo <tj@kernel.org> Please feel free to route with the second patch. Thanks! -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v5 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-09 17:40 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-09 17:40 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, Aug 07, 2013 at 04:49:44PM -0400, Chris Metcalf wrote: > This primitive allows scheduling work to run on a particular set of > cpus described by a "struct cpumask". This can be useful, for example, > if you have a per-cpu variable that requires code execution only if the > per-cpu variable has a certain value (for example, is a non-empty list). > > Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> Acked-by: Tejun Heo <tj@kernel.org> Please feel free to route with the second patch. Thanks! -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* [PATCH v6 1/2] workqueue: add new schedule_on_cpu_mask() API 2013-08-09 17:40 ` Tejun Heo @ 2013-08-09 17:49 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-09 17:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 51 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..63d504a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -292,6 +292,9 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PL: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* set of cpus that are valid for per-cpu workqueue scheduling */ +static struct cpumask wq_valid_cpus; + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -2962,43 +2965,66 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. 
- * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. You may only specify CPUs + * that are online or have previously been online; specifying an + * invalid CPU mask will return -EINVAL without scheduling any work. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; + if (!cpumask_subset(mask, &wq_valid_cpus)) + return -EINVAL; + works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int ret; + get_online_cpus(); + ret = schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return ret; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. 
* * Forces execution of the kernel-global workqueue and blocks until its @@ -4687,6 +4713,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, list_for_each_entry(wq, &workqueues, list) wq_update_unbound_numa(wq, cpu, true); + /* track the set of cpus that have ever been online */ + cpumask_set_cpu(cpu, &wq_valid_cpus); + mutex_unlock(&wq_pool_mutex); break; } @@ -5011,6 +5040,10 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + /* mark startup cpu as valid */ + cpumask_set_cpu(smp_processor_id(), &wq_valid_cpus); + return 0; } early_initcall(init_workqueues); -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v6 1/2] workqueue: add new schedule_on_cpu_mask() API @ 2013-08-09 17:49 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-09 17:49 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This primitive allows scheduling work to run on a particular set of cpus described by a "struct cpumask". This can be useful, for example, if you have a per-cpu variable that requires code execution only if the per-cpu variable has a certain value (for example, is a non-empty list). Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 51 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..71a3fe7 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -13,6 +13,8 @@ #include <linux/atomic.h> #include <linux/cpumask.h> +struct cpumask; + struct workqueue_struct; struct work_struct; @@ -470,6 +472,7 @@ extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); +extern int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..63d504a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -292,6 +292,9 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PL: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ +/* set of cpus that are valid for per-cpu workqueue scheduling */ +static struct cpumask wq_valid_cpus; + /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); @@ -2962,43 +2965,66 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_on_each_cpu - execute a function synchronously on each online CPU + * schedule_on_cpu_mask - execute a function synchronously on each listed CPU * @func: the function to call + * @mask: the cpumask to invoke the function on * - * schedule_on_each_cpu() executes @func on each online CPU using the + * schedule_on_cpu_mask() executes @func on each listed CPU using the * system workqueue and blocks until all CPUs have completed. 
- * schedule_on_each_cpu() is very slow. + * schedule_on_cpu_mask() is very slow. You may only specify CPUs + * that are online or have previously been online; specifying an + * invalid CPU mask will return -EINVAL without scheduling any work. * * RETURNS: * 0 on success, -errno on failure. */ -int schedule_on_each_cpu(work_func_t func) +int schedule_on_cpu_mask(work_func_t func, const struct cpumask *mask) { int cpu; struct work_struct __percpu *works; + if (!cpumask_subset(mask, &wq_valid_cpus)) + return -EINVAL; + works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; - get_online_cpus(); - - for_each_online_cpu(cpu) { + for_each_cpu(cpu, mask) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) + for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); - put_online_cpus(); free_percpu(works); return 0; } /** + * schedule_on_each_cpu - execute a function synchronously on each online CPU + * @func: the function to call + * + * schedule_on_each_cpu() executes @func on each online CPU using the + * system workqueue and blocks until all CPUs have completed. + * schedule_on_each_cpu() is very slow. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int schedule_on_each_cpu(work_func_t func) +{ + int ret; + get_online_cpus(); + ret = schedule_on_cpu_mask(func, cpu_online_mask); + put_online_cpus(); + return ret; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. 
* * Forces execution of the kernel-global workqueue and blocks until its @@ -4687,6 +4713,9 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, list_for_each_entry(wq, &workqueues, list) wq_update_unbound_numa(wq, cpu, true); + /* track the set of cpus that have ever been online */ + cpumask_set_cpu(cpu, &wq_valid_cpus); + mutex_unlock(&wq_pool_mutex); break; } @@ -5011,6 +5040,10 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + /* mark startup cpu as valid */ + cpumask_set_cpu(smp_processor_id(), &wq_valid_cpus); + return 0; } early_initcall(init_workqueues); -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v6 2/2] mm: make lru_add_drain_all() selective 2013-08-09 17:40 ` Tejun Heo @ 2013-08-09 17:52 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-09 17:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
mm/swap.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..f42561a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,34 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + get_online_cpus(); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + put_online_cpus(); + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v6 2/2] mm: make lru_add_drain_all() selective @ 2013-08-09 17:52 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-09 17:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
mm/swap.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..f42561a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,34 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + get_online_cpus(); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + put_online_cpus(); + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v5 2/2] mm: make lru_add_drain_all() selective 2013-08-09 16:30 ` Tejun Heo @ 2013-08-07 20:52 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v5: provide validity checking on the cpumask for schedule_on_cpu_mask By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v5 2/2] mm: make lru_add_drain_all() selective @ 2013-08-07 20:52 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v5: provide validity checking on the cpumask for schedule_on_cpu_mask By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-07 22:48 ` Cody P Schafer @ 2013-08-07 20:52 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. 
It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-07 20:52 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-07 20:52 UTC (permalink / raw) To: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. mm/swap.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d4a862b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + cpumask_var_t mask; + int cpu, rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_clear(mask); + + /* + * Figure out which cpus need flushing. 
It's OK if we race + * with changes to the per-cpu lru pvecs, since it's no worse + * than if we flushed all cpus, since a cpu could still end + * up putting pages back on its pvec before we returned. + * And this avoids interrupting other cpus unnecessarily. + */ + for_each_online_cpu(cpu) { + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) + cpumask_set_cpu(cpu, mask); + } + + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); + + free_cpumask_var(mask); + return rc; } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-07 20:52 ` Chris Metcalf @ 2013-08-12 21:05 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-12 21:05 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 7 Aug 2013 16:52:22 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. > > ... > > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) > pagevec_lru_move_fn(pvec, __activate_page, NULL); > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > +} > + > void activate_page(struct page *page) > { > if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { > @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) > { > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return false; > +} > + > void activate_page(struct page *page) > { > struct zone *zone = page_zone(page); > @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) > */ > int lru_add_drain_all(void) > { > - return schedule_on_each_cpu(lru_add_drain_per_cpu); > + cpumask_var_t mask; > + int cpu, rc; > + > + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) > + return -ENOMEM; Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all() is dangerous and undesirable. I took a quick look at all the callsites and didn't immediately see a bug, but it's hard because they're splattered all over the place. It would be far better if we were to not do this. 
Rather than tossing this hang-grenade in there we should at a reluctant minimum change lru_add_drain_all() to take a gfp_t argument and then carefully review and update the callers. > + cpumask_clear(mask); > + > + /* > + * Figure out which cpus need flushing. It's OK if we race > + * with changes to the per-cpu lru pvecs, since it's no worse > + * than if we flushed all cpus, since a cpu could still end > + * up putting pages back on its pvec before we returned. > + * And this avoids interrupting other cpus unnecessarily. > + */ > + for_each_online_cpu(cpu) { > + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > + need_activate_page_drain(cpu)) > + cpumask_set_cpu(cpu, mask); > + } > + > + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); And it seems pretty easy to avoid the allocation. Create a single cpumask at boot (or, preferably, at compile-time) and whenever we add a page to a drainable pagevec, do cpumask_set_cpu(smp_processor_id(), global_cpumask); and to avoid needlessly dirtying a cacheline, if (!cpu_isset(smp_processor_id(), global_cpumask)) cpumask_set_cpu(smp_processor_id(), global_cpumask); This means that lru_add_drain_all() will need to clear the mask at some point and atomicity issues arise. It would be better to do the clearing within schedule_on_cpu_mask() itself, using cpumask_test_and_clear_cpu(). Also, what's up with the get_online_cpus() handling? schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and the schedule_on_cpu_mask() documentation forgot to mention it. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-12 21:05 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-12 21:05 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 7 Aug 2013 16:52:22 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. > > ... > > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) > pagevec_lru_move_fn(pvec, __activate_page, NULL); > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > +} > + > void activate_page(struct page *page) > { > if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { > @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) > { > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return false; > +} > + > void activate_page(struct page *page) > { > struct zone *zone = page_zone(page); > @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) > */ > int lru_add_drain_all(void) > { > - return schedule_on_each_cpu(lru_add_drain_per_cpu); > + cpumask_var_t mask; > + int cpu, rc; > + > + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) > + return -ENOMEM; Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all() is dangerous and undesirable. I took a quick look at all the callsites and didn't immediately see a bug, but it's hard because they're splattered all over the place. It would be far better if we were to not do this. 
Rather than tossing this hang-grenade in there we should at a reluctant minimum change lru_add_drain_all() to take a gfp_t argument and then carefully review and update the callers. > + cpumask_clear(mask); > + > + /* > + * Figure out which cpus need flushing. It's OK if we race > + * with changes to the per-cpu lru pvecs, since it's no worse > + * than if we flushed all cpus, since a cpu could still end > + * up putting pages back on its pvec before we returned. > + * And this avoids interrupting other cpus unnecessarily. > + */ > + for_each_online_cpu(cpu) { > + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > + need_activate_page_drain(cpu)) > + cpumask_set_cpu(cpu, mask); > + } > + > + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); And it seems pretty easy to avoid the allocation. Create a single cpumask at boot (or, preferably, at compile-time) and whenever we add a page to a drainable pagevec, do cpumask_set_cpu(smp_processor_id(), global_cpumask); and to avoid needlessly dirtying a cacheline, if (!cpu_isset(smp_processor_id(), global_cpumask)) cpumask_set_cpu(smp_processor_id(), global_cpumask); This means that lru_add_drain_all() will need to clear the mask at some point and atomicity issues arise. It would be better to do the clearing within schedule_on_cpu_mask() itself, using cpumask_test_and_clear_cpu(). Also, what's up with the get_online_cpus() handling? schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and the schedule_on_cpu_mask() documentation forgot to mention it. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-12 21:05 ` Andrew Morton @ 2013-08-13 1:53 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 1:53 UTC (permalink / raw) To: Andrew Morton Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/12/2013 5:05 PM, Andrew Morton wrote: > On Wed, 7 Aug 2013 16:52:22 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> This change makes lru_add_drain_all() only selectively interrupt >> the cpus that have per-cpu free pages that can be drained. >> >> This is important in nohz mode where calling mlockall(), for >> example, otherwise will interrupt every core unnecessarily. >> >> ... >> >> --- a/mm/swap.c >> +++ b/mm/swap.c >> @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) >> pagevec_lru_move_fn(pvec, __activate_page, NULL); >> } >> >> +static bool need_activate_page_drain(int cpu) >> +{ >> + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; >> +} >> + >> void activate_page(struct page *page) >> { >> if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { >> @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) >> { >> } >> >> +static bool need_activate_page_drain(int cpu) >> +{ >> + return false; >> +} >> + >> void activate_page(struct page *page) >> { >> struct zone *zone = page_zone(page); >> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) >> */ >> int lru_add_drain_all(void) >> { >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); >> + cpumask_var_t mask; >> + int cpu, rc; >> + >> + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) >> + return -ENOMEM; > Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all() > is dangerous and undesirable. I took a quick look at all the callsites > and didn't immediately see a bug, but it's hard because they're > splattered all over the place. 
It would be far better if we were to > not do this. I think it should be safe, given that we already did alloc_percpu() to do schedule_on_each_cpu(), and that is documented as doing GFP_KERNEL allocation (pcpu_create_chunk will call alloc_pages with GFP_KERNEL). > Rather than tossing this hang-grenade in there we should at a reluctant > minimum change lru_add_drain_all() to take a gfp_t argument and then > carefully review and update the callers. > >> + cpumask_clear(mask); >> + >> + /* >> + * Figure out which cpus need flushing. It's OK if we race >> + * with changes to the per-cpu lru pvecs, since it's no worse >> + * than if we flushed all cpus, since a cpu could still end >> + * up putting pages back on its pvec before we returned. >> + * And this avoids interrupting other cpus unnecessarily. >> + */ >> + for_each_online_cpu(cpu) { >> + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || >> + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || >> + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || >> + need_activate_page_drain(cpu)) >> + cpumask_set_cpu(cpu, mask); >> + } >> + >> + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); > And it seems pretty easy to avoid the allocation. Create a single > cpumask at boot (or, preferably, at compile-time) and whenever we add a > page to a drainable pagevec, do > > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > and to avoid needlessly dirtying a cacheline, > > if (!cpu_isset(smp_processor_id(), global_cpumask)) > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > > This means that lru_add_drain_all() will need to clear the mask at some > point and atomicity issues arise. It would be better to do the > clearing within schedule_on_cpu_mask() itself, using > cpumask_test_and_clear_cpu(). 
The atomicity issue isn't that big a deal (given that the drain is racy anyway, you just need to make sure to do cpumask_set_cpu after the pagevec_add), but you do need to clear the cpumask before doing the actual drain, and that either means inflicting that semantics on schedule_on_cpu_mask(), which seems a little unnatural, or else doing a copy of the mask, which gets us back to where we started with GFP_KERNEL allocations. Alternately, you could imagine a workqueue API that just took a function pointer that returned for each cpu whether or not to schedule work on that cpu: typedef bool (*check_work_func_t)(void *data, int cpu); schedule_on_some_cpus(work_func_t func, check_work_func_t checker, void *data); For the lru stuff we wouldn't need to use a "data" pointer but I'd include it since it's cheap, pretty standard, and makes the API more general. Or, I suppose, one other possibility is just a compile-time struct cpumask that we guard with a lock. It seems a bit like overkill for the very common case of not specifying CPUMASK_OFFSTACK. All that said, I still tend to like the simple cpumask data-driven approach, assuming you're comfortable with the possible GFP_KERNEL allocation. > Also, what's up with the get_online_cpus() handling? > schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and > the schedule_on_cpu_mask() documentation forgot to mention it. The missing get_online_cpus() for lru_add_drain_all() is in v6 of the patch from Aug 9 (v5 had Tejun's feedback for doing validity-checking on the schedule_on_cpu_mask() mask argument, and v6 added his Ack and the missing get/put_online_cpus). schedule_on_each_cpu() obviously uses get/put_online_cpus and needs it; I would argue that there's no need to mention it in the docs for schedule_on_cpu_mask() since if you're going to pass the cpu_online_mask you'd better know that you should get/put_online_cpus(). -- Chris Metcalf, Tilera Corp. 
http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 1:53 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 1:53 UTC (permalink / raw) To: Andrew Morton Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/12/2013 5:05 PM, Andrew Morton wrote: > On Wed, 7 Aug 2013 16:52:22 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> This change makes lru_add_drain_all() only selectively interrupt >> the cpus that have per-cpu free pages that can be drained. >> >> This is important in nohz mode where calling mlockall(), for >> example, otherwise will interrupt every core unnecessarily. >> >> ... >> >> --- a/mm/swap.c >> +++ b/mm/swap.c >> @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) >> pagevec_lru_move_fn(pvec, __activate_page, NULL); >> } >> >> +static bool need_activate_page_drain(int cpu) >> +{ >> + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; >> +} >> + >> void activate_page(struct page *page) >> { >> if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { >> @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) >> { >> } >> >> +static bool need_activate_page_drain(int cpu) >> +{ >> + return false; >> +} >> + >> void activate_page(struct page *page) >> { >> struct zone *zone = page_zone(page); >> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) >> */ >> int lru_add_drain_all(void) >> { >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); >> + cpumask_var_t mask; >> + int cpu, rc; >> + >> + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) >> + return -ENOMEM; > Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all() > is dangerous and undesirable. I took a quick look at all the callsites > and didn't immediately see a bug, but it's hard because they're > splattered all over the place. It would be far better if we were to > not do this. 
I think it should be safe, given that we already did alloc_percpu() to do schedule_on_each_cpu(), and that is documented as doing GFP_KERNEL allocation (pcpu_create_chunk will call alloc_pages with GFP_KERNEL). > Rather than tossing this hang-grenade in there we should at a reluctant > minimum change lru_add_drain_all() to take a gfp_t argument and then > carefully review and update the callers. > >> + cpumask_clear(mask); >> + >> + /* >> + * Figure out which cpus need flushing. It's OK if we race >> + * with changes to the per-cpu lru pvecs, since it's no worse >> + * than if we flushed all cpus, since a cpu could still end >> + * up putting pages back on its pvec before we returned. >> + * And this avoids interrupting other cpus unnecessarily. >> + */ >> + for_each_online_cpu(cpu) { >> + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || >> + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || >> + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || >> + need_activate_page_drain(cpu)) >> + cpumask_set_cpu(cpu, mask); >> + } >> + >> + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); > And it seems pretty easy to avoid the allocation. Create a single > cpumask at boot (or, preferably, at compile-time) and whenever we add a > page to a drainable pagevec, do > > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > and to avoid needlessly dirtying a cacheline, > > if (!cpu_isset(smp_processor_id(), global_cpumask)) > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > > This means that lru_add_drain_all() will need to clear the mask at some > point and atomicity issues arise. It would be better to do the > clearing within schedule_on_cpu_mask() itself, using > cpumask_test_and_clear_cpu(). 
The atomicity issue isn't that big a deal (given that the drain is racy anyway, you just need to make sure to do cpumask_set_cpu after the pagevec_add), but you do need to clear the cpumask before doing the actual drain, and that either means inflicting that semantics on schedule_on_cpu_mask(), which seems a little unnatural, or else doing a copy of the mask, which gets us back to where we started with GFP_KERNEL allocations. Alternately, you could imagine a workqueue API that just took a function pointer that returned for each cpu whether or not to schedule work on that cpu: typedef bool (*check_work_func_t)(void *data, int cpu); schedule_on_some_cpus(work_func_t func, check_work_func_t checker, void *data); For the lru stuff we wouldn't need to use a "data" pointer but I'd include it since it's cheap, pretty standard, and makes the API more general. Or, I suppose, one other possibility is just a compile-time struct cpumask that we guard with a lock. It seems a bit like overkill for the very common case of not specifying CPUMASK_OFFSTACK. All that said, I still tend to like the simple cpumask data-driven approach, assuming you're comfortable with the possible GFP_KERNEL allocation. > Also, what's up with the get_online_cpus() handling? > schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and > the schedule_on_cpu_mask() documentation forgot to mention it. The missing get_online_cpus() for lru_add_drain_all() is in v6 of the patch from Aug 9 (v5 had Tejun's feedback for doing validity-checking on the schedule_on_cpu_mask() mask argument, and v6 added his Ack and the missing get/put_online_cpus). schedule_on_each_cpu() obviously uses get/put_online_cpus and needs it; I would argue that there's no need to mention it in the docs for schedule_on_cpu_mask() since if you're going to pass the cpu_online_mask you'd better know that you should get/put_online_cpus(). -- Chris Metcalf, Tilera Corp. 
http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 1:53 ` Chris Metcalf @ 2013-08-13 19:35 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 19:35 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Mon, 12 Aug 2013 21:53:11 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > On 8/12/2013 5:05 PM, Andrew Morton wrote: > >> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) > >> */ > >> int lru_add_drain_all(void) > >> { > >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); > >> + cpumask_var_t mask; > >> + int cpu, rc; > >> + > >> + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) > >> + return -ENOMEM; > > Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all() > > is dangerous and undesirable. I took a quick look at all the callsites > > and didn't immediately see a bug, but it's hard because they're > > splattered all over the place. It would be far better if we were to > > not do this. > > I think it should be safe, given that we already did alloc_percpu() to do > schedule_on_each_cpu(), and that is documented as doing GFP_KERNEL allocation > (pcpu_create_chunk will call alloc_pages with GFP_KERNEL). urgh, OK. The allocation of the `struct work's is unavoidable, I guess. However it's a bit sad to allocate one per CPU when only a small number (even zero) or the CPUs actually need it. It might be better to kmalloc these things individually. That will often reduce memory allocations and because the work can be freed by the handler it opens the possibility of an asynchronous schedule_on_cpus(), should that be desired. I don't know how lots-of-kmallocs compares with alloc_percpu() performance-wise. 
That being said, the `cpumask_var_t mask' which was added to lru_add_drain_all() is unneeded - it's just a temporary storage which can be eliminated by creating a schedule_on_each_cpu_cond() or whatever which is passed a function pointer of type `bool (*call_needed)(int cpu, void *data)'. For lru_add_drain_all(), the callback function tests the pagevec_counts. For schedule_on_each_cpu(), `data' points at cpu_online_mask. > > Rather than tossing this hang-grenade in there we should at a reluctant > > minimum change lru_add_drain_all() to take a gfp_t argument and then > > carefully review and update the callers. > > > >> + cpumask_clear(mask); > >> + > >> + /* > >> + * Figure out which cpus need flushing. It's OK if we race > >> + * with changes to the per-cpu lru pvecs, since it's no worse > >> + * than if we flushed all cpus, since a cpu could still end > >> + * up putting pages back on its pvec before we returned. > >> + * And this avoids interrupting other cpus unnecessarily. > >> + */ > >> + for_each_online_cpu(cpu) { > >> + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > >> + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > >> + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > >> + need_activate_page_drain(cpu)) > >> + cpumask_set_cpu(cpu, mask); > >> + } > >> + > >> + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); > > And it seems pretty easy to avoid the allocation. Create a single > > cpumask at boot (or, preferably, at compile-time) and whenever we add a > > page to a drainable pagevec, do > > > > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > > > and to avoid needlessly dirtying a cacheline, > > > > if (!cpu_isset(smp_processor_id(), global_cpumask)) > > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > > > > > This means that lru_add_drain_all() will need to clear the mask at some > > point and atomicity issues arise. 
It would be better to do the > > clearing within schedule_on_cpu_mask() itself, using > > cpumask_test_and_clear_cpu(). > > The atomicity issue isn't that big a deal (given that the drain is > racy anyway, you just need to make sure to do cpumask_set_cpu after > the pagevec_add), but you do need to clear the cpumask before doing > the actual drain, and that either means inflicting that semantics > on schedule_on_cpu_mask(), which seems a little unnatural, or else > doing a copy of the mask, which gets us back to where we started > with GFP_KERNEL allocations. The drain is kinda racy, but there are approaches we can use... The semantics we should aim for are "all pages which were in pagevecs when lru_add_drain_all() was called will be drained". And "any pages which are added to pagevecs after the call to lru_add_drain_all() should be drained by either this call or by the next call". ie: lru_add_drain_all() should not cause any concurrently-added pages to just be lost to the next lru_add_drain_all() > Alternately, you could imagine a workqueue API that just took a function > pointer that returned for each cpu whether or not to schedule work on > that cpu: > > typedef bool (*check_work_func_t)(void *data, int cpu); > schedule_on_some_cpus(work_func_t func, check_work_func_t checker, void *data); > > For the lru stuff we wouldn't need to use a "data" pointer but I'd include > it since it's cheap, pretty standard, and makes the API more general. uh, yes, that. I do think this implementation and interface is better than adding the temporary cpumask. > Or, I suppose, one other possibility is just a compile-time struct > cpumask that we guard with a lock. It seems a bit like overkill for > the very common case of not specifying CPUMASK_OFFSTACK. > > All that said, I still tend to like the simple cpumask data-driven approach, > assuming you're comfortable with the possible GFP_KERNEL allocation. > > > Also, what's up with the get_online_cpus() handling? 
> > schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and > > the schedule_on_cpu_mask() documentation forgot to mention it. > > The missing get_online_cpus() for lru_add_drain_all() is in v6 of the > patch from Aug 9 (v5 had Tejun's feedback for doing validity-checking > on the schedule_on_cpu_mask() mask argument, and v6 added his Ack > and the missing get/put_online_cpus). Well yes, I noticed this after sending the earlier email. He may be old and wrinkly, but I do suggest that the guy who wrote and maintains that code could have got a cc. Also am unclear why Tejun went and committed the code to -next without cc'ing me. > schedule_on_each_cpu() obviously uses get/put_online_cpus and needs it; > I would argue that there's no need to mention it in the docs for > schedule_on_cpu_mask() since if you're going to pass the cpu_online_mask > you'd better know that you should get/put_online_cpus(). ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 19:35 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 19:35 UTC (permalink / raw) To: Chris Metcalf Cc: linux-kernel, linux-mm, Tejun Heo, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Mon, 12 Aug 2013 21:53:11 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > On 8/12/2013 5:05 PM, Andrew Morton wrote: > >> @@ -683,7 +693,32 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) > >> */ > >> int lru_add_drain_all(void) > >> { > >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); > >> + cpumask_var_t mask; > >> + int cpu, rc; > >> + > >> + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) > >> + return -ENOMEM; > > Newly adding a GFP_KERNEL allocation attempt into lru_add_drain_all() > > is dangerous and undesirable. I took a quick look at all the callsites > > and didn't immediately see a bug, but it's hard because they're > > splattered all over the place. It would be far better if we were to > > not do this. > > I think it should be safe, given that we already did alloc_percpu() to do > schedule_on_each_cpu(), and that is documented as doing GFP_KERNEL allocation > (pcpu_create_chunk will call alloc_pages with GFP_KERNEL). urgh, OK. The allocation of the `struct work's is unavoidable, I guess. However it's a bit sad to allocate one per CPU when only a small number (even zero) of the CPUs actually need it. It might be better to kmalloc these things individually. That will often reduce memory allocations and because the work can be freed by the handler it opens the possibility of an asynchronous schedule_on_cpus(), should that be desired. I don't know how lots-of-kmallocs compares with alloc_percpu() performance-wise.
That being said, the `cpumask_var_t mask' which was added to lru_add_drain_all() is unneeded - it's just a temporary storage which can be eliminated by creating a schedule_on_each_cpu_cond() or whatever which is passed a function pointer of type `bool (*call_needed)(int cpu, void *data)'. For lru_add_drain_all(), the callback function tests the pagevec_counts. For schedule_on_each_cpu(), `data' points at cpu_online_mask. > > Rather than tossing this hang-grenade in there we should at a reluctant > > minimum change lru_add_drain_all() to take a gfp_t argument and then > > carefully review and update the callers. > > > >> + cpumask_clear(mask); > >> + > >> + /* > >> + * Figure out which cpus need flushing. It's OK if we race > >> + * with changes to the per-cpu lru pvecs, since it's no worse > >> + * than if we flushed all cpus, since a cpu could still end > >> + * up putting pages back on its pvec before we returned. > >> + * And this avoids interrupting other cpus unnecessarily. > >> + */ > >> + for_each_online_cpu(cpu) { > >> + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > >> + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > >> + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > >> + need_activate_page_drain(cpu)) > >> + cpumask_set_cpu(cpu, mask); > >> + } > >> + > >> + rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, mask); > > And it seems pretty easy to avoid the allocation. Create a single > > cpumask at boot (or, preferably, at compile-time) and whenever we add a > > page to a drainable pagevec, do > > > > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > > > and to avoid needlessly dirtying a cacheline, > > > > if (!cpu_isset(smp_processor_id(), global_cpumask)) > > cpumask_set_cpu(smp_processor_id(), global_cpumask); > > > > > > This means that lru_add_drain_all() will need to clear the mask at some > > point and atomicity issues arise. 
It would be better to do the > > clearing within schedule_on_cpu_mask() itself, using > > cpumask_test_and_clear_cpu(). > > The atomicity issue isn't that big a deal (given that the drain is > racy anyway, you just need to make sure to do cpumask_set_cpu after > the pagevec_add), but you do need to clear the cpumask before doing > the actual drain, and that either means inflicting that semantics > on schedule_on_cpu_mask(), which seems a little unnatural, or else > doing a copy of the mask, which gets us back to where we started > with GFP_KERNEL allocations. The drain is kinda racy, but there are approaches we can use... The semantics we should aim for are "all pages which were in pagevecs when lru_add_drain_all() was called will be drained". And "any pages which are added to pagevecs after the call to lru_add_drain_all() should be drained by either this call or by the next call". ie: lru_add_drain_all() should not cause any concurrently-added pages to just be lost to the next lru_add_drain_all() > Alternately, you could imagine a workqueue API that just took a function > pointer that returned for each cpu whether or not to schedule work on > that cpu: > > typedef bool (*check_work_func_t)(void *data, int cpu); > schedule_on_some_cpus(work_func_t func, check_work_func_t checker, void *data); > > For the lru stuff we wouldn't need to use a "data" pointer but I'd include > it since it's cheap, pretty standard, and makes the API more general. uh, yes, that. I do think this implementation and interface is better than adding the temporary cpumask. > Or, I suppose, one other possibility is just a compile-time struct > cpumask that we guard with a lock. It seems a bit like overkill for > the very common case of not specifying CPUMASK_OFFSTACK. > > All that said, I still tend to like the simple cpumask data-driven approach, > assuming you're comfortable with the possible GFP_KERNEL allocation. > > > Also, what's up with the get_online_cpus() handling? 
> > schedule_on_each_cpu() does it, lru_add_drain_all() does not do it and > > the schedule_on_cpu_mask() documentation forgot to mention it. > > The missing get_online_cpus() for lru_add_drain_all() is in v6 of the > patch from Aug 9 (v5 had Tejun's feedback for doing validity-checking > on the schedule_on_cpu_mask() mask argument, and v6 added his Ack > and the missing get/put_online_cpus). Well yes, I noticed this after sending the earlier email. He may be old and wrinkly, but I do suggest that the guy who wrote and maintains that code could have got a cc. Also am unclear why Tejun went and committed the code to -next without cc'ing me. > schedule_on_each_cpu() obviously uses get/put_online_cpus and needs it; > I would argue that there's no need to mention it in the docs for > schedule_on_cpu_mask() since if you're going to pass the cpu_online_mask > you'd better know that you should get/put_online_cpus(). -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 19:35 ` Andrew Morton @ 2013-08-13 20:19 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 20:19 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote: > I don't know how lots-of-kmallocs compares with alloc_percpu() > performance-wise. If this is actually performance sensitive, the logical thing to do would be pre-allocating per-cpu buffers instead of depending on dynamic allocation. Do the invocations need to be stackable? > That being said, the `cpumask_var_t mask' which was added to > lru_add_drain_all() is unneeded - it's just a temporary storage which > can be eliminated by creating a schedule_on_each_cpu_cond() or whatever > which is passed a function pointer of type `bool (*call_needed)(int > cpu, void *data)'. I'd really like to avoid that. Decision callbacks tend to get abused quite often and it's rather sad to do that because cpumask cannot be prepared and passed around. Can't it just preallocate all necessary resources? Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 20:19 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 20:19 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote: > I don't know how lots-of-kmallocs compares with alloc_percpu() > performance-wise. If this is actually performance sensitive, the logical thing to do would be pre-allocating per-cpu buffers instead of depending on dynamic allocation. Do the invocations need to be stackable? > That being said, the `cpumask_var_t mask' which was added to > lru_add_drain_all() is unneeded - it's just a temporary storage which > can be eliminated by creating a schedule_on_each_cpu_cond() or whatever > which is passed a function pointer of type `bool (*call_needed)(int > cpu, void *data)'. I'd really like to avoid that. Decision callbacks tend to get abused quite often and it's rather sad to do that because cpumask cannot be prepared and passed around. Can't it just preallocate all necessary resources? Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 20:19 ` Tejun Heo @ 2013-08-13 20:31 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 20:31 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 16:19:58 -0400 Tejun Heo <tj@kernel.org> wrote: > Hello, > > On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote: > > I don't know how lots-of-kmallocs compares with alloc_percpu() > > performance-wise. > > If this is actually performance sensitive, I've always assumed that it isn't performance-sensitive. schedule_on_each_cpu() has to be slow as a dog. Then again, why does this patchset exist? It's a performance optimisation so presumably someone cares. But not enough to perform actual measurements :( > the logical thing to do > would be pre-allocating per-cpu buffers instead of depending on > dynamic allocation. Do the invocations need to be stackable? schedule_on_each_cpu() calls should of course happen concurrently, and there's the question of whether we wish to permit async schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until everyone has finished is pretty sad if the caller doesn't want that. > > That being said, the `cpumask_var_t mask' which was added to > > lru_add_drain_all() is unneeded - it's just a temporary storage which > > can be eliminated by creating a schedule_on_each_cpu_cond() or whatever > > which is passed a function pointer of type `bool (*call_needed)(int > > cpu, void *data)'. > > I'd really like to avoid that. Decision callbacks tend to get abused > quite often and it's rather sad to do that because cpumask cannot be > prepared and passed around. Can't it just preallocate all necessary > resources? I don't recall seeing such abuse.
It's a very common and powerful tool, and not implementing it because some dummy may abuse it weakens the API for all non-dummies. That allocation is simply unneeded. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 20:31 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 20:31 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 16:19:58 -0400 Tejun Heo <tj@kernel.org> wrote: > Hello, > > On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote: > > I don't know how lots-of-kmallocs compares with alloc_percpu() > > performance-wise. > > If this is actually performance sensitive, I've always assumed that it isn't performance-sensitive. schedule_on_each_cpu() has to be slow as a dog. Then again, why does this patchset exist? It's a performance optimisation so presumably someone cares. But not enough to perform actual measurements :( > the logical thing to do > would be pre-allocating per-cpu buffers instead of depending on > dynamic allocation. Do the invocations need to be stackable? schedule_on_each_cpu() calls should of course happen concurrently, and there's the question of whether we wish to permit async schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until everyone has finished is pretty sad if the caller doesn't want that. > > That being said, the `cpumask_var_t mask' which was added to > > lru_add_drain_all() is unneeded - it's just a temporary storage which > > can be eliminated by creating a schedule_on_each_cpu_cond() or whatever > > which is passed a function pointer of type `bool (*call_needed)(int > > cpu, void *data)'. > > I'd really like to avoid that. Decision callbacks tend to get abused > quite often and it's rather sad to do that because cpumask cannot be > prepared and passed around. Can't it just preallocate all necessary > resources? I don't recall seeing such abuse. It's a very common and powerful > tool, and not implementing it because some dummy may abuse it weakens > the API for all non-dummies.
That allocation is simply unneeded. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 20:31 ` Andrew Morton @ 2013-08-13 20:59 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 20:59 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 3:35 PM, Andrew Morton wrote: > He may be > old and wrinkly, but I do suggest that the guy who wrote and maintains > that code could have got a cc. Sorry about that - I just went by what MAINTAINERS shows. There's no specific maintainer listed for the swap code. I probably should have looked at the final Signed-off-by's on recent commits. On 8/13/2013 4:31 PM, Andrew Morton wrote: > On Tue, 13 Aug 2013 16:19:58 -0400 Tejun Heo <tj@kernel.org> wrote: > >> Hello, >> >> On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote: >>> I don't know how lots-of-kmallocs compares with alloc_percpu() >>> performance-wise. >> If this is actually performance sensitive, > I've always assumed that it isn't performance-sensitive. > schedule_on_each_cpu() has to be slow as a dog. > > Then again, why does this patchset exist? It's a performance > optimisation so presumably someone cares. But not enough to perform > actual measurements :( The patchset exists because of the difference between zero overhead on cpus that don't have drainable lrus, and non-zero overhead. This turns out to be important on workloads where nohz cores are handling 10 Gb traffic in userspace and really, really don't want to be interrupted, or they drop packets on the floor. >> the logical thing to do >> would be pre-allocating per-cpu buffers instead of depending on >> dynamic allocation. Do the invocations need to be stackable? > schedule_on_each_cpu() calls should if course happen concurrently, and > there's the question of whether we wish to permit async > schedule_on_each_cpu(). 
Leaving the calling CPU twiddling thumbs until > everyone has finished is pretty sad if the caller doesn't want that. > >>> That being said, the `cpumask_var_t mask' which was added to >>> lru_add_drain_all() is unneeded - it's just a temporary storage which >>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever >>> which is passed a function pointer of type `bool (*call_needed)(int >>> cpu, void *data)'. >> I'd really like to avoid that. Decision callbacks tend to get abused >> quite often and it's rather sad to do that because cpumask cannot be >> prepared and passed around. Can't it just preallocate all necessary >> resources? > I don't recall seeing such abuse. It's a very common and powerful > tool, and not implementing it because some dummy may abuse it weakens > the API for all non-dummies. That allocation is simply unneeded. The problem with a callback version is that it's not clear that it helps with Andrew's original concern about allocation. In schedule_on_each_cpu() we need to track which cpus we scheduled work on so that we can flush_work() after all the work has been scheduled. Even with a callback approach, we'd still end up wanting to record the results of the callback in the first pass so that we could properly flush_work() on the second pass. Given that, having the caller just create the cpumask in the first place makes more sense. As Andrew suggests, we could also just have an asynchronous version of schedule_on_each_cpu(), but I don't know if that's beneficial enough to the swap code to make it worthwhile, or if it's tricky enough on the workqueue side to make it not worthwhile; it does seem like we would need to rethink the work_struct allocation, and e.g. avoid re-issuing the flush to a cpu that hadn't finished the previous flush, etc. Potentially tricky, particularly if lru_add_drain_all() doesn't care about performance in the first place. -- Chris Metcalf, Tilera Corp. 
http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 20:59 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 20:59 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 3:35 PM, Andrew Morton wrote: > He may be > old and wrinkly, but I do suggest that the guy who wrote and maintains > that code could have got a cc. Sorry about that - I just went by what MAINTAINERS shows. There's no specific maintainer listed for the swap code. I probably should have looked at the final Signed-off-by's on recent commits. On 8/13/2013 4:31 PM, Andrew Morton wrote: > On Tue, 13 Aug 2013 16:19:58 -0400 Tejun Heo <tj@kernel.org> wrote: > >> Hello, >> >> On Tue, Aug 13, 2013 at 12:35:12PM -0700, Andrew Morton wrote: >>> I don't know how lots-of-kmallocs compares with alloc_percpu() >>> performance-wise. >> If this is actually performance sensitive, > I've always assumed that it isn't performance-sensitive. > schedule_on_each_cpu() has to be slow as a dog. > > Then again, why does this patchset exist? It's a performance > optimisation so presumably someone cares. But not enough to perform > actual measurements :( The patchset exists because of the difference between zero overhead on cpus that don't have drainable lrus, and non-zero overhead. This turns out to be important on workloads where nohz cores are handling 10 Gb traffic in userspace and really, really don't want to be interrupted, or they drop packets on the floor. >> the logical thing to do >> would be pre-allocating per-cpu buffers instead of depending on >> dynamic allocation. Do the invocations need to be stackable? > schedule_on_each_cpu() calls should if course happen concurrently, and > there's the question of whether we wish to permit async > schedule_on_each_cpu(). 
Leaving the calling CPU twiddling thumbs until > everyone has finished is pretty sad if the caller doesn't want that. > >>> That being said, the `cpumask_var_t mask' which was added to >>> lru_add_drain_all() is unneeded - it's just a temporary storage which >>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever >>> which is passed a function pointer of type `bool (*call_needed)(int >>> cpu, void *data)'. >> I'd really like to avoid that. Decision callbacks tend to get abused >> quite often and it's rather sad to do that because cpumask cannot be >> prepared and passed around. Can't it just preallocate all necessary >> resources? > I don't recall seeing such abuse. It's a very common and powerful > tool, and not implementing it because some dummy may abuse it weakens > the API for all non-dummies. That allocation is simply unneeded. The problem with a callback version is that it's not clear that it helps with Andrew's original concern about allocation. In schedule_on_each_cpu() we need to track which cpus we scheduled work on so that we can flush_work() after all the work has been scheduled. Even with a callback approach, we'd still end up wanting to record the results of the callback in the first pass so that we could properly flush_work() on the second pass. Given that, having the caller just create the cpumask in the first place makes more sense. As Andrew suggests, we could also just have an asynchronous version of schedule_on_each_cpu(), but I don't know if that's beneficial enough to the swap code to make it worthwhile, or if it's tricky enough on the workqueue side to make it not worthwhile; it does seem like we would need to rethink the work_struct allocation, and e.g. avoid re-issuing the flush to a cpu that hadn't finished the previous flush, etc. Potentially tricky, particularly if lru_add_drain_all() doesn't care about performance in the first place. -- Chris Metcalf, Tilera Corp. 
http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 20:59 ` Chris Metcalf @ 2013-08-13 21:13 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 21:13 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > > > > Then again, why does this patchset exist? It's a performance > > optimisation so presumably someone cares. But not enough to perform > > actual measurements :( > > The patchset exists because of the difference between zero overhead on > cpus that don't have drainable lrus, and non-zero overhead. This turns > out to be important on workloads where nohz cores are handling 10 Gb > traffic in userspace and really, really don't want to be interrupted, > or they drop packets on the floor. But what is the effect of the patchset? Has it been tested against the problematic workload(s)? > >> the logical thing to do > >> would be pre-allocating per-cpu buffers instead of depending on > >> dynamic allocation. Do the invocations need to be stackable? > > schedule_on_each_cpu() calls should if course happen concurrently, and > > there's the question of whether we wish to permit async > > schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until > > everyone has finished is pretty sad if the caller doesn't want that. > > > >>> That being said, the `cpumask_var_t mask' which was added to > >>> lru_add_drain_all() is unneeded - it's just a temporary storage which > >>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever > >>> which is passed a function pointer of type `bool (*call_needed)(int > >>> cpu, void *data)'. > >> I'd really like to avoid that. Decision callbacks tend to get abused > >> quite often and it's rather sad to do that because cpumask cannot be > >> prepared and passed around. 
Can't it just preallocate all necessary > >> resources? > > I don't recall seeing such abuse. It's a very common and powerful > > tool, and not implementing it because some dummy may abuse it weakens > > the API for all non-dummies. That allocation is simply unneeded. > > The problem with a callback version is that it's not clear that > it helps with Andrew's original concern about allocation. In > schedule_on_each_cpu() we need to track which cpus we scheduled work > on so that we can flush_work() after all the work has been scheduled. > Even with a callback approach, we'd still end up wanting to record > the results of the callback in the first pass so that we could > properly flush_work() on the second pass. Given that, having the > caller just create the cpumask in the first place makes more sense. Nope. schedule_on_each_cpu() can just continue to do for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); lru_add_drain_all() can do that as well. An optimisation would be to tag the unused works as not-needing-flush. Set work.entry,next to NULL, for example. If we were to switch from alloc_per_cpu() to bunch-of-kmallocs then they'd need to be assembled into a list which is pretty trivial. > As Andrew suggests, we could also just have an asynchronous version > of schedule_on_each_cpu(), but I don't know if that's beneficial > enough to the swap code to make it worthwhile, or if it's tricky > enough on the workqueue side to make it not worthwhile; it does seem > like we would need to rethink the work_struct allocation, and > e.g. avoid re-issuing the flush to a cpu that hadn't finished the > previous flush, etc. Potentially tricky, particularly if > lru_add_drain_all() doesn't care about performance in the first place. lru_add_drain_all() wants synchronous behavior. I don't know how much call there would be for an async schedule_on_each_cpu_cond(). ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 21:13 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 21:13 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > > > > Then again, why does this patchset exist? It's a performance > > optimisation so presumably someone cares. But not enough to perform > > actual measurements :( > > The patchset exists because of the difference between zero overhead on > cpus that don't have drainable lrus, and non-zero overhead. This turns > out to be important on workloads where nohz cores are handling 10 Gb > traffic in userspace and really, really don't want to be interrupted, > or they drop packets on the floor. But what is the effect of the patchset? Has it been tested against the problematic workload(s)? > >> the logical thing to do > >> would be pre-allocating per-cpu buffers instead of depending on > >> dynamic allocation. Do the invocations need to be stackable? > > schedule_on_each_cpu() calls should if course happen concurrently, and > > there's the question of whether we wish to permit async > > schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until > > everyone has finished is pretty sad if the caller doesn't want that. > > > >>> That being said, the `cpumask_var_t mask' which was added to > >>> lru_add_drain_all() is unneeded - it's just a temporary storage which > >>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever > >>> which is passed a function pointer of type `bool (*call_needed)(int > >>> cpu, void *data)'. > >> I'd really like to avoid that. Decision callbacks tend to get abused > >> quite often and it's rather sad to do that because cpumask cannot be > >> prepared and passed around. Can't it just preallocate all necessary > >> resources? 
> > I don't recall seeing such abuse. It's a very common and powerful > > tool, and not implementing it because some dummy may abuse it weakens > > the API for all non-dummies. That allocation is simply unneeded. > > The problem with a callback version is that it's not clear that > it helps with Andrew's original concern about allocation. In > schedule_on_each_cpu() we need to track which cpus we scheduled work > on so that we can flush_work() after all the work has been scheduled. > Even with a callback approach, we'd still end up wanting to record > the results of the callback in the first pass so that we could > properly flush_work() on the second pass. Given that, having the > caller just create the cpumask in the first place makes more sense. Nope. schedule_on_each_cpu() can just continue to do for_each_cpu(cpu, mask) flush_work(per_cpu_ptr(works, cpu)); lru_add_drain_all() can do that as well. An optimisation would be to tag the unused works as not-needing-flush. Set work.entry,next to NULL, for example. If we were to switch from alloc_per_cpu() to bunch-of-kmallocs then they'd need to be assembled into a list which is pretty trivial. > As Andrew suggests, we could also just have an asynchronous version > of schedule_on_each_cpu(), but I don't know if that's beneficial > enough to the swap code to make it worthwhile, or if it's tricky > enough on the workqueue side to make it not worthwhile; it does seem > like we would need to rethink the work_struct allocation, and > e.g. avoid re-issuing the flush to a cpu that hadn't finished the > previous flush, etc. Potentially tricky, particularly if > lru_add_drain_all() doesn't care about performance in the first place. lru_add_drain_all() wants synchronous behavior. I don't know how much call there would be for an async schedule_on_each_cpu_cond(). -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 21:13 ` Andrew Morton @ 2013-08-13 22:13 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 22:13 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 5:13 PM, Andrew Morton wrote: > On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >>> Then again, why does this patchset exist? It's a performance >>> optimisation so presumably someone cares. But not enough to perform >>> actual measurements :( >> The patchset exists because of the difference between zero overhead on >> cpus that don't have drainable lrus, and non-zero overhead. This turns >> out to be important on workloads where nohz cores are handling 10 Gb >> traffic in userspace and really, really don't want to be interrupted, >> or they drop packets on the floor. > But what is the effect of the patchset? Has it been tested against the > problematic workload(s)? Yes. The result is that syscalls such as mlockall(), which otherwise interrupt every core, don't interrupt the cores that are running purely in userspace. Since they are purely in userspace they don't have any drainable pagevecs, so the patchset means they don't get interrupted and don't drop packets. I implemented this against Linux 2.6.38 and our home-grown version of nohz cpusets back in July 2012, and we have been shipping it to customers since then. >>>> the logical thing to do >>>> would be pre-allocating per-cpu buffers instead of depending on >>>> dynamic allocation. Do the invocations need to be stackable? >>> schedule_on_each_cpu() calls should if course happen concurrently, and >>> there's the question of whether we wish to permit async >>> schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until >>> everyone has finished is pretty sad if the caller doesn't want that. 
>>> >>>>> That being said, the `cpumask_var_t mask' which was added to >>>>> lru_add_drain_all() is unneeded - it's just a temporary storage which >>>>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever >>>>> which is passed a function pointer of type `bool (*call_needed)(int >>>>> cpu, void *data)'. >>>> I'd really like to avoid that. Decision callbacks tend to get abused >>>> quite often and it's rather sad to do that because cpumask cannot be >>>> prepared and passed around. Can't it just preallocate all necessary >>>> resources? >>> I don't recall seeing such abuse. It's a very common and powerful >>> tool, and not implementing it because some dummy may abuse it weakens >>> the API for all non-dummies. That allocation is simply unneeded. >> The problem with a callback version is that it's not clear that >> it helps with Andrew's original concern about allocation. In >> schedule_on_each_cpu() we need to track which cpus we scheduled work >> on so that we can flush_work() after all the work has been scheduled. >> Even with a callback approach, we'd still end up wanting to record >> the results of the callback in the first pass so that we could >> properly flush_work() on the second pass. Given that, having the >> caller just create the cpumask in the first place makes more sense. > Nope. schedule_on_each_cpu() can just continue to do > > for_each_cpu(cpu, mask) > flush_work(per_cpu_ptr(works, cpu)); > > lru_add_drain_all() can do that as well. An optimisation would be to > tag the unused works as not-needing-flush. Set work.entry,next to > NULL, for example. Good point - we already have a per-cpu data structure sitting there handy, so we might as well use it rather than allocating another cpumask. 
>> As Andrew suggests, we could also just have an asynchronous version >> of schedule_on_each_cpu(), but I don't know if that's beneficial >> enough to the swap code to make it worthwhile, or if it's tricky >> enough on the workqueue side to make it not worthwhile; it does seem >> like we would need to rethink the work_struct allocation, and >> e.g. avoid re-issuing the flush to a cpu that hadn't finished the >> previous flush, etc. Potentially tricky, particularly if >> lru_add_drain_all() doesn't care about performance in the first place. > lru_add_drain_all() wants synchronous behavior. I don't know how much > call there would be for an async schedule_on_each_cpu_cond(). Let me work up a patchset using callbacks, and we can look at that as an example and see how it feels for both workqueue.c and swap.c. -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:13 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 22:13 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 5:13 PM, Andrew Morton wrote: > On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >>> Then again, why does this patchset exist? It's a performance >>> optimisation so presumably someone cares. But not enough to perform >>> actual measurements :( >> The patchset exists because of the difference between zero overhead on >> cpus that don't have drainable lrus, and non-zero overhead. This turns >> out to be important on workloads where nohz cores are handling 10 Gb >> traffic in userspace and really, really don't want to be interrupted, >> or they drop packets on the floor. > But what is the effect of the patchset? Has it been tested against the > problematic workload(s)? Yes. The result is that syscalls such as mlockall(), which otherwise interrupt every core, don't interrupt the cores that are running purely in userspace. Since they are purely in userspace they don't have any drainable pagevecs, so the patchset means they don't get interrupted and don't drop packets. I implemented this against Linux 2.6.38 and our home-grown version of nohz cpusets back in July 2012, and we have been shipping it to customers since then. >>>> the logical thing to do >>>> would be pre-allocating per-cpu buffers instead of depending on >>>> dynamic allocation. Do the invocations need to be stackable? >>> schedule_on_each_cpu() calls should if course happen concurrently, and >>> there's the question of whether we wish to permit async >>> schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until >>> everyone has finished is pretty sad if the caller doesn't want that. 
>>> >>>>> That being said, the `cpumask_var_t mask' which was added to >>>>> lru_add_drain_all() is unneeded - it's just a temporary storage which >>>>> can be eliminated by creating a schedule_on_each_cpu_cond() or whatever >>>>> which is passed a function pointer of type `bool (*call_needed)(int >>>>> cpu, void *data)'. >>>> I'd really like to avoid that. Decision callbacks tend to get abused >>>> quite often and it's rather sad to do that because cpumask cannot be >>>> prepared and passed around. Can't it just preallocate all necessary >>>> resources? >>> I don't recall seeing such abuse. It's a very common and powerful >>> tool, and not implementing it because some dummy may abuse it weakens >>> the API for all non-dummies. That allocation is simply unneeded. >> The problem with a callback version is that it's not clear that >> it helps with Andrew's original concern about allocation. In >> schedule_on_each_cpu() we need to track which cpus we scheduled work >> on so that we can flush_work() after all the work has been scheduled. >> Even with a callback approach, we'd still end up wanting to record >> the results of the callback in the first pass so that we could >> properly flush_work() on the second pass. Given that, having the >> caller just create the cpumask in the first place makes more sense. > Nope. schedule_on_each_cpu() can just continue to do > > for_each_cpu(cpu, mask) > flush_work(per_cpu_ptr(works, cpu)); > > lru_add_drain_all() can do that as well. An optimisation would be to > tag the unused works as not-needing-flush. Set work.entry,next to > NULL, for example. Good point - we already have a per-cpu data structure sitting there handy, so we might as well use it rather than allocating another cpumask. 
>> As Andrew suggests, we could also just have an asynchronous version >> of schedule_on_each_cpu(), but I don't know if that's beneficial >> enough to the swap code to make it worthwhile, or if it's tricky >> enough on the workqueue side to make it not worthwhile; it does seem >> like we would need to rethink the work_struct allocation, and >> e.g. avoid re-issuing the flush to a cpu that hadn't finished the >> previous flush, etc. Potentially tricky, particularly if >> lru_add_drain_all() doesn't care about performance in the first place. > lru_add_drain_all() wants synchronous behavior. I don't know how much > call there would be for an async schedule_on_each_cpu_cond(). Let me work up a patchset using callbacks, and we can look at that as an example and see how it feels for both workqueue.c and swap.c. -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:13 ` Chris Metcalf @ 2013-08-13 22:26 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 22:26 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer, KOSAKI Motohiro On Tue, 13 Aug 2013 18:13:48 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > On 8/13/2013 5:13 PM, Andrew Morton wrote: > > On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > > > >>> Then again, why does this patchset exist? It's a performance > >>> optimisation so presumably someone cares. But not enough to perform > >>> actual measurements :( > >> The patchset exists because of the difference between zero overhead on > >> cpus that don't have drainable lrus, and non-zero overhead. This turns > >> out to be important on workloads where nohz cores are handling 10 Gb > >> traffic in userspace and really, really don't want to be interrupted, > >> or they drop packets on the floor. > > But what is the effect of the patchset? Has it been tested against the > > problematic workload(s)? > > Yes. The result is that syscalls such as mlockall(), which otherwise interrupt > every core, don't interrupt the cores that are running purely in userspace. > Since they are purely in userspace they don't have any drainable pagevecs, > so the patchset means they don't get interrupted and don't drop packets. > > I implemented this against Linux 2.6.38 and our home-grown version of nohz > cpusets back in July 2012, and we have been shipping it to customers since then. argh. Those per-cpu LRU pagevecs were a nasty but very effective locking amortization hack back in, umm, 2002. They have caused quite a lot of weird corner-case behaviour, resulting in all the lru_add_drain_all() calls sprinkled around the place. 
I'd like to nuke the whole thing, but that would require a fundamental rethink/rework of all the LRU list locking. According to the 8891d6da17db0f changelog, the lru_add_drain_all() in sys_mlock() isn't really required: "it isn't must. but it reduce the failure of moving to unevictable list. its failure can rescue in vmscan later. but reducing is better." I suspect we could just kill it. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:26 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 22:26 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer, KOSAKI Motohiro On Tue, 13 Aug 2013 18:13:48 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > On 8/13/2013 5:13 PM, Andrew Morton wrote: > > On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > > > >>> Then again, why does this patchset exist? It's a performance > >>> optimisation so presumably someone cares. But not enough to perform > >>> actual measurements :( > >> The patchset exists because of the difference between zero overhead on > >> cpus that don't have drainable lrus, and non-zero overhead. This turns > >> out to be important on workloads where nohz cores are handling 10 Gb > >> traffic in userspace and really, really don't want to be interrupted, > >> or they drop packets on the floor. > > But what is the effect of the patchset? Has it been tested against the > > problematic workload(s)? > > Yes. The result is that syscalls such as mlockall(), which otherwise interrupt > every core, don't interrupt the cores that are running purely in userspace. > Since they are purely in userspace they don't have any drainable pagevecs, > so the patchset means they don't get interrupted and don't drop packets. > > I implemented this against Linux 2.6.38 and our home-grown version of nohz > cpusets back in July 2012, and we have been shipping it to customers since then. argh. Those per-cpu LRU pagevecs were a nasty but very effective locking amortization hack back in, umm, 2002. They have caused quite a lot of weird corner-case behaviour, resulting in all the lru_add_drain_all() calls sprinkled around the place. I'd like to nuke the whole thing, but that would require a fundamental rethnik/rework of all the LRU list locking. 
According to the 8891d6da17db0f changelog, the lru_add_drain_all() in sys_mlock() isn't really required: "it isn't must. but it reduce the failure of moving to unevictable list. its failure can rescue in vmscan later. but reducing is better." I suspect we could just kill it. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:26 ` Andrew Morton @ 2013-08-13 23:04 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 23:04 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer, KOSAKI Motohiro On 8/13/2013 6:26 PM, Andrew Morton wrote: > On Tue, 13 Aug 2013 18:13:48 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> On 8/13/2013 5:13 PM, Andrew Morton wrote: >>> On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: >>> >>>>> Then again, why does this patchset exist? It's a performance >>>>> optimisation so presumably someone cares. But not enough to perform >>>>> actual measurements :( >>>> The patchset exists because of the difference between zero overhead on >>>> cpus that don't have drainable lrus, and non-zero overhead. This turns >>>> out to be important on workloads where nohz cores are handling 10 Gb >>>> traffic in userspace and really, really don't want to be interrupted, >>>> or they drop packets on the floor. >>> But what is the effect of the patchset? Has it been tested against the >>> problematic workload(s)? >> Yes. The result is that syscalls such as mlockall(), which otherwise interrupt >> every core, don't interrupt the cores that are running purely in userspace. >> Since they are purely in userspace they don't have any drainable pagevecs, >> so the patchset means they don't get interrupted and don't drop packets. >> >> I implemented this against Linux 2.6.38 and our home-grown version of nohz >> cpusets back in July 2012, and we have been shipping it to customers since then. > argh. > > Those per-cpu LRU pagevecs were a nasty but very effective locking > amortization hack back in, umm, 2002. They have caused quite a lot of > weird corner-case behaviour, resulting in all the lru_add_drain_all() > calls sprinkled around the place. 
I'd like to nuke the whole thing, > but that would require a fundamental rethnik/rework of all the LRU list > locking. > > According to the 8891d6da17db0f changelog, the lru_add_drain_all() in > sys_mlock() isn't really required: "it isn't must. but it reduce the > failure of moving to unevictable list. its failure can rescue in > vmscan later. but reducing is better." > > I suspect we could just kill it. That's probably true, but I suspect this change is still worthwhile for nohz environments. There are other calls of lru_add_drain_all(), and you just don't want anything in the kernel that interrupts every core when only a subset could be interrupted. If the kernel can avoid generating unnecessary interrupts to uninvolved cores, you can make guarantees about jitter on cores that are running dedicated userspace code. -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 23:04 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 23:04 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer, KOSAKI Motohiro On 8/13/2013 6:26 PM, Andrew Morton wrote: > On Tue, 13 Aug 2013 18:13:48 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> On 8/13/2013 5:13 PM, Andrew Morton wrote: >>> On Tue, 13 Aug 2013 16:59:54 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: >>> >>>>> Then again, why does this patchset exist? It's a performance >>>>> optimisation so presumably someone cares. But not enough to perform >>>>> actual measurements :( >>>> The patchset exists because of the difference between zero overhead on >>>> cpus that don't have drainable lrus, and non-zero overhead. This turns >>>> out to be important on workloads where nohz cores are handling 10 Gb >>>> traffic in userspace and really, really don't want to be interrupted, >>>> or they drop packets on the floor. >>> But what is the effect of the patchset? Has it been tested against the >>> problematic workload(s)? >> Yes. The result is that syscalls such as mlockall(), which otherwise interrupt >> every core, don't interrupt the cores that are running purely in userspace. >> Since they are purely in userspace they don't have any drainable pagevecs, >> so the patchset means they don't get interrupted and don't drop packets. >> >> I implemented this against Linux 2.6.38 and our home-grown version of nohz >> cpusets back in July 2012, and we have been shipping it to customers since then. > argh. > > Those per-cpu LRU pagevecs were a nasty but very effective locking > amortization hack back in, umm, 2002. They have caused quite a lot of > weird corner-case behaviour, resulting in all the lru_add_drain_all() > calls sprinkled around the place. 
I'd like to nuke the whole thing, > but that would require a fundamental rethnik/rework of all the LRU list > locking. > > According to the 8891d6da17db0f changelog, the lru_add_drain_all() in > sys_mlock() isn't really required: "it isn't must. but it reduce the > failure of moving to unevictable list. its failure can rescue in > vmscan later. but reducing is better." > > I suspect we could just kill it. That's probably true, but I suspect this change is still worthwhile for nohz environments. There are other calls of lru_add_drain_all(), and you just don't want anything in the kernel that interrupts every core when only a subset could be interrupted. If the kernel can avoid generating unnecessary interrupts to uninvolved cores, you can make guarantees about jitter on cores that are running dedicated userspace code. -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* [PATCH v7 1/2] workqueue: add schedule_on_each_cpu_cond 2013-08-13 22:13 ` Chris Metcalf @ 2013-08-13 22:51 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 22:51 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This API supports running work on a subset of the online cpus determined by a callback function. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v7: try a version with callbacks instead of cpu masks. Either this or v6 seem like reasonable solutions. v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..c5ee29f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -17,6 +17,7 @@ struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); +typedef bool (*work_cond_func_t)(void *data, int cpu); void delayed_work_timer_fn(unsigned long __data); /* @@ -471,6 +472,8 @@ extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); extern int schedule_on_each_cpu(work_func_t func); +extern int schedule_on_each_cpu_cond(work_func_t func, work_cond_func_t cond, + void *data); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..5c5b534 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2999,6 +2999,60 @@ int schedule_on_each_cpu(work_func_t func) } /** + * schedule_on_each_cpu_cond - execute a function synchronously on each + * online CPU if requested by a condition callback. + * @func: the function to call + * @cond: the callback function to determine whether to schedule the work + * @data: opaque data passed to the callback function + * + * schedule_on_each_cpu_cond() calls @cond for each online cpu (in the + * context of the current cpu), and for each cpu for which @cond returns + * true, it executes @func using the system workqueue. The function + * blocks until all CPUs on which work was scheduled have completed. + * schedule_on_each_cpu_cond() is very slow. + * + * The @cond callback is called in the same context as the original + * call to schedule_on_each_cpu_cond(). + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int schedule_on_each_cpu_cond(work_func_t func, + work_cond_func_t cond, void *data) +{ + int cpu; + struct work_struct __percpu *works; + + works = alloc_percpu(struct work_struct); + if (!works) + return -ENOMEM; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + if (cond(data, cpu)) { + INIT_WORK(work, func); + schedule_work_on(cpu, work); + } else { + work->entry.next = NULL; + } + } + + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + if (work->entry.next) + flush_work(work); + } + + put_online_cpus(); + free_percpu(works); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v7 1/2] workqueue: add schedule_on_each_cpu_cond @ 2013-08-13 22:51 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 22:51 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This API supports running work on a subset of the online cpus determined by a callback function. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v7: try a version with callbacks instead of cpu masks. Either this or v6 seem like reasonable solutions. v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/workqueue.h | 3 +++ kernel/workqueue.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index a0ed78a..c5ee29f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -17,6 +17,7 @@ struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); +typedef bool (*work_cond_func_t)(void *data, int cpu); void delayed_work_timer_fn(unsigned long __data); /* @@ -471,6 +472,8 @@ extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); extern int schedule_on_each_cpu(work_func_t func); +extern int schedule_on_each_cpu_cond(work_func_t func, work_cond_func_t cond, + void *data); int execute_in_process_context(work_func_t fn, struct execute_work *); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4..5c5b534 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2999,6 +2999,60 @@ int schedule_on_each_cpu(work_func_t func) } /** + * schedule_on_each_cpu_cond - execute a function synchronously on each + * online CPU if requested by a condition callback. + * @func: the function to call + * @cond: the callback function to determine whether to schedule the work + * @data: opaque data passed to the callback function + * + * schedule_on_each_cpu_cond() calls @cond for each online cpu (in the + * context of the current cpu), and for each cpu for which @cond returns + * true, it executes @func using the system workqueue. The function + * blocks until all CPUs on which work was scheduled have completed. + * schedule_on_each_cpu_cond() is very slow. + * + * The @cond callback is called in the same context as the original + * call to schedule_on_each_cpu_cond(). + * + * RETURNS: + * 0 on success, -errno on failure. 
+ */ +int schedule_on_each_cpu_cond(work_func_t func, + work_cond_func_t cond, void *data) +{ + int cpu; + struct work_struct __percpu *works; + + works = alloc_percpu(struct work_struct); + if (!works) + return -ENOMEM; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + if (cond(data, cpu)) { + INIT_WORK(work, func); + schedule_work_on(cpu, work); + } else { + work->entry.next = NULL; + } + } + + for_each_online_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + if (work->entry.next) + flush_work(work); + } + + put_online_cpus(); + free_percpu(works); + return 0; +} + +/** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:13 ` Chris Metcalf @ 2013-08-13 22:53 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 22:53 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v7: try a version with callbacks instead of cpu masks. Either this or v6 seem like reasonable solutions. v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
mm/swap.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..fe3a488 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -673,6 +683,14 @@ void lru_add_drain(void) put_cpu(); } +static bool lru_add_drain_cond(void *data, int cpu) +{ + return pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu); +} + static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); @@ -683,7 +701,8 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, + lru_add_drain_cond, NULL); } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:53 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 22:53 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> --- v7: try a version with callbacks instead of cpu masks. Either this or v6 seem like reasonable solutions. v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
mm/swap.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..fe3a488 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -673,6 +683,14 @@ void lru_add_drain(void) put_cpu(); } +static bool lru_add_drain_cond(void *data, int cpu) +{ + return pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu); +} + static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); @@ -683,7 +701,8 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ int lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, + lru_add_drain_cond, NULL); } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:53 ` Chris Metcalf @ 2013-08-13 23:29 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 23:29 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: > int lru_add_drain_all(void) > { > - return schedule_on_each_cpu(lru_add_drain_per_cpu); > + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, > + lru_add_drain_cond, NULL); It won't nest and doing it simultaneously won't buy anything, right? Wouldn't it be better to protect it with a mutex and define all necessary resources statically (yeah, cpumask is pain in the ass and I think we should un-deprecate cpumask_t for static use cases)? Then, there'd be no allocation to worry about on the path. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 23:29 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 23:29 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: > int lru_add_drain_all(void) > { > - return schedule_on_each_cpu(lru_add_drain_per_cpu); > + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, > + lru_add_drain_cond, NULL); It won't nest and doing it simultaneously won't buy anything, right? Wouldn't it be better to protect it with a mutex and define all necessary resources statically (yeah, cpumask is pain in the ass and I think we should un-deprecate cpumask_t for static use cases)? Then, there'd be no allocation to worry about on the path. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-13 23:29 ` Tejun Heo @ 2013-08-13 23:32 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 23:32 UTC (permalink / raw) To: Tejun Heo Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 7:29 PM, Tejun Heo wrote: > Hello, > > On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: >> int lru_add_drain_all(void) >> { >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); >> + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, >> + lru_add_drain_cond, NULL); > It won't nest and doing it simultaneously won't buy anything, right? Correct on both counts, I think. > Wouldn't it be better to protect it with a mutex and define all > necessary resources statically (yeah, cpumask is pain in the ass and I > think we should un-deprecate cpumask_t for static use cases)? Then, > there'd be no allocation to worry about on the path. If allocation is a real problem on this path, I think this is probably OK, though I don't want to speak for Andrew. You could just guard it with a trylock and any caller that tried to start it while it was locked could just return happy that it was going on. I'll put out a version that does that and see how that looks for comparison's sake. -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 23:32 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 23:32 UTC (permalink / raw) To: Tejun Heo Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 7:29 PM, Tejun Heo wrote: > Hello, > > On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: >> int lru_add_drain_all(void) >> { >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); >> + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, >> + lru_add_drain_cond, NULL); > It won't nest and doing it simultaneously won't buy anything, right? Correct on both counts, I think. > Wouldn't it be better to protect it with a mutex and define all > necessary resources statically (yeah, cpumask is pain in the ass and I > think we should un-deprecate cpumask_t for static use cases)? Then, > there'd be no allocation to worry about on the path. If allocation is a real problem on this path, I think this is probably OK, though I don't want to speak for Andrew. You could just guard it with a trylock and any caller that tried to start it while it was locked could just return happy that it was going on. I'll put out a version that does that and see how that looks for comparison's sake. -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-13 23:32 ` Chris Metcalf @ 2013-08-14 6:46 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 6:46 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 19:32:37 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > On 8/13/2013 7:29 PM, Tejun Heo wrote: > > Hello, > > > > On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: > >> int lru_add_drain_all(void) > >> { > >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); > >> + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, > >> + lru_add_drain_cond, NULL); This version looks nice to me. It's missing the conversion of schedule_on_each_cpu(), but I suppose that will be pretty simple. > > It won't nest and doing it simultaneously won't buy anything, right? > > Correct on both counts, I think. I'm glad you understood the question :( What does "nest" mean? lru_add_drain_all() calls itself recursively, presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL) route? If that ever happens then we'd certainly want to know about it. Hopefully PF_MEMALLOC would prevent infinite recursion. If "nest" means something else then please enlighten me! As for "doing it simultaneously", I assume we're referring to concurrent execution from separate threads. If so, why would that "buy us anything"? Confused. As long as each thread sees "all pages which were in pagevecs at the time I called lru_add_drain_all() get spilled onto the LRU" then we're good. afaict the implementation will do this. > > Wouldn't it be better to protect it with a mutex and define all > > necessary resources statically (yeah, cpumask is pain in the ass and I > > think we should un-deprecate cpumask_t for static use cases)? Then, > > there'd be no allocation to worry about on the path. 
> > If allocation is a real problem on this path, I think this is probably Well as you pointed out, alloc_percpu() can already do a GFP_KERNEL allocation, so adding another GFP_KERNEL allocation won't cause great problems. But the patchset demonstrates that the additional allocation isn't needed. > OK, though I don't want to speak for Andrew. You could just guard it > with a trylock and any caller that tried to start it while it was > locked could just return happy that it was going on. > > I'll put out a version that does that and see how that looks > for comparison's sake. That one's no good. If thread A is holding the mutex, thread B will bale out and we broke lru_add_drain_all()'s contract, "all pages which ...", above. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-14 6:46 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 6:46 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 19:32:37 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > On 8/13/2013 7:29 PM, Tejun Heo wrote: > > Hello, > > > > On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: > >> int lru_add_drain_all(void) > >> { > >> - return schedule_on_each_cpu(lru_add_drain_per_cpu); > >> + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, > >> + lru_add_drain_cond, NULL); This version looks nice to me. It's missing the conversion of schedule_on_each_cpu(), but I suppose that will be pretty simple. > > It won't nest and doing it simultaneously won't buy anything, right? > > Correct on both counts, I think. I'm glad you understood the question :( What does "nest" mean? lru_add_drain_all() calls itself recursively, presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL) route? If that ever happens then we'd certainly want to know about it. Hopefully PF_MEMALLOC would prevent infinite recursion. If "nest" means something else then please enlighten me! As for "doing it simultaneously", I assume we're referring to concurrent execution from separate threads. If so, why would that "buy us anything"? Confused. As long as each thread sees "all pages which were in pagevecs at the time I called lru_add_drain_all() get spilled onto the LRU" then we're good. afaict the implementation will do this. > > Wouldn't it be better to protect it with a mutex and define all > > necessary resources statically (yeah, cpumask is pain in the ass and I > > think we should un-deprecate cpumask_t for static use cases)? Then, > > there'd be no allocation to worry about on the path. 
> > If allocation is a real problem on this path, I think this is probably Well as you pointed out, alloc_percpu() can already do a GFP_KERNEL allocation, so adding another GFP_KERNEL allocation won't cause great problems. But the patchset demonstrates that the additional allocation isn't needed. > OK, though I don't want to speak for Andrew. You could just guard it > with a trylock and any caller that tried to start it while it was > locked could just return happy that it was going on. > > I'll put out a version that does that and see how that looks > for comparison's sake. That one's no good. If thread A is holding the mutex, thread B will bale out and we broke lru_add_drain_all()'s contract, "all pages which ...", above. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-14 6:46 ` Andrew Morton @ 2013-08-14 13:05 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 13:05 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 11:46:29PM -0700, Andrew Morton wrote: > What does "nest" mean? lru_add_drain_all() calls itself recursively, > presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL) > route? If that ever happens then we'd certainly want to know about it. > Hopefully PF_MEMALLOC would prevent infinite recursion. > > If "nest" means something else then please enlighten me! > > As for "doing it simultaneously", I assume we're referring to > concurrent execution from separate threads. If so, why would that "buy > us anything"? Confused. As long as each thread sees "all pages which > were in pagevecs at the time I called lru_add_drain_all() get spilled > onto the LRU" then we're good. afaict the implementation will do this. I was wondering whether we can avoid all allocations by just pre-allocating all resources. If it can't call itself if we get rid of all allocations && running multiple instances of them doesn't buy us anything, the best solution would be allocating work items statically and synchronize their use using a mutex. That way the whole thing wouldn't need any allocation. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-14 13:05 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 13:05 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 11:46:29PM -0700, Andrew Morton wrote: > What does "nest" mean? lru_add_drain_all() calls itself recursively, > presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL) > route? If that ever happens then we'd certainly want to know about it. > Hopefully PF_MEMALLOC would prevent infinite recursion. > > If "nest" means something else then please enlighten me! > > As for "doing it simultaneously", I assume we're referring to > concurrent execution from separate threads. If so, why would that "buy > us anything"? Confused. As long as each thread sees "all pages which > were in pagevecs at the time I called lru_add_drain_all() get spilled > onto the LRU" then we're good. afaict the implementation will do this. I was wondering whether we can avoid all allocations by just pre-allocating all resources. If it can't call itself if we get rid of all allocations && running multiple instances of them doesn't buy us anything, the best solution would be allocating work items statically and synchronize their use using a mutex. That way the whole thing wouldn't need any allocation. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-14 6:46 ` Andrew Morton @ 2013-08-14 16:03 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 16:03 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/14/2013 2:46 AM, Andrew Morton wrote: > On Tue, 13 Aug 2013 19:32:37 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> On 8/13/2013 7:29 PM, Tejun Heo wrote: >>> Hello, >>> >>> On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: >>>> int lru_add_drain_all(void) >>>> { >>>> - return schedule_on_each_cpu(lru_add_drain_per_cpu); >>>> + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, >>>> + lru_add_drain_cond, NULL); > This version looks nice to me. It's missing the conversion of > schedule_on_each_cpu(), but I suppose that will be pretty simple. I assume you were thinking of something like a NULL "cond" function pointer meaning "no condition" in the implementation, and then schedule_on_each_cpu just calls schedule_on_each_cpu_cond(work, NULL, NULL)? >>> It won't nest and doing it simultaneously won't buy anything, right? >> Correct on both counts, I think. > I'm glad you understood the question :( > > What does "nest" mean? lru_add_drain_all() calls itself recursively, > presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL) > route? If that ever happens then we'd certainly want to know about it. > Hopefully PF_MEMALLOC would prevent infinite recursion. Well, I think "it won't nest" is true in exactly the way you say - or maybe we want to say "it shouldn't nest". Catching the condition would obviously be a good thing. > If "nest" means something else then please enlighten me! > > As for "doing it simultaneously", I assume we're referring to > concurrent execution from separate threads. If so, why would that "buy > us anything"? Confused. 
As long as each thread sees "all pages which > were in pagevecs at the time I called lru_add_drain_all() get spilled > onto the LRU" then we're good. afaict the implementation will do this. If we can avoid doing it simultaneously, we avoid having to do any allocation at all, which seems like a win. Let's consider if we simply do a mutex_lock() around the routine. If we also then test to see which cpus need to be drained before issuing the drain, the result for two cpus that both try to drain simultaneously would be that one succeeds and drains everything; the second blocks on the mutex until the first is done, then quickly enters the code, finds that no cpus require draining, and returns, faster than would have been the case in the current code when it would also have fired drain requests at every cpu, concurrently with the earlier cpus firing its drain requests. Frankly, this seems like a pretty reasonable solution. We can implement it purely in swap.c using existing workqueue APIs. Note that lru_add_drain_all() always returns success this way, so I've changed the API to make it a void function. Considering none of the callers currently check the error return value anyway, that seemed like a straightforward thing to do. :-) Unfortunately DEFINE_PER_CPU doesn't work at function scope, so I have to make it file-static. 
Assuming the existence of my "need_activate_page_drain()" helper from the earlier versions of the patch set, lru_add_drain_all() looks like the following: static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); int cpu; mutex_lock(&lock); get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); } else { work->entry.next = NULL; } } for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (work->entry.next) flush_work(work); } put_online_cpus(); mutex_unlock(&lock); } Tejun, I don't know if you have a better idea for how to mark a work_struct as being "not used" so we can set and test it here. Is setting entry.next to NULL good? Should we offer it as an API in the workqueue header? We could wrap the whole thing in a new workqueue API too, of course (schedule_on_each_cpu_cond_sequential??) but it seems better at this point to wait until we find another caller with similar needs, and only then factor the code into a new workqueue API. -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-14 16:03 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 16:03 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/14/2013 2:46 AM, Andrew Morton wrote: > On Tue, 13 Aug 2013 19:32:37 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> On 8/13/2013 7:29 PM, Tejun Heo wrote: >>> Hello, >>> >>> On Tue, Aug 13, 2013 at 06:53:32PM -0400, Chris Metcalf wrote: >>>> int lru_add_drain_all(void) >>>> { >>>> - return schedule_on_each_cpu(lru_add_drain_per_cpu); >>>> + return schedule_on_each_cpu_cond(lru_add_drain_per_cpu, >>>> + lru_add_drain_cond, NULL); > This version looks nice to me. It's missing the conversion of > schedule_on_each_cpu(), but I suppose that will be pretty simple. I assume you were thinking of something like a NULL "cond" function pointer meaning "no condition" in the implementation, and then schedule_on_each_cpu just calls schedule_on_each_cpu_cond(work, NULL, NULL)? >>> It won't nest and doing it simultaneously won't buy anything, right? >> Correct on both counts, I think. > I'm glad you understood the question :( > > What does "nest" mean? lru_add_drain_all() calls itself recursively, > presumably via some ghastly alloc_percpu()->alloc_pages(GFP_KERNEL) > route? If that ever happens then we'd certainly want to know about it. > Hopefully PF_MEMALLOC would prevent infinite recursion. Well, I think "it won't nest" is true in exactly the way you say - or maybe we want to say "it shouldn't nest". Catching the condition would obviously be a good thing. > If "nest" means something else then please enlighten me! > > As for "doing it simultaneously", I assume we're referring to > concurrent execution from separate threads. If so, why would that "buy > us anything"? Confused. 
As long as each thread sees "all pages which > were in pagevecs at the time I called lru_add_drain_all() get spilled > onto the LRU" then we're good. afaict the implementation will do this. If we can avoid doing it simultaneously, we avoid having to do any allocation at all, which seems like a win. Let's consider if we simply do a mutex_lock() around the routine. If we also then test to see which cpus need to be drained before issuing the drain, the result for two cpus that both try to drain simultaneously would be that one succeeds and drains everything; the second blocks on the mutex until the first is done, then quickly enters the code, finds that no cpus require draining, and returns, faster than would have been the case in the current code when it would also have fired drain requests at every cpu, concurrently with the earlier cpus firing its drain requests. Frankly, this seems like a pretty reasonable solution. We can implement it purely in swap.c using existing workqueue APIs. Note that lru_add_drain_all() always returns success this way, so I've changed the API to make it a void function. Considering none of the callers currently check the error return value anyway, that seemed like a straightforward thing to do. :-) Unfortunately DEFINE_PER_CPU doesn't work at function scope, so I have to make it file-static. 
Assuming the existence of my "need_activate_page_drain()" helper from the earlier versions of the patch set, lru_add_drain_all() looks like the following: static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); int cpu; mutex_lock(&lock); get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); } else { work->entry.next = NULL; } } for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (work->entry.next) flush_work(work); } put_online_cpus(); mutex_unlock(&lock); } Tejun, I don't know if you have a better idea for how to mark a work_struct as being "not used" so we can set and test it here. Is setting entry.next to NULL good? Should we offer it as an API in the workqueue header? We could wrap the whole thing in a new workqueue API too, of course (schedule_on_each_cpu_cond_sequential??) but it seems better at this point to wait until we find another caller with similar needs, and only then factor the code into a new workqueue API. -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-14 16:03 ` Chris Metcalf @ 2013-08-14 16:57 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 16:57 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote: > Tejun, I don't know if you have a better idea for how to mark a > work_struct as being "not used" so we can set and test it here. > Is setting entry.next to NULL good? Should we offer it as an API > in the workqueue header? Maybe simply defining a static cpumask would be cleaner? > We could wrap the whole thing in a new workqueue API too, of course > (schedule_on_each_cpu_cond_sequential??) but it seems better at this > point to wait until we find another caller with similar needs, and only > then factor the code into a new workqueue API. We can have e.g. __schedule_on_cpu(fn, pcpu_works) but yeah it seems a bit excessive at this point. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-14 16:57 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 16:57 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote: > Tejun, I don't know if you have a better idea for how to mark a > work_struct as being "not used" so we can set and test it here. > Is setting entry.next to NULL good? Should we offer it as an API > in the workqueue header? Maybe simply defining a static cpumask would be cleaner? > We could wrap the whole thing in a new workqueue API too, of course > (schedule_on_each_cpu_cond_sequential??) but it seems better at this > point to wait until we find another caller with similar needs, and only > then factor the code into a new workqueue API. We can have e.g. __schedule_on_cpu(fn, pcpu_works) but yeah it seems a bit excessive at this point. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-14 16:57 ` Tejun Heo @ 2013-08-14 17:18 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 17:18 UTC (permalink / raw) To: Tejun Heo, Andrew Morton Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/14/2013 12:57 PM, Tejun Heo wrote: > Hello, Chris. > > On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote: >> Tejun, I don't know if you have a better idea for how to mark a >> work_struct as being "not used" so we can set and test it here. >> Is setting entry.next to NULL good? Should we offer it as an API >> in the workqueue header? > Maybe simply defining a static cpumask would be cleaner? I think you're right, actually. Andrew, Tejun, how does this look? static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; int cpu; mutex_lock(&lock); get_online_cpus(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); put_online_cpus(); mutex_unlock(&lock); } -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-14 17:18 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 17:18 UTC (permalink / raw) To: Tejun Heo, Andrew Morton Cc: linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/14/2013 12:57 PM, Tejun Heo wrote: > Hello, Chris. > > On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote: >> Tejun, I don't know if you have a better idea for how to mark a >> work_struct as being "not used" so we can set and test it here. >> Is setting entry.next to NULL good? Should we offer it as an API >> in the workqueue header? > Maybe simply defining a static cpumask would be cleaner? I think you're right, actually. Andrew, Tejun, how does this look? static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; int cpu; mutex_lock(&lock); get_online_cpus(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); put_online_cpus(); mutex_unlock(&lock); } -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-14 17:18 ` Chris Metcalf @ 2013-08-14 20:07 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 20:07 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Wed, Aug 14, 2013 at 01:18:31PM -0400, Chris Metcalf wrote: > On 8/14/2013 12:57 PM, Tejun Heo wrote: > > Hello, Chris. > > > > On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote: > >> Tejun, I don't know if you have a better idea for how to mark a > >> work_struct as being "not used" so we can set and test it here. > >> Is setting entry.next to NULL good? Should we offer it as an API > >> in the workqueue header? > > Maybe simply defining a static cpumask would be cleaner? > > I think you're right, actually. Andrew, Tejun, how does this look? Looks good to me. Please feel free to add Reviewed-by: Tejun Heo <tj@kernel.org> Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-14 20:07 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 20:07 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Chris. On Wed, Aug 14, 2013 at 01:18:31PM -0400, Chris Metcalf wrote: > On 8/14/2013 12:57 PM, Tejun Heo wrote: > > Hello, Chris. > > > > On Wed, Aug 14, 2013 at 12:03:39PM -0400, Chris Metcalf wrote: > >> Tejun, I don't know if you have a better idea for how to mark a > >> work_struct as being "not used" so we can set and test it here. > >> Is setting entry.next to NULL good? Should we offer it as an API > >> in the workqueue header? > > Maybe simply defining a static cpumask would be cleaner? > > I think you're right, actually. Andrew, Tejun, how does this look? Looks good to me. Please feel free to add Reviewed-by: Tejun Heo <tj@kernel.org> Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 20:07 ` Tejun Heo @ 2013-08-14 20:22 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 20:22 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> Reviewed-by: Tejun Heo <tj@kernel.org> --- v8: Here's an actual git patch after Tejun's review. Pull everything inline into lru_add_drain_all(), and use a mutex plus build-time allocated work_structs to avoid allocation. v7: try a version with callbacks instead of cpu masks. Either this or v6 seem like reasonable solutions. v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/swap.h | 2 +- mm/swap.c | 44 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d95cde5..acea8e0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -247,7 +247,7 @@ extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); -extern int lru_add_drain_all(void); +extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_page(struct page *page); extern void swap_setup(void); diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..8d19543 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) lru_add_drain(); } -/* - * Returns 0 for success - */ -int lru_add_drain_all(void) +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + +void lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + static DEFINE_MUTEX(lock); + static struct cpumask has_work; + int cpu; + + mutex_lock(&lock); + get_online_cpus(); + cpumask_clear(&has_work); + + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + 
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) { + INIT_WORK(work, lru_add_drain_per_cpu); + schedule_work_on(cpu, work); + cpumask_set_cpu(cpu, &has_work); + } + } + + for_each_cpu(cpu, &has_work) + flush_work(&per_cpu(lru_add_drain_work, cpu)); + + put_online_cpus(); + mutex_unlock(&lock); } /* -- 1.8.3.1 ^ permalink raw reply related [flat|nested] 104+ messages in thread
* [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 20:22 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 20:22 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer This change makes lru_add_drain_all() only selectively interrupt the cpus that have per-cpu free pages that can be drained. This is important in nohz mode where calling mlockall(), for example, otherwise will interrupt every core unnecessarily. Signed-off-by: Chris Metcalf <cmetcalf@tilera.com> Reviewed-by: Tejun Heo <tj@kernel.org> --- v8: Here's an actual git patch after Tejun's review. Pull everything inline into lru_add_drain_all(), and use a mutex plus build-time allocated work_structs to avoid allocation. v7: try a version with callbacks instead of cpu masks. Either this or v6 seem like reasonable solutions. v6: add Tejun's Acked-by, and add missing get/put_cpu_online to lru_add_drain_all(). v5: provide validity checking on the cpumask for schedule_on_cpu_mask. By providing an all-or-nothing EINVAL check, we impose the requirement that the calling code actually know clearly what it's trying to do. (Note: no change to the mm/swap.c commit) v4: don't lose possible -ENOMEM in schedule_on_each_cpu() (Note: no change to the mm/swap.c commit) v3: split commit into two, one for workqueue and one for mm, though both should probably be taken through -mm. 
include/linux/swap.h | 2 +- mm/swap.c | 44 +++++++++++++++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d95cde5..acea8e0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -247,7 +247,7 @@ extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); -extern int lru_add_drain_all(void); +extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_page(struct page *page); extern void swap_setup(void); diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..8d19543 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) pagevec_lru_move_fn(pvec, __activate_page, NULL); } +static bool need_activate_page_drain(int cpu) +{ + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; +} + void activate_page(struct page *page) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) { } +static bool need_activate_page_drain(int cpu) +{ + return false; +} + void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) lru_add_drain(); } -/* - * Returns 0 for success - */ -int lru_add_drain_all(void) +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + +void lru_add_drain_all(void) { - return schedule_on_each_cpu(lru_add_drain_per_cpu); + static DEFINE_MUTEX(lock); + static struct cpumask has_work; + int cpu; + + mutex_lock(&lock); + get_online_cpus(); + cpumask_clear(&has_work); + + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || + 
pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + need_activate_page_drain(cpu)) { + INIT_WORK(work, lru_add_drain_per_cpu); + schedule_work_on(cpu, work); + cpumask_set_cpu(cpu, &has_work); + } + } + + for_each_cpu(cpu, &has_work) + flush_work(&per_cpu(lru_add_drain_work, cpu)); + + put_online_cpus(); + mutex_unlock(&lock); } /* -- 1.8.3.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 20:22 ` Chris Metcalf @ 2013-08-14 20:44 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 20:44 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. > I think the patch will work, but it's a bit sad to no longer gain the general ability to do schedule_on_some_cpus(). Oh well. > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -247,7 +247,7 @@ extern void activate_page(struct page *); > extern void mark_page_accessed(struct page *); > extern void lru_add_drain(void); > extern void lru_add_drain_cpu(int cpu); > -extern int lru_add_drain_all(void); > +extern void lru_add_drain_all(void); > extern void rotate_reclaimable_page(struct page *page); > extern void deactivate_page(struct page *page); > extern void swap_setup(void); > diff --git a/mm/swap.c b/mm/swap.c > index 4a1d0d2..8d19543 100644 > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) > pagevec_lru_move_fn(pvec, __activate_page, NULL); > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > +} static int need_activate_page_drain(int cpu) { return pagevec_count(&per_cpu(activate_page_pvecs, cpu)); } would be shorter and faster. bool rather sucks that way. It's a performance-vs-niceness thing. I guess one has to look at the call frequency when deciding. 
> void activate_page(struct page *page) > { > if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { > @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) > { > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return false; > +} > + > void activate_page(struct page *page) > { > struct zone *zone = page_zone(page); > @@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) > lru_add_drain(); > } > > -/* > - * Returns 0 for success > - */ > -int lru_add_drain_all(void) > +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); > + > +void lru_add_drain_all(void) > { > - return schedule_on_each_cpu(lru_add_drain_per_cpu); > + static DEFINE_MUTEX(lock); > + static struct cpumask has_work; > + int cpu; > + > + mutex_lock(&lock); This is a bit scary but I expect it will be OK - later threads will just twiddle thumbs while some other thread does all or most of their work for them. > + get_online_cpus(); > + cpumask_clear(&has_work); > + > + for_each_online_cpu(cpu) { > + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); > + > + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > + need_activate_page_drain(cpu)) { > + INIT_WORK(work, lru_add_drain_per_cpu); This initialization is only needed once per boot but I don't see a convenient way of doing this. > + schedule_work_on(cpu, work); > + cpumask_set_cpu(cpu, &has_work); > + } > + } > + > + for_each_cpu(cpu, &has_work) for_each_online_cpu()? > + flush_work(&per_cpu(lru_add_drain_work, cpu)); > + > + put_online_cpus(); > + mutex_unlock(&lock); > } ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 20:44 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 20:44 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. > I think the patch will work, but it's a bit sad to no longer gain the general ability to do schedule_on_some_cpus(). Oh well. > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -247,7 +247,7 @@ extern void activate_page(struct page *); > extern void mark_page_accessed(struct page *); > extern void lru_add_drain(void); > extern void lru_add_drain_cpu(int cpu); > -extern int lru_add_drain_all(void); > +extern void lru_add_drain_all(void); > extern void rotate_reclaimable_page(struct page *page); > extern void deactivate_page(struct page *page); > extern void swap_setup(void); > diff --git a/mm/swap.c b/mm/swap.c > index 4a1d0d2..8d19543 100644 > --- a/mm/swap.c > +++ b/mm/swap.c > @@ -405,6 +405,11 @@ static void activate_page_drain(int cpu) > pagevec_lru_move_fn(pvec, __activate_page, NULL); > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > +} static int need_activate_page_drain(int cpu) { return pagevec_count(&per_cpu(activate_page_pvecs, cpu)); } would be shorter and faster. bool rather sucks that way. It's a performance-vs-niceness thing. I guess one has to look at the call frequency when deciding. 
> void activate_page(struct page *page) > { > if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { > @@ -422,6 +427,11 @@ static inline void activate_page_drain(int cpu) > { > } > > +static bool need_activate_page_drain(int cpu) > +{ > + return false; > +} > + > void activate_page(struct page *page) > { > struct zone *zone = page_zone(page); > @@ -678,12 +688,36 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) > lru_add_drain(); > } > > -/* > - * Returns 0 for success > - */ > -int lru_add_drain_all(void) > +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); > + > +void lru_add_drain_all(void) > { > - return schedule_on_each_cpu(lru_add_drain_per_cpu); > + static DEFINE_MUTEX(lock); > + static struct cpumask has_work; > + int cpu; > + > + mutex_lock(&lock); This is a bit scary but I expect it will be OK - later threads will just twiddle thumbs while some other thread does all or most of their work for them. > + get_online_cpus(); > + cpumask_clear(&has_work); > + > + for_each_online_cpu(cpu) { > + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); > + > + if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > + pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > + need_activate_page_drain(cpu)) { > + INIT_WORK(work, lru_add_drain_per_cpu); This initialization is only needed once per boot but I don't see a convenient way of doing this. > + schedule_work_on(cpu, work); > + cpumask_set_cpu(cpu, &has_work); > + } > + } > + > + for_each_cpu(cpu, &has_work) for_each_online_cpu()? > + flush_work(&per_cpu(lru_add_drain_work, cpu)); > + > + put_online_cpus(); > + mutex_unlock(&lock); > } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 20:44 ` Andrew Morton @ 2013-08-14 20:50 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 20:50 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Wed, Aug 14, 2013 at 01:44:30PM -0700, Andrew Morton wrote: > > +static bool need_activate_page_drain(int cpu) > > +{ > > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > > +} > > static int need_activate_page_drain(int cpu) > { > return pagevec_count(&per_cpu(activate_page_pvecs, cpu)); > } > > would be shorter and faster. bool rather sucks that way. It's a > performance-vs-niceness thing. I guess one has to look at the call > frequency when deciding. "!= 0" can be dropped but I'm fairly sure the compiler would be able to figure out that the type conversion can be skipped. It's a trivial optimization. > > + schedule_work_on(cpu, work); > > + cpumask_set_cpu(cpu, &has_work); > > + } > > + } > > + > > + for_each_cpu(cpu, &has_work) > > for_each_online_cpu()? That would lead to flushing work items which aren't used and may not have been initialized yet, no? > > + flush_work(&per_cpu(lru_add_drain_work, cpu)); > > + > > + put_online_cpus(); > > + mutex_unlock(&lock); > > } > Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 20:50 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-14 20:50 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Wed, Aug 14, 2013 at 01:44:30PM -0700, Andrew Morton wrote: > > +static bool need_activate_page_drain(int cpu) > > +{ > > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > > +} > > static int need_activate_page_drain(int cpu) > { > return pagevec_count(&per_cpu(activate_page_pvecs, cpu)); > } > > would be shorter and faster. bool rather sucks that way. It's a > performance-vs-niceness thing. I guess one has to look at the call > frequency when deciding. "!= 0" can be dropped but I'm fairly sure the compiler would be able to figure out that the type conversion can be skipped. It's a trivial optimization. > > + schedule_work_on(cpu, work); > > + cpumask_set_cpu(cpu, &has_work); > > + } > > + } > > + > > + for_each_cpu(cpu, &has_work) > > for_each_online_cpu()? That would lead to flushing work items which aren't used and may not have been initialized yet, no? > > + flush_work(&per_cpu(lru_add_drain_work, cpu)); > > + > > + put_online_cpus(); > > + mutex_unlock(&lock); > > } > Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 20:50 ` Tejun Heo @ 2013-08-14 21:03 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 21:03 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:50:29 -0400 Tejun Heo <tj@kernel.org> wrote: > > > + for_each_cpu(cpu, &has_work) > > > > for_each_online_cpu()? > > That would lead to flushing work items which aren't used and may not > have been initialized yet, no? doh, I confused for_each_cpu with for_each_possible_cpu. Dumb name. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 21:03 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 21:03 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:50:29 -0400 Tejun Heo <tj@kernel.org> wrote: > > > + for_each_cpu(cpu, &has_work) > > > > for_each_online_cpu()? > > That would lead to flushing work items which aren't used and may not > have been initialized yet, no? doh, I confused for_each_cpu with for_each_possible_cpu. Dumb name. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 20:50 ` Tejun Heo @ 2013-08-14 21:07 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 21:07 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:50:29 -0400 Tejun Heo <tj@kernel.org> wrote: > On Wed, Aug 14, 2013 at 01:44:30PM -0700, Andrew Morton wrote: > > > +static bool need_activate_page_drain(int cpu) > > > +{ > > > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > > > +} > > > > static int need_activate_page_drain(int cpu) > > { > > return pagevec_count(&per_cpu(activate_page_pvecs, cpu)); > > } > > > > would be shorter and faster. bool rather sucks that way. It's a > > performance-vs-niceness thing. I guess one has to look at the call > > frequency when deciding. > > "!= 0" can be dropped but I'm fairly sure the compiler would be able > to figure out that the type conversion can be skipped. It's a trivial > optimization. The != 0 can surely be removed and that shouldn't make any difference to generated code. The compiler will always need to do the int-to-bool conversion and that's overhead which is added by using bool. It's possible that the compiler will optmise away the int-to-bool conversion when inlining this function into a callsite. I don't know whether the compiler _does_ do this and it will be version dependent. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 21:07 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 21:07 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:50:29 -0400 Tejun Heo <tj@kernel.org> wrote: > On Wed, Aug 14, 2013 at 01:44:30PM -0700, Andrew Morton wrote: > > > +static bool need_activate_page_drain(int cpu) > > > +{ > > > + return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0; > > > +} > > > > static int need_activate_page_drain(int cpu) > > { > > return pagevec_count(&per_cpu(activate_page_pvecs, cpu)); > > } > > > > would be shorter and faster. bool rather sucks that way. It's a > > performance-vs-niceness thing. I guess one has to look at the call > > frequency when deciding. > > "!= 0" can be dropped but I'm fairly sure the compiler would be able > to figure out that the type conversion can be skipped. It's a trivial > optimization. The != 0 can surely be removed and that shouldn't make any difference to generated code. The compiler will always need to do the int-to-bool conversion and that's overhead which is added by using bool. It's possible that the compiler will optmise away the int-to-bool conversion when inlining this function into a callsite. I don't know whether the compiler _does_ do this and it will be version dependent. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 20:22 ` Chris Metcalf @ 2013-08-14 21:12 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 21:12 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. Changelog isn't very informative. I added this: : This is important on workloads where nohz cores are handling 10 Gb traffic : in userspace. Those CPUs do not enter the kernel and place pages into LRU : pagevecs and they really, really don't want to be interrupted, or they : drop packets on the floor. to attempt to describe the rationale for the patch. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 21:12 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-14 21:12 UTC (permalink / raw) To: Chris Metcalf Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > This change makes lru_add_drain_all() only selectively interrupt > the cpus that have per-cpu free pages that can be drained. > > This is important in nohz mode where calling mlockall(), for > example, otherwise will interrupt every core unnecessarily. Changelog isn't very informative. I added this: : This is important on workloads where nohz cores are handling 10 Gb traffic : in userspace. Those CPUs do not enter the kernel and place pages into LRU : pagevecs and they really, really don't want to be interrupted, or they : drop packets on the floor. to attempt to describe the rationale for the patch. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective 2013-08-14 21:12 ` Andrew Morton @ 2013-08-14 21:23 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 21:23 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/14/2013 5:12 PM, Andrew Morton wrote: > On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> This change makes lru_add_drain_all() only selectively interrupt >> the cpus that have per-cpu free pages that can be drained. >> >> This is important in nohz mode where calling mlockall(), for >> example, otherwise will interrupt every core unnecessarily. > Changelog isn't very informative. I added this: > > : This is important on workloads where nohz cores are handling 10 Gb traffic > : in userspace. Those CPUs do not enter the kernel and place pages into LRU > : pagevecs and they really, really don't want to be interrupted, or they > : drop packets on the floor. > > to attempt to describe the rationale for the patch. Thanks. More motivational text is always a good thing. -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v8] mm: make lru_add_drain_all() selective @ 2013-08-14 21:23 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-14 21:23 UTC (permalink / raw) To: Andrew Morton Cc: Tejun Heo, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/14/2013 5:12 PM, Andrew Morton wrote: > On Wed, 14 Aug 2013 16:22:18 -0400 Chris Metcalf <cmetcalf@tilera.com> wrote: > >> This change makes lru_add_drain_all() only selectively interrupt >> the cpus that have per-cpu free pages that can be drained. >> >> This is important in nohz mode where calling mlockall(), for >> example, otherwise will interrupt every core unnecessarily. > Changelog isn't very informative. I added this: > > : This is important on workloads where nohz cores are handling 10 Gb traffic > : in userspace. Those CPUs do not enter the kernel and place pages into LRU > : pagevecs and they really, really don't want to be interrupted, or they > : drop packets on the floor. > > to attempt to describe the rationale for the patch. Thanks. More motivational text is always a good thing. -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-13 23:29 ` Tejun Heo @ 2013-08-13 23:44 ` Chris Metcalf -1 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 23:44 UTC (permalink / raw) To: Tejun Heo Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 7:29 PM, Tejun Heo wrote: > It won't nest and doing it simultaneously won't buy anything, right? > Wouldn't it be better to protect it with a mutex and define all > necessary resources statically (yeah, cpumask is pain in the ass and I > think we should un-deprecate cpumask_t for static use cases)? Then, > there'd be no allocation to worry about on the path. Here's what lru_add_drain_all() looks like with a guarding mutex. Pretty much the same code complexity as when we have to allocate the cpumask, and there really aren't any issues from locking, since we can assume all is well and return immediately if we fail to get the lock. int lru_add_drain_all(void) { static struct cpumask mask; static DEFINE_MUTEX(lock); int cpu, rc; if (!mutex_trylock(&lock)) return 0; /* already ongoing elsewhere */ cpumask_clear(&mask); get_online_cpus(); /* * Figure out which cpus need flushing. It's OK if we race * with changes to the per-cpu lru pvecs, since it's no worse * than if we flushed all cpus, since a cpu could still end * up putting pages back on its pvec before we returned. * And this avoids interrupting other cpus unnecessarily. */ for_each_online_cpu(cpu) { if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) cpumask_set_cpu(cpu, &mask); } rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, &mask); put_online_cpus(); mutex_unlock(&lock); return rc; } -- Chris Metcalf, Tilera Corp. http://www.tilera.com ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 23:44 ` Chris Metcalf 0 siblings, 0 replies; 104+ messages in thread From: Chris Metcalf @ 2013-08-13 23:44 UTC (permalink / raw) To: Tejun Heo Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On 8/13/2013 7:29 PM, Tejun Heo wrote: > It won't nest and doing it simultaneously won't buy anything, right? > Wouldn't it be better to protect it with a mutex and define all > necessary resources statically (yeah, cpumask is pain in the ass and I > think we should un-deprecate cpumask_t for static use cases)? Then, > there'd be no allocation to worry about on the path. Here's what lru_add_drain_all() looks like with a guarding mutex. Pretty much the same code complexity as when we have to allocate the cpumask, and there really aren't any issues from locking, since we can assume all is well and return immediately if we fail to get the lock. int lru_add_drain_all(void) { static struct cpumask mask; static DEFINE_MUTEX(lock); int cpu, rc; if (!mutex_trylock(&lock)) return 0; /* already ongoing elsewhere */ cpumask_clear(&mask); get_online_cpus(); /* * Figure out which cpus need flushing. It's OK if we race * with changes to the per-cpu lru pvecs, since it's no worse * than if we flushed all cpus, since a cpu could still end * up putting pages back on its pvec before we returned. * And this avoids interrupting other cpus unnecessarily. */ for_each_online_cpu(cpu) { if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) cpumask_set_cpu(cpu, &mask); } rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, &mask); put_online_cpus(); mutex_unlock(&lock); return rc; } -- Chris Metcalf, Tilera Corp. http://www.tilera.com -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective 2013-08-13 23:44 ` Chris Metcalf @ 2013-08-13 23:51 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 23:51 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, Aug 13, 2013 at 07:44:55PM -0400, Chris Metcalf wrote: > int lru_add_drain_all(void) > { > static struct cpumask mask; Instead of cpumask, > static DEFINE_MUTEX(lock); you can DEFINE_PER_CPU(struct work_struct, ...). > for_each_online_cpu(cpu) { > if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > need_activate_page_drain(cpu)) > cpumask_set_cpu(cpu, &mask); and schedule the work items directly. > } > > rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, &mask); Open coding flushing can be a bit bothersome but you can create a per-cpu workqueue and schedule work items on it and then flush the workqueue instead too. No matter how flushing is implemented, the path wouldn't have any memory allocation, which I thought was the topic of the thread, no? Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v7 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 23:51 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 23:51 UTC (permalink / raw) To: Chris Metcalf Cc: Andrew Morton, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, Aug 13, 2013 at 07:44:55PM -0400, Chris Metcalf wrote: > int lru_add_drain_all(void) > { > static struct cpumask mask; Instead of cpumask, > static DEFINE_MUTEX(lock); you can DEFINE_PER_CPU(struct work_struct, ...). > for_each_online_cpu(cpu) { > if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || > pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || > pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || > need_activate_page_drain(cpu)) > cpumask_set_cpu(cpu, &mask); and schedule the work items directly. > } > > rc = schedule_on_cpu_mask(lru_add_drain_per_cpu, &mask); Open coding flushing can be a bit bothersome but you can create a per-cpu workqueue and schedule work items on it and then flush the workqueue instead too. No matter how flushing is implemented, the path wouldn't have any memory allocation, which I thought was the topic of the thread, no? Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 20:31 ` Andrew Morton @ 2013-08-13 21:07 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 21:07 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 01:31:35PM -0700, Andrew Morton wrote: > > the logical thing to do > > would be pre-allocating per-cpu buffers instead of depending on > > dynamic allocation. Do the invocations need to be stackable? > > schedule_on_each_cpu() calls should if course happen concurrently, and > there's the question of whether we wish to permit async > schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until > everyone has finished is pretty sad if the caller doesn't want that. Oh, I meant the caller-side, not schedule_on_each_cpu(). So, if this particular caller is performance sensitive for some reason, it makes sense to pre-allocate resources on the caller side if the caller doesn't need to be reentrant or called concurrently. > I don't recall seeing such abuse. It's a very common and powerful > tool, and not implementing it because some dummy may abuse it weakens > the API for all non-dummies. That allocation is simply unneeded. More powerful and flexible doesn't always equal better and I think being simple and less prone to abuses are important characteristics that APIs should have. It feels a bit silly to me to push the API that way when doing so doesn't even solve the allocation problem. It doesn't really buy us much while making the interface more complex. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 21:07 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 21:07 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 01:31:35PM -0700, Andrew Morton wrote: > > the logical thing to do > > would be pre-allocating per-cpu buffers instead of depending on > > dynamic allocation. Do the invocations need to be stackable? > > schedule_on_each_cpu() calls should if course happen concurrently, and > there's the question of whether we wish to permit async > schedule_on_each_cpu(). Leaving the calling CPU twiddling thumbs until > everyone has finished is pretty sad if the caller doesn't want that. Oh, I meant the caller-side, not schedule_on_each_cpu(). So, if this particular caller is performance sensitive for some reason, it makes sense to pre-allocate resources on the caller side if the caller doesn't need to be reentrant or called concurrently. > I don't recall seeing such abuse. It's a very common and powerful > tool, and not implementing it because some dummy may abuse it weakens > the API for all non-dummies. That allocation is simply unneeded. More powerful and flexible doesn't always equal better and I think being simple and less prone to abuses are important characteristics that APIs should have. It feels a bit silly to me to push the API that way when doing so doesn't even solve the allocation problem. It doesn't really buy us much while making the interface more complex. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 21:07 ` Tejun Heo @ 2013-08-13 21:16 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 21:16 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 17:07:19 -0400 Tejun Heo <tj@kernel.org> wrote: > > I don't recall seeing such abuse. It's a very common and powerful > > tool, and not implementing it because some dummy may abuse it weakens > > the API for all non-dummies. That allocation is simply unneeded. > > More powerful and flexible doesn't always equal better and I think > being simple and less prone to abuses are important characteristics > that APIs should have. I've yet to see any evidence that callback APIs have been abused and I've yet to see any reasoning which makes me believe that this one will be abused. > It feels a bit silly to me to push the API > that way when doing so doesn't even solve the allocation problem. It removes the need to perform a cpumask allocation in lru_add_drain_all(). > It doesn't really buy us much while making the interface more complex. It's a superior interface. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 21:16 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 21:16 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 17:07:19 -0400 Tejun Heo <tj@kernel.org> wrote: > > I don't recall seeing such abuse. It's a very common and powerful > > tool, and not implementing it because some dummy may abuse it weakens > > the API for all non-dummies. That allocation is simply unneeded. > > More powerful and flexible doesn't always equal better and I think > being simple and less prone to abuses are important characteristics > that APIs should have. I've yet to see any evidence that callback APIs have been abused and I've yet to see any reasoning which makes me believe that this one will be abused. > It feels a bit silly to me to push the API > that way when doing so doesn't even solve the allocation problem. It removes the need to perform a cpumask allocation in lru_add_drain_all(). > It doesn't really buy us much while making the interface more complex. It's a superior interface. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 21:16 ` Andrew Morton @ 2013-08-13 22:07 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 22:07 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 02:16:21PM -0700, Andrew Morton wrote: > I've yet to see any evidence that callback APIs have been abused and > I've yet to see any reasoning which makes me believe that this one will > be abused. Well, off the top of my head. * In general, it's clunkier. Callbacks become artificial boundaries across which context has to be carried over explicitly. It often involves packing data into a temporary struct. The artificial barrier also generally makes the logic more difficult to follow. This is pretty general problem with callback based interface and why many programming languages / conventions prefer iterator style interface over callback based ones. It makes the code a lot easier to organize around the looping construct. Here, it isn't as pronounced because the thing naturally requires a callback anyway. * From the API itself, it often isn't clear what restrictions the context the callback is called under would have. It sure is partly documentation problem but is pretty easy to get wrong inadvertantly as the code evolves and can be difficult to spot as the context isn't apparent. Moving away from callbacks started with higher level languages but the kernel sure is on the boat too where possible. This one is muddier as the interface is async in nature but still it's at least partially applicable. > > It feels a bit silly to me to push the API > > that way when doing so doesn't even solve the allocation problem. > > It removes the need to perform a cpumask allocation in > lru_add_drain_all(). But that doesn't really solve anything, does it? 
> > It doesn't really buy us much while making the interface more complex. > > It's a superior interface. It is more flexible but at the same time clunkier. I wouldn't call it superior and the flexibility doesn't buy us much here. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:07 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 22:07 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, On Tue, Aug 13, 2013 at 02:16:21PM -0700, Andrew Morton wrote: > I've yet to see any evidence that callback APIs have been abused and > I've yet to see any reasoning which makes me believe that this one will > be abused. Well, off the top of my head. * In general, it's clunkier. Callbacks become artificial boundaries across which context has to be carried over explicitly. It often involves packing data into a temporary struct. The artificial barrier also generally makes the logic more difficult to follow. This is pretty general problem with callback based interface and why many programming languages / conventions prefer iterator style interface over callback based ones. It makes the code a lot easier to organize around the looping construct. Here, it isn't as pronounced because the thing naturally requires a callback anyway. * From the API itself, it often isn't clear what restrictions the context the callback is called under would have. It sure is partly documentation problem but is pretty easy to get wrong inadvertantly as the code evolves and can be difficult to spot as the context isn't apparent. Moving away from callbacks started with higher level languages but the kernel sure is on the boat too where possible. This one is muddier as the interface is async in nature but still it's at least partially applicable. > > It feels a bit silly to me to push the API > > that way when doing so doesn't even solve the allocation problem. > > It removes the need to perform a cpumask allocation in > lru_add_drain_all(). But that doesn't really solve anything, does it? > > It doesn't really buy us much while making the interface more complex. 
> > It's a superior interface. It is more flexible but at the same time clunkier. I wouldn't call it superior and the flexibility doesn't buy us much here. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:07 ` Tejun Heo @ 2013-08-13 22:18 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 22:18 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 18:07:00 -0400 Tejun Heo <tj@kernel.org> wrote: > Hello, > > On Tue, Aug 13, 2013 at 02:16:21PM -0700, Andrew Morton wrote: > > I've yet to see any evidence that callback APIs have been abused and > > I've yet to see any reasoning which makes me believe that this one will > > be abused. > > Well, off the top of my head. > > * In general, it's clunkier. Callbacks become artificial boundaries > across which context has to be carried over explicitly. It often > involves packing data into a temporary struct. The artificial > barrier also generally makes the logic more difficult to follow. > This is pretty general problem with callback based interface and why > many programming languages / conventions prefer iterator style > interface over callback based ones. It makes the code a lot easier > to organize around the looping construct. Here, it isn't as > pronounced because the thing naturally requires a callback anyway. > > * From the API itself, it often isn't clear what restrictions the > context the callback is called under would have. It sure is partly > documentation problem but is pretty easy to get wrong inadvertantly > as the code evolves and can be difficult to spot as the context > isn't apparent. > > Moving away from callbacks started with higher level languages but the > kernel sure is on the boat too where possible. This one is muddier as > the interface is async in nature but still it's at least partially > applicable. I don't buy it. The callback simply determines whether "we need to schuedule work on this cpu". It's utterly simple. Nobody will have trouble understanding or using such a thing. 
> > > It feels a bit silly to me to push the API > > > that way when doing so doesn't even solve the allocation problem. > > > > It removes the need to perform a cpumask allocation in > > lru_add_drain_all(). > > But that doesn't really solve anything, does it? It removes one memory allocation and initialisation per call. It removes an entire for_each_online_cpu() loop. > > > It doesn't really buy us much while making the interface more complex. > > > > It's a superior interface. > > It is more flexible but at the same time clunkier. The callback predicate is a quite natural thing in this case. > I wouldn't call it > superior and the flexibility doesn't buy us much here. It buys quite a lot and demonstrates why a callback interface is better. I really don't understand what's going on here. You're advocating for a weaker kernel interface and for inferior kernel runtime behaviour. Forcing callers to communicate their needs via a large, dynamically-allocated temporary rather than directly. And what do we get in return for all this? Some stuff about callbacks which frankly has me scratching my head. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:18 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 22:18 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 18:07:00 -0400 Tejun Heo <tj@kernel.org> wrote: > Hello, > > On Tue, Aug 13, 2013 at 02:16:21PM -0700, Andrew Morton wrote: > > I've yet to see any evidence that callback APIs have been abused and > > I've yet to see any reasoning which makes me believe that this one will > > be abused. > > Well, off the top of my head. > > * In general, it's clunkier. Callbacks become artificial boundaries > across which context has to be carried over explicitly. It often > involves packing data into a temporary struct. The artificial > barrier also generally makes the logic more difficult to follow. > This is pretty general problem with callback based interface and why > many programming languages / conventions prefer iterator style > interface over callback based ones. It makes the code a lot easier > to organize around the looping construct. Here, it isn't as > pronounced because the thing naturally requires a callback anyway. > > * From the API itself, it often isn't clear what restrictions the > context the callback is called under would have. It sure is partly > documentation problem but is pretty easy to get wrong inadvertantly > as the code evolves and can be difficult to spot as the context > isn't apparent. > > Moving away from callbacks started with higher level languages but the > kernel sure is on the boat too where possible. This one is muddier as > the interface is async in nature but still it's at least partially > applicable. I don't buy it. The callback simply determines whether "we need to schuedule work on this cpu". It's utterly simple. Nobody will have trouble understanding or using such a thing. 
> > > It feels a bit silly to me to push the API > > > that way when doing so doesn't even solve the allocation problem. > > > > It removes the need to perform a cpumask allocation in > > lru_add_drain_all(). > > But that doesn't really solve anything, does it? It removes one memory allocation and initialisation per call. It removes an entire for_each_online_cpu() loop. > > > It doesn't really buy us much while making the interface more complex. > > > > It's a superior interface. > > It is more flexible but at the same time clunkier. The callback predicate is a quite natural thing in this case. > I wouldn't call it > superior and the flexibility doesn't buy us much here. It buys quite a lot and demonstrates why a callback interface is better. I really don't understand what's going on here. You're advocating for a weaker kernel interface and for inferior kernel runtime behaviour. Forcing callers to communicate their needs via a large, dynamically-allocated temporary rather than directly. And what do we get in return for all this? Some stuff about callbacks which frankly has me scratching my head. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:18 ` Andrew Morton @ 2013-08-13 22:33 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 22:33 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Andrew. On Tue, Aug 13, 2013 at 03:18:05PM -0700, Andrew Morton wrote: > I don't buy it. The callback simply determines whether "we need to > schuedule work on this cpu". It's utterly simple. Nobody will have > trouble understanding or using such a thing. Well, I don't buy that either. Callback based interface has its issues. The difference we're talking about here is pretty minute but then again the improvement brought on by the callback is pretty minute too. > It removes one memory allocation and initialisation per call. It > removes an entire for_each_online_cpu() loop. But that doesn't solve the original problem at all and while it removes the loop, it also adds a separate function. > I really don't understand what's going on here. You're advocating for > a weaker kernel interface and for inferior kernel runtime behaviour. > Forcing callers to communicate their needs via a large, > dynamically-allocated temporary rather than directly. And what do we > get in return for all this? Some stuff about callbacks which frankly > has me scratching my head. Well, it is a fairly heavy path and you're pushing for an optimization which won't make any noticeable difference at all. And, yes, I do think we need to stick to simpler APIs whereever possible. Sure the difference is minute here but the addition of test callback doesn't buy us anything either, so what's the point? The allocation doesn't even exist for vast majority of configurations. If kmalloc from that site is problematic, the right thing to do is pre-allocating resources on the caller side, isn't it? Thanks. 
-- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:33 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 22:33 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Andrew. On Tue, Aug 13, 2013 at 03:18:05PM -0700, Andrew Morton wrote: > I don't buy it. The callback simply determines whether "we need to > schuedule work on this cpu". It's utterly simple. Nobody will have > trouble understanding or using such a thing. Well, I don't buy that either. Callback based interface has its issues. The difference we're talking about here is pretty minute but then again the improvement brought on by the callback is pretty minute too. > It removes one memory allocation and initialisation per call. It > removes an entire for_each_online_cpu() loop. But that doesn't solve the original problem at all and while it removes the loop, it also adds a separate function. > I really don't understand what's going on here. You're advocating for > a weaker kernel interface and for inferior kernel runtime behaviour. > Forcing callers to communicate their needs via a large, > dynamically-allocated temporary rather than directly. And what do we > get in return for all this? Some stuff about callbacks which frankly > has me scratching my head. Well, it is a fairly heavy path and you're pushing for an optimization which won't make any noticeable difference at all. And, yes, I do think we need to stick to simpler APIs whereever possible. Sure the difference is minute here but the addition of test callback doesn't buy us anything either, so what's the point? The allocation doesn't even exist for vast majority of configurations. If kmalloc from that site is problematic, the right thing to do is pre-allocating resources on the caller side, isn't it? Thanks. 
-- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:33 ` Tejun Heo @ 2013-08-13 22:47 ` Andrew Morton -1 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 22:47 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 18:33:04 -0400 Tejun Heo <tj@kernel.org> wrote: > Hello, Andrew. > > On Tue, Aug 13, 2013 at 03:18:05PM -0700, Andrew Morton wrote: > > I don't buy it. The callback simply determines whether "we need to > > schuedule work on this cpu". It's utterly simple. Nobody will have > > trouble understanding or using such a thing. > > Well, I don't buy that either. Callback based interface has its > issues. No it hasn't. It's a common and simple technique which we all understand. > The difference we're talking about here is pretty minute but > then again the improvement brought on by the callback is pretty minute > too. It's a relatively small improvement in the lru_add_drain_all() case. Other callsites can gain improvements as well. > > It removes one memory allocation and initialisation per call. It > > removes an entire for_each_online_cpu() loop. > > But that doesn't solve the original problem at all and while it > removes the loop, it also adds a separate function. It results in superior runtime code. At this and potentially other callsites. > > I really don't understand what's going on here. You're advocating for > > a weaker kernel interface and for inferior kernel runtime behaviour. > > Forcing callers to communicate their needs via a large, > > dynamically-allocated temporary rather than directly. And what do we > > get in return for all this? Some stuff about callbacks which frankly > > has me scratching my head. > > Well, it is a fairly heavy path and you're pushing for an optimization > which won't make any noticeable difference at all. 
And, yes, I do > think we need to stick to simpler APIs wherever possible. Sure the > difference is minute here but the addition of test callback doesn't > buy us anything either, so what's the point? It does buy us things, as I've repeatedly described. You keep on saying things which demonstrably aren't so. I think I'll give up now. ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 22:47 ` Andrew Morton 0 siblings, 0 replies; 104+ messages in thread From: Andrew Morton @ 2013-08-13 22:47 UTC (permalink / raw) To: Tejun Heo Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer On Tue, 13 Aug 2013 18:33:04 -0400 Tejun Heo <tj@kernel.org> wrote: > Hello, Andrew. > > On Tue, Aug 13, 2013 at 03:18:05PM -0700, Andrew Morton wrote: > > I don't buy it. The callback simply determines whether "we need to > > schuedule work on this cpu". It's utterly simple. Nobody will have > > trouble understanding or using such a thing. > > Well, I don't buy that either. Callback based interface has its > issues. No it hasn't. It's a common and simple technique which we all understand. > The difference we're talking about here is pretty minute but > then again the improvement brought on by the callback is pretty minute > too. It's a relatively small improvement in the lru_add_drain_all() case. Other callsites can gain improvements as well. > > It removes one memory allocation and initialisation per call. It > > removes an entire for_each_online_cpu() loop. > > But that doesn't solve the original problem at all and while it > removes the loop, it also adds a separate function. It results in superior runtime code. At this and potentially other callsites. > > I really don't understand what's going on here. You're advocating for > > a weaker kernel interface and for inferior kernel runtime behaviour. > > Forcing callers to communicate their needs via a large, > > dynamically-allocated temporary rather than directly. And what do we > > get in return for all this? Some stuff about callbacks which frankly > > has me scratching my head. > > Well, it is a fairly heavy path and you're pushing for an optimization > which won't make any noticeable difference at all. And, yes, I do > think we need to stick to simpler APIs whereever possible. 
Sure the > difference is minute here but the addition of test callback doesn't > buy us anything either, so what's the point? It does buy us things, as I've repeatedly described. You keep on saying things which demonstrably aren't so. I think I'll give up now. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective 2013-08-13 22:47 ` Andrew Morton @ 2013-08-13 23:03 ` Tejun Heo -1 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 23:03 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Andrew. On Tue, Aug 13, 2013 at 03:47:40PM -0700, Andrew Morton wrote: > > Well, I don't buy that either. Callback based interface has its > > issues. > > No it hasn't. It's a common and simple technique which we all understand. It sure has its uses but it has receded some of its former use cases to better constructs which are easier to use and maintain. I'm not saying it's black and white here as the thing is callback based anyway but was trying to point out general disadvantages of callback based interface. If you're saying callback based interface isn't clunkier compared to constructs which can be embedded in the caller side, this discussion probably won't be very fruitful. > It's a relatively small improvement in the lru_add_drain_all() case. > Other callsites can gain improvements as well. Well, if we're talking about minute performance differences, for non-huge configurations, it'll actually be a small performance degradation as there will be more instructions and the control will be jumping back and forth. > It results in superior runtime code. At this and potentially other > callsites. It's actually inferior in majority of cases. > It does buy us things, as I've repeatedly described. You keep on > saying things which demonstrably aren't so. I think I'll give up now. I just don't think it's something clear cut and it doesn't even matter for the problem at hand. Let's please talk about how to solve the actual problem. Thanks. -- tejun ^ permalink raw reply [flat|nested] 104+ messages in thread
* Re: [PATCH v4 2/2] mm: make lru_add_drain_all() selective @ 2013-08-13 23:03 ` Tejun Heo 0 siblings, 0 replies; 104+ messages in thread From: Tejun Heo @ 2013-08-13 23:03 UTC (permalink / raw) To: Andrew Morton Cc: Chris Metcalf, linux-kernel, linux-mm, Thomas Gleixner, Frederic Weisbecker, Cody P Schafer Hello, Andrew. On Tue, Aug 13, 2013 at 03:47:40PM -0700, Andrew Morton wrote: > > Well, I don't buy that either. Callback based interface has its > > issues. > > No it hasn't. It's a common and simple technique which we all understand. It sure has its uses but it has receded some of its former use cases to better constructs which are easier to use and maintain. I'm not saying it's black and white here as the thing is callback based anyway but was trying to point out general disadvantages of callback based interface. If you're saying callback based interface isn't clunkier compared to constructs which can be embedded in the caller side, this discussion probably won't be very fruitful. > It's a relatively small improvement in the lru_add_drain_all() case. > Other callsites can gain improvements as well. Well, if we're talking about minute performance differences, for non-huge configurations, it'll actually be a small performance degradation as there will be more instructions and the control will be jumping back and forth. > It results in superior runtime code. At this and potentially other > callsites. It's actually inferior in majority of cases. > It does buy us things, as I've repeatedly described. You keep on > saying things which demonstrably aren't so. I think I'll give up now. I just don't think it's something clear cut and it doesn't even matter for the problem at hand. Let's please talk about how to solve the actual problem. Thanks. -- tejun -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@kvack.org">email@kvack.org</a> ^ permalink raw reply [flat|nested] 104+ messages in thread
end of thread, other threads:[~2013-08-14 21:23 UTC | newest] Thread overview: 104+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2013-08-06 20:22 [PATCH] mm: make lru_add_drain_all() selective Chris Metcalf 2013-08-06 20:22 ` Chris Metcalf 2013-08-06 20:22 ` [PATCH v2] " Chris Metcalf 2013-08-06 20:22 ` Chris Metcalf 2013-08-07 20:45 ` Tejun Heo 2013-08-07 20:45 ` Tejun Heo 2013-08-07 20:49 ` [PATCH v3 1/2] workqueue: add new schedule_on_cpu_mask() API Chris Metcalf 2013-08-07 20:49 ` Chris Metcalf 2013-08-07 20:52 ` [PATCH v3 2/2] mm: make lru_add_drain_all() selective Chris Metcalf 2013-08-07 20:52 ` Chris Metcalf 2013-08-07 22:48 ` [PATCH v2] " Cody P Schafer 2013-08-07 22:48 ` Cody P Schafer 2013-08-07 20:49 ` [PATCH v4 1/2] workqueue: add new schedule_on_cpu_mask() API Chris Metcalf 2013-08-07 20:49 ` Chris Metcalf 2013-08-09 15:02 ` Tejun Heo 2013-08-09 15:02 ` Tejun Heo 2013-08-09 16:12 ` Chris Metcalf 2013-08-09 16:12 ` Chris Metcalf 2013-08-09 16:30 ` Tejun Heo 2013-08-09 16:30 ` Tejun Heo 2013-08-07 20:49 ` [PATCH v5 " Chris Metcalf 2013-08-07 20:49 ` Chris Metcalf 2013-08-09 17:40 ` Tejun Heo 2013-08-09 17:40 ` Tejun Heo 2013-08-09 17:49 ` [PATCH v6 " Chris Metcalf 2013-08-09 17:49 ` Chris Metcalf 2013-08-09 17:52 ` [PATCH v6 2/2] mm: make lru_add_drain_all() selective Chris Metcalf 2013-08-09 17:52 ` Chris Metcalf 2013-08-07 20:52 ` [PATCH v5 " Chris Metcalf 2013-08-07 20:52 ` Chris Metcalf 2013-08-07 20:52 ` [PATCH v4 " Chris Metcalf 2013-08-07 20:52 ` Chris Metcalf 2013-08-12 21:05 ` Andrew Morton 2013-08-12 21:05 ` Andrew Morton 2013-08-13 1:53 ` Chris Metcalf 2013-08-13 1:53 ` Chris Metcalf 2013-08-13 19:35 ` Andrew Morton 2013-08-13 19:35 ` Andrew Morton 2013-08-13 20:19 ` Tejun Heo 2013-08-13 20:19 ` Tejun Heo 2013-08-13 20:31 ` Andrew Morton 2013-08-13 20:31 ` Andrew Morton 2013-08-13 20:59 ` Chris Metcalf 2013-08-13 20:59 ` Chris Metcalf 2013-08-13 21:13 ` Andrew Morton 2013-08-13 21:13 ` 
Andrew Morton 2013-08-13 22:13 ` Chris Metcalf 2013-08-13 22:13 ` Chris Metcalf 2013-08-13 22:26 ` Andrew Morton 2013-08-13 22:26 ` Andrew Morton 2013-08-13 23:04 ` Chris Metcalf 2013-08-13 23:04 ` Chris Metcalf 2013-08-13 22:51 ` [PATCH v7 1/2] workqueue: add schedule_on_each_cpu_cond Chris Metcalf 2013-08-13 22:51 ` Chris Metcalf 2013-08-13 22:53 ` [PATCH v7 2/2] mm: make lru_add_drain_all() selective Chris Metcalf 2013-08-13 22:53 ` Chris Metcalf 2013-08-13 23:29 ` Tejun Heo 2013-08-13 23:29 ` Tejun Heo 2013-08-13 23:32 ` Chris Metcalf 2013-08-13 23:32 ` Chris Metcalf 2013-08-14 6:46 ` Andrew Morton 2013-08-14 6:46 ` Andrew Morton 2013-08-14 13:05 ` Tejun Heo 2013-08-14 13:05 ` Tejun Heo 2013-08-14 16:03 ` Chris Metcalf 2013-08-14 16:03 ` Chris Metcalf 2013-08-14 16:57 ` Tejun Heo 2013-08-14 16:57 ` Tejun Heo 2013-08-14 17:18 ` Chris Metcalf 2013-08-14 17:18 ` Chris Metcalf 2013-08-14 20:07 ` Tejun Heo 2013-08-14 20:07 ` Tejun Heo 2013-08-14 20:22 ` [PATCH v8] " Chris Metcalf 2013-08-14 20:22 ` Chris Metcalf 2013-08-14 20:44 ` Andrew Morton 2013-08-14 20:44 ` Andrew Morton 2013-08-14 20:50 ` Tejun Heo 2013-08-14 20:50 ` Tejun Heo 2013-08-14 21:03 ` Andrew Morton 2013-08-14 21:03 ` Andrew Morton 2013-08-14 21:07 ` Andrew Morton 2013-08-14 21:07 ` Andrew Morton 2013-08-14 21:12 ` Andrew Morton 2013-08-14 21:12 ` Andrew Morton 2013-08-14 21:23 ` Chris Metcalf 2013-08-14 21:23 ` Chris Metcalf 2013-08-13 23:44 ` [PATCH v7 2/2] " Chris Metcalf 2013-08-13 23:44 ` Chris Metcalf 2013-08-13 23:51 ` Tejun Heo 2013-08-13 23:51 ` Tejun Heo 2013-08-13 21:07 ` [PATCH v4 " Tejun Heo 2013-08-13 21:07 ` Tejun Heo 2013-08-13 21:16 ` Andrew Morton 2013-08-13 21:16 ` Andrew Morton 2013-08-13 22:07 ` Tejun Heo 2013-08-13 22:07 ` Tejun Heo 2013-08-13 22:18 ` Andrew Morton 2013-08-13 22:18 ` Andrew Morton 2013-08-13 22:33 ` Tejun Heo 2013-08-13 22:33 ` Tejun Heo 2013-08-13 22:47 ` Andrew Morton 2013-08-13 22:47 ` Andrew Morton 2013-08-13 23:03 ` Tejun Heo 2013-08-13 23:03 ` Tejun Heo
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.