When free pages are done with higher order, time spend on coalescing pages by buddy allocator can be reduced. With section size of 256MB, hot add latency of a single section shows improvement from 50-60 ms to less than 1 ms, hence improving the hot add latency by 60%. Modify external providers of online callback to align with the change. Signed-off-by: Arun KS <arunks@codeaurora.org> --- Changes since v4: - As suggested by Michal Hocko, - Simplify logic in online_pages_block() by using get_order(). - Seperate out removal of prefetch from __free_pages_core(). Changes since v3: - Renamed _free_pages_boot_core -> __free_pages_core. - Removed prefetch from __free_pages_core. - Removed xen_online_page(). Changes since v2: - Reuse code from __free_pages_boot_core(). Changes since v1: - Removed prefetch(). Changes since RFC: - Rebase. - As suggested by Michal Hocko remove pages_per_block. - Modifed external providers of online_page_callback. v4: https://lore.kernel.org/patchwork/patch/995111/ v3: https://lore.kernel.org/patchwork/patch/992348/ v2: https://lore.kernel.org/patchwork/patch/991363/ v1: https://lore.kernel.org/patchwork/patch/989445/ RFC: https://lore.kernel.org/patchwork/patch/984754/ --- drivers/hv/hv_balloon.c | 6 ++++-- drivers/xen/balloon.c | 23 +++++++++++++++-------- include/linux/memory_hotplug.h | 2 +- mm/internal.h | 1 + mm/memory_hotplug.c | 42 ++++++++++++++++++++++++++++++------------ mm/page_alloc.c | 8 ++++---- 6 files changed, 55 insertions(+), 27 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index b1b7880..c5bc0b5 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -771,7 +771,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, } } -static void hv_online_page(struct page *pg) +static int hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; unsigned long flags; @@ -783,10 +783,12 @@ static void hv_online_page(struct page *pg) if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) continue; - hv_page_online_one(has, pg); + hv_bring_pgs_online(has, pfn, (1UL << order)); break; } spin_unlock_irqrestore(&dm_device.ha_lock, flags); + + return 0; } static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index e12bb25..58ddf48 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -390,8 +390,8 @@ static enum bp_state reserve_additional_memory(void) /* * add_memory_resource() will call online_pages() which in its turn - * will call xen_online_page() callback causing deadlock if we don't - * release balloon_mutex here. Unlocking here is safe because the + * will call xen_bring_pgs_online() callback causing deadlock if we + * don't release balloon_mutex here. Unlocking here is safe because the * callers drop the mutex before trying again. */ mutex_unlock(&balloon_mutex); @@ -411,15 +411,22 @@ static enum bp_state reserve_additional_memory(void) return BP_ECANCELED; } -static void xen_online_page(struct page *page) +static int xen_bring_pgs_online(struct page *pg, unsigned int order) { - __online_page_set_limits(page); + unsigned long i, size = (1 << order); + unsigned long start_pfn = page_to_pfn(pg); + struct page *p; + pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); mutex_lock(&balloon_mutex); - - __balloon_append(page); - + for (i = 0; i < size; i++) { + p = pfn_to_page(start_pfn + i); + __online_page_set_limits(p); + __balloon_append(p); + } mutex_unlock(&balloon_mutex); + + return 0; } static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) @@ -744,7 +751,7 @@ static int __init balloon_init(void) balloon_stats.max_retry_count = RETRY_UNLIMITED; #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG - set_online_page_callback(&xen_online_page); + set_online_page_callback(&xen_bring_pgs_online); register_memory_notifier(&xen_memory_nb); register_sysctl_table(xen_root); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 34a2822..7b04c1d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -87,7 +87,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); -typedef void (*online_page_callback_t)(struct page *page); +typedef int (*online_page_callback_t)(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); diff --git a/mm/internal.h b/mm/internal.h index 87256ae..636679c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); +extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 38d94b7..e379e85 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -47,7 +47,7 @@ * and restore_online_page_callback() for generic callback restore. */ -static void generic_online_page(struct page *page); +static int generic_online_page(struct page *page, unsigned int order); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page) +static int generic_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); + __free_pages_core(page, order); + totalram_pages += (1UL << order); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += (1UL << order); +#endif + return 0; +} + +static int online_pages_blocks(unsigned long start, unsigned long nr_pages) +{ + unsigned long end = start + nr_pages; + int order, ret, onlined_pages = 0; + + while (start < end) { + order = min(MAX_ORDER - 1, + get_order(PFN_PHYS(end) - PFN_PHYS(start))); + + ret = (*online_page_callback)(pfn_to_page(start), order); + if (!ret) + onlined_pages += (1UL << order); + else if (ret > 0) + onlined_pages += ret; + + start += (1UL << order); + } + return onlined_pages; } static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { - unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; - struct page *page; if (PageReserved(pfn_to_page(start_pfn))) - for (i = 0; i < nr_pages; i++) { - page = pfn_to_page(start_pfn + i); - (*online_page_callback)(page); - onlined_pages++; - } + onlined_pages = online_pages_blocks(start_pfn, nr_pages); online_mem_sections(start_pfn, start_pfn + nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d2a2a..7ab5274 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1252,7 +1252,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, unsigned int order) +void __free_pages_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1331,7 +1331,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, order); + return __free_pages_core(page, order); } /* @@ -1421,14 +1421,14 @@ static void __init deferred_free_range(unsigned long pfn, if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pageblock_order); + __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { if ((pfn & (pageblock_nr_pages - 1)) == 0) set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, 0); + __free_pages_core(page, 0); } } -- 1.9.1
They not only increase the code footprint, they actually make things slower rather than faster. Remove them as contemporary hardware doesn't need any hint. Suggested-by: Dan Williams <dan.j.williams@intel.com> Signed-off-by: Arun KS <arunks@codeaurora.org> --- mm/page_alloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ab5274..90db431 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1258,14 +1258,10 @@ void __free_pages_core(struct page *page, unsigned int order) struct page *p = page; unsigned int loop; - prefetchw(p); - for (loop = 0; loop < (nr_pages - 1); loop++, p++) { - prefetchw(p + 1); + for (loop = 0; loop < nr_pages ; loop++, p++) { __ClearPageReserved(p); set_page_count(p, 0); } - __ClearPageReserved(p); - set_page_count(p, 0); page_zone(page)->managed_pages += nr_pages; set_page_refcounted(page); -- 1.9.1
On Fri, Oct 05, 2018 at 01:40:05PM +0530, Arun KS wrote: > When free pages are done with higher order, time spend on > coalescing pages by buddy allocator can be reduced. With > section size of 256MB, hot add latency of a single section > shows improvement from 50-60 ms to less than 1 ms, hence > improving the hot add latency by 60%. Modify external > providers of online callback to align with the change. > > Signed-off-by: Arun KS <arunks@codeaurora.org> Looks good to me. Reviewed-by: Oscar Salvador <osalvador@suse.de> Just one thing below: > @@ -1331,7 +1331,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, > { > if (early_page_uninitialised(pfn)) > return; > - return __free_pages_boot_core(page, order); > + return __free_pages_core(page, order); __free_pages_core is void, so I guess we do not need that return there. Probably the code generated is the same though. -- Oscar Salvador SUSE L3
On Fri 05-10-18 13:40:05, Arun KS wrote:
> When free pages are done with higher order, time spend on
> coalescing pages by buddy allocator can be reduced. With
> section size of 256MB, hot add latency of a single section
> shows improvement from 50-60 ms to less than 1 ms, hence
> improving the hot add latency by 60%. Modify external
> providers of online callback to align with the change.
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks for your patience with all the resubmission.
--
Michal Hocko
SUSE Labs
On Fri 05-10-18 13:40:06, Arun KS wrote: > They not only increase the code footprint, they actually make things > slower rather than faster. Remove them as contemporary hardware doesn't > need any hint. I agree with the change but it is much better to add some numbers whenever arguing about performance impact. > > Suggested-by: Dan Williams <dan.j.williams@intel.com> > Signed-off-by: Arun KS <arunks@codeaurora.org> > --- > mm/page_alloc.c | 6 +----- > 1 file changed, 1 insertion(+), 5 deletions(-) > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 7ab5274..90db431 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1258,14 +1258,10 @@ void __free_pages_core(struct page *page, unsigned int order) > struct page *p = page; > unsigned int loop; > > - prefetchw(p); > - for (loop = 0; loop < (nr_pages - 1); loop++, p++) { > - prefetchw(p + 1); > + for (loop = 0; loop < nr_pages ; loop++, p++) { > __ClearPageReserved(p); > set_page_count(p, 0); > } > - __ClearPageReserved(p); > - set_page_count(p, 0); > > page_zone(page)->managed_pages += nr_pages; > set_page_refcounted(page); > -- > 1.9.1 > -- Michal Hocko SUSE Labs
On 2018-10-09 14:59, Michal Hocko wrote: > On Fri 05-10-18 13:40:05, Arun KS wrote: >> When free pages are done with higher order, time spend on >> coalescing pages by buddy allocator can be reduced. With >> section size of 256MB, hot add latency of a single section >> shows improvement from 50-60 ms to less than 1 ms, hence >> improving the hot add latency by 60%. Modify external >> providers of online callback to align with the change. > > Acked-by: Michal Hocko <mhocko@suse.com> > > Thanks for your patience with all the resubmission. Hello Michal, I got the below email few days back. Do I still need to resubmit the patch? or is it already on the way to merge? Regards, Arun The patch titled Subject: mm/page_alloc.c: memory hotplug: free pages as higher order has been added to the -mm tree. Its filename is memory_hotplug-free-pages-as-higher-order.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/memory_hotplug-free-pages-as-higher-order.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/memory_hotplug-free-pages-as-higher-order.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days
On Tue 09-10-18 15:24:13, Arun KS wrote:
> On 2018-10-09 14:59, Michal Hocko wrote:
> > On Fri 05-10-18 13:40:05, Arun KS wrote:
> > > When free pages are done with higher order, time spend on
> > > coalescing pages by buddy allocator can be reduced. With
> > > section size of 256MB, hot add latency of a single section
> > > shows improvement from 50-60 ms to less than 1 ms, hence
> > > improving the hot add latency by 60%. Modify external
> > > providers of online callback to align with the change.
> >
> > Acked-by: Michal Hocko <mhocko@suse.com>
> >
> > Thanks for your patience with all the resubmission.
>
> Hello Michal,
>
> I got the below email few days back. Do I still need to resubmit the patch?
> or is it already on the way to merge?
It is sitting in the queue for now. Andrew collects acks and
reviewed-bys and add them to the patch.
--
Michal Hocko
SUSE Labs
On Fri, Oct 05, 2018 at 01:40:05PM +0530, Arun KS wrote:
> When free pages are done with higher order, time spend on
> coalescing pages by buddy allocator can be reduced. With
> section size of 256MB, hot add latency of a single section
> shows improvement from 50-60 ms to less than 1 ms, hence
> improving the hot add latency by 60%. Modify external
> providers of online callback to align with the change.
Hi Arun, out of curiosity:
could you please explain how exactly did you mesure the speed
improvement?
Thanks
--
Oscar Salvador
SUSE L3
On 2018-10-10 13:37, Oscar Salvador wrote: > On Fri, Oct 05, 2018 at 01:40:05PM +0530, Arun KS wrote: >> When free pages are done with higher order, time spend on >> coalescing pages by buddy allocator can be reduced. With >> section size of 256MB, hot add latency of a single section >> shows improvement from 50-60 ms to less than 1 ms, hence >> improving the hot add latency by 60%. Modify external >> providers of online callback to align with the change. > > Hi Arun, out of curiosity: > > could you please explain how exactly did you mesure the speed > improvement? diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e379e85..2416136 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -690,9 +690,13 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) { unsigned long onlined_pages = *(unsigned long *)arg; + u64 t1, t2; + t1 = local_clock(); if (PageReserved(pfn_to_page(start_pfn))) onlined_pages = online_pages_blocks(start_pfn, nr_pages); + t2 = local_clock(); + trace_printk("time spend = %llu us\n", (t2-t1)/(1000)); online_mem_sections(start_pfn, start_pfn + nr_pages); Regards, Arun > > Thanks
On Wed, Oct 10, 2018 at 04:21:16PM +0530, Arun KS wrote:
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index e379e85..2416136 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -690,9 +690,13 @@ static int online_pages_range(unsigned long start_pfn,
> unsigned long nr_pages,
> void *arg)
> {
> unsigned long onlined_pages = *(unsigned long *)arg;
> + u64 t1, t2;
>
> + t1 = local_clock();
> if (PageReserved(pfn_to_page(start_pfn)))
> onlined_pages = online_pages_blocks(start_pfn, nr_pages);
> + t2 = local_clock();
> + trace_printk("time spend = %llu us\n", (t2-t1)/(1000));
>
> online_mem_sections(start_pfn, start_pfn + nr_pages);
Thanks ;-)
--
Oscar Salvador
SUSE L3
On 10/5/18 10:10 AM, Arun KS wrote: > When free pages are done with higher order, time spend on > coalescing pages by buddy allocator can be reduced. With > section size of 256MB, hot add latency of a single section > shows improvement from 50-60 ms to less than 1 ms, hence > improving the hot add latency by 60%. Modify external > providers of online callback to align with the change. > > Signed-off-by: Arun KS <arunks@codeaurora.org> [...] > @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) > } > EXPORT_SYMBOL_GPL(__online_page_free); > > -static void generic_online_page(struct page *page) > +static int generic_online_page(struct page *page, unsigned int order) > { > - __online_page_set_limits(page); This is now not called anymore, although the xen/hv variants still do it. The function seems empty these days, maybe remove it as a followup cleanup? > - __online_page_increment_counters(page); > - __online_page_free(page); > + __free_pages_core(page, order); > + totalram_pages += (1UL << order); > +#ifdef CONFIG_HIGHMEM > + if (PageHighMem(page)) > + totalhigh_pages += (1UL << order); > +#endif __online_page_increment_counters() would have used adjust_managed_page_count() which would do the changes under managed_page_count_lock. Are we safe without the lock? If yes, there should perhaps be a comment explaining why.
On 10/5/18 10:10 AM, Arun KS wrote: > They not only increase the code footprint, they actually make things > slower rather than faster. Remove them as contemporary hardware doesn't > need any hint. > > Suggested-by: Dan Williams <dan.j.williams@intel.com> > Signed-off-by: Arun KS <arunks@codeaurora.org> Yeah, a tight loop with fixed stride is a trivial case for hw prefetcher. Acked-by: Vlastimil Babka <vbabka@suse.cz> > --- > mm/page_alloc.c | 6 +----- > 1 file changed, 1 insertion(+), 5 deletions(-) > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index 7ab5274..90db431 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -1258,14 +1258,10 @@ void __free_pages_core(struct page *page, unsigned int order) > struct page *p = page; > unsigned int loop; > > - prefetchw(p); > - for (loop = 0; loop < (nr_pages - 1); loop++, p++) { > - prefetchw(p + 1); > + for (loop = 0; loop < nr_pages ; loop++, p++) { > __ClearPageReserved(p); > set_page_count(p, 0); > } > - __ClearPageReserved(p); > - set_page_count(p, 0); > > page_zone(page)->managed_pages += nr_pages; > set_page_refcounted(page); >
On 2018-10-10 21:00, Vlastimil Babka wrote: > On 10/5/18 10:10 AM, Arun KS wrote: >> When free pages are done with higher order, time spend on >> coalescing pages by buddy allocator can be reduced. With >> section size of 256MB, hot add latency of a single section >> shows improvement from 50-60 ms to less than 1 ms, hence >> improving the hot add latency by 60%. Modify external >> providers of online callback to align with the change. >> >> Signed-off-by: Arun KS <arunks@codeaurora.org> > > [...] > >> @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) >> } >> EXPORT_SYMBOL_GPL(__online_page_free); >> >> -static void generic_online_page(struct page *page) >> +static int generic_online_page(struct page *page, unsigned int order) >> { >> - __online_page_set_limits(page); > > This is now not called anymore, although the xen/hv variants still do > it. The function seems empty these days, maybe remove it as a followup > cleanup? > >> - __online_page_increment_counters(page); >> - __online_page_free(page); >> + __free_pages_core(page, order); >> + totalram_pages += (1UL << order); >> +#ifdef CONFIG_HIGHMEM >> + if (PageHighMem(page)) >> + totalhigh_pages += (1UL << order); >> +#endif > > __online_page_increment_counters() would have used > adjust_managed_page_count() which would do the changes under > managed_page_count_lock. Are we safe without the lock? If yes, there > should perhaps be a comment explaining why. Looks unsafe without managed_page_count_lock. I think better have a similar implementation of free_boot_core() in memory_hotplug.c like we had in version 1 of patch. And use adjust_managed_page_count() instead of page_zone(page)->managed_pages += nr_pages; https://lore.kernel.org/patchwork/patch/989445/ -static void generic_online_page(struct page *page) +static int generic_online_page(struct page *page, unsigned int order) { - __online_page_set_limits(page); - __online_page_increment_counters(page); - __online_page_free(page); + unsigned long nr_pages = 1 << order; + struct page *p = page; + + for (loop = 0 ; loop < nr_pages ; loop++, p++) { + __ClearPageReserved(p); + set_page_count(p, 0); + } + + adjust_managed_page_count(page, nr_pages); + set_page_refcounted(page); + __free_pages(page, order); + + return 0; +} Regards, Arun
On Wed 10-10-18 22:26:41, Arun KS wrote:
> On 2018-10-10 21:00, Vlastimil Babka wrote:
> > On 10/5/18 10:10 AM, Arun KS wrote:
> > > When free pages are done with higher order, time spend on
> > > coalescing pages by buddy allocator can be reduced. With
> > > section size of 256MB, hot add latency of a single section
> > > shows improvement from 50-60 ms to less than 1 ms, hence
> > > improving the hot add latency by 60%. Modify external
> > > providers of online callback to align with the change.
> > >
> > > Signed-off-by: Arun KS <arunks@codeaurora.org>
> >
> > [...]
> >
> > > @@ -655,26 +655,44 @@ void __online_page_free(struct page *page)
> > > }
> > > EXPORT_SYMBOL_GPL(__online_page_free);
> > >
> > > -static void generic_online_page(struct page *page)
> > > +static int generic_online_page(struct page *page, unsigned int order)
> > > {
> > > - __online_page_set_limits(page);
> >
> > This is now not called anymore, although the xen/hv variants still do
> > it. The function seems empty these days, maybe remove it as a followup
> > cleanup?
> >
> > > - __online_page_increment_counters(page);
> > > - __online_page_free(page);
> > > + __free_pages_core(page, order);
> > > + totalram_pages += (1UL << order);
> > > +#ifdef CONFIG_HIGHMEM
> > > + if (PageHighMem(page))
> > > + totalhigh_pages += (1UL << order);
> > > +#endif
> >
> > __online_page_increment_counters() would have used
> > adjust_managed_page_count() which would do the changes under
> > managed_page_count_lock. Are we safe without the lock? If yes, there
> > should perhaps be a comment explaining why.
>
> Looks unsafe without managed_page_count_lock.
Why does it matter actually? We cannot online/offline memory in
parallel. This is not the case for the boot where we initialize memory
in parallel on multiple nodes. So this seems to be safe currently unless
I am missing something. A comment explaining that would be helpful
though.
--
Michal Hocko
SUSE Labs
On 2018-10-10 23:03, Michal Hocko wrote:
> On Wed 10-10-18 22:26:41, Arun KS wrote:
>> On 2018-10-10 21:00, Vlastimil Babka wrote:
>> > On 10/5/18 10:10 AM, Arun KS wrote:
>> > > When free pages are done with higher order, time spend on
>> > > coalescing pages by buddy allocator can be reduced. With
>> > > section size of 256MB, hot add latency of a single section
>> > > shows improvement from 50-60 ms to less than 1 ms, hence
>> > > improving the hot add latency by 60%. Modify external
>> > > providers of online callback to align with the change.
>> > >
>> > > Signed-off-by: Arun KS <arunks@codeaurora.org>
>> >
>> > [...]
>> >
>> > > @@ -655,26 +655,44 @@ void __online_page_free(struct page *page)
>> > > }
>> > > EXPORT_SYMBOL_GPL(__online_page_free);
>> > >
>> > > -static void generic_online_page(struct page *page)
>> > > +static int generic_online_page(struct page *page, unsigned int order)
>> > > {
>> > > - __online_page_set_limits(page);
>> >
>> > This is now not called anymore, although the xen/hv variants still do
>> > it. The function seems empty these days, maybe remove it as a followup
>> > cleanup?
>> >
>> > > - __online_page_increment_counters(page);
>> > > - __online_page_free(page);
>> > > + __free_pages_core(page, order);
>> > > + totalram_pages += (1UL << order);
>> > > +#ifdef CONFIG_HIGHMEM
>> > > + if (PageHighMem(page))
>> > > + totalhigh_pages += (1UL << order);
>> > > +#endif
>> >
>> > __online_page_increment_counters() would have used
>> > adjust_managed_page_count() which would do the changes under
>> > managed_page_count_lock. Are we safe without the lock? If yes, there
>> > should perhaps be a comment explaining why.
>>
>> Looks unsafe without managed_page_count_lock.
>
> Why does it matter actually? We cannot online/offline memory in
> parallel. This is not the case for the boot where we initialize memory
> in parallel on multiple nodes. So this seems to be safe currently
> unless
> I am missing something. A comment explaining that would be helpful
> though.
Other main callers of adjust_manage_page_count(),
static inline void free_reserved_page(struct page *page)
{
__free_reserved_page(page);
adjust_managed_page_count(page, 1);
}
static inline void mark_page_reserved(struct page *page)
{
SetPageReserved(page);
adjust_managed_page_count(page, -1);
}
Won't they race with memory hotplug?
Few more,
./drivers/xen/balloon.c:519: adjust_managed_page_count(page,
-1);
./drivers/virtio/virtio_balloon.c:175: adjust_managed_page_count(page,
-1);
./drivers/virtio/virtio_balloon.c:196: adjust_managed_page_count(page,
1);
./mm/hugetlb.c:2158: adjust_managed_page_count(page,
1 << h->order);
Regards,
Arun
On Thu 11-10-18 07:59:32, Arun KS wrote:
> On 2018-10-10 23:03, Michal Hocko wrote:
> > On Wed 10-10-18 22:26:41, Arun KS wrote:
> > > On 2018-10-10 21:00, Vlastimil Babka wrote:
> > > > On 10/5/18 10:10 AM, Arun KS wrote:
> > > > > When free pages are done with higher order, time spend on
> > > > > coalescing pages by buddy allocator can be reduced. With
> > > > > section size of 256MB, hot add latency of a single section
> > > > > shows improvement from 50-60 ms to less than 1 ms, hence
> > > > > improving the hot add latency by 60%. Modify external
> > > > > providers of online callback to align with the change.
> > > > >
> > > > > Signed-off-by: Arun KS <arunks@codeaurora.org>
> > > >
> > > > [...]
> > > >
> > > > > @@ -655,26 +655,44 @@ void __online_page_free(struct page *page)
> > > > > }
> > > > > EXPORT_SYMBOL_GPL(__online_page_free);
> > > > >
> > > > > -static void generic_online_page(struct page *page)
> > > > > +static int generic_online_page(struct page *page, unsigned int order)
> > > > > {
> > > > > - __online_page_set_limits(page);
> > > >
> > > > This is now not called anymore, although the xen/hv variants still do
> > > > it. The function seems empty these days, maybe remove it as a followup
> > > > cleanup?
> > > >
> > > > > - __online_page_increment_counters(page);
> > > > > - __online_page_free(page);
> > > > > + __free_pages_core(page, order);
> > > > > + totalram_pages += (1UL << order);
> > > > > +#ifdef CONFIG_HIGHMEM
> > > > > + if (PageHighMem(page))
> > > > > + totalhigh_pages += (1UL << order);
> > > > > +#endif
> > > >
> > > > __online_page_increment_counters() would have used
> > > > adjust_managed_page_count() which would do the changes under
> > > > managed_page_count_lock. Are we safe without the lock? If yes, there
> > > > should perhaps be a comment explaining why.
> > >
> > > Looks unsafe without managed_page_count_lock.
> >
> > Why does it matter actually? We cannot online/offline memory in
> > parallel. This is not the case for the boot where we initialize memory
> > in parallel on multiple nodes. So this seems to be safe currently unless
> > I am missing something. A comment explaining that would be helpful
> > though.
>
> Other main callers of adjust_manage_page_count(),
>
> static inline void free_reserved_page(struct page *page)
> {
> __free_reserved_page(page);
> adjust_managed_page_count(page, 1);
> }
>
> static inline void mark_page_reserved(struct page *page)
> {
> SetPageReserved(page);
> adjust_managed_page_count(page, -1);
> }
>
> Won't they race with memory hotplug?
>
> Few more,
> ./drivers/xen/balloon.c:519: adjust_managed_page_count(page, -1);
> ./drivers/virtio/virtio_balloon.c:175: adjust_managed_page_count(page, -1);
> ./drivers/virtio/virtio_balloon.c:196: adjust_managed_page_count(page, 1);
> ./mm/hugetlb.c:2158: adjust_managed_page_count(page, 1 <<
> h->order);
They can, and I have missed those.
--
Michal Hocko
SUSE Labs
On 10/10/18 6:56 PM, Arun KS wrote: > On 2018-10-10 21:00, Vlastimil Babka wrote: >> On 10/5/18 10:10 AM, Arun KS wrote: >>> When free pages are done with higher order, time spend on >>> coalescing pages by buddy allocator can be reduced. With >>> section size of 256MB, hot add latency of a single section >>> shows improvement from 50-60 ms to less than 1 ms, hence >>> improving the hot add latency by 60%. Modify external >>> providers of online callback to align with the change. >>> >>> Signed-off-by: Arun KS <arunks@codeaurora.org> >> >> [...] >> >>> @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) >>> } >>> EXPORT_SYMBOL_GPL(__online_page_free); >>> >>> -static void generic_online_page(struct page *page) >>> +static int generic_online_page(struct page *page, unsigned int order) >>> { >>> - __online_page_set_limits(page); >> >> This is now not called anymore, although the xen/hv variants still do >> it. The function seems empty these days, maybe remove it as a followup >> cleanup? >> >>> - __online_page_increment_counters(page); >>> - __online_page_free(page); >>> + __free_pages_core(page, order); >>> + totalram_pages += (1UL << order); >>> +#ifdef CONFIG_HIGHMEM >>> + if (PageHighMem(page)) >>> + totalhigh_pages += (1UL << order); >>> +#endif >> >> __online_page_increment_counters() would have used >> adjust_managed_page_count() which would do the changes under >> managed_page_count_lock. Are we safe without the lock? If yes, there >> should perhaps be a comment explaining why. > > Looks unsafe without managed_page_count_lock. I think better have a > similar implementation of free_boot_core() in memory_hotplug.c like we > had in version 1 of patch. And use adjust_managed_page_count() instead > of page_zone(page)->managed_pages += nr_pages; > > https://lore.kernel.org/patchwork/patch/989445/ Looks like deferred_free_range() has the same problem calling __free_pages_core() to adjust zone->managed_pages. I expect __free_pages_bootmem() is OK because at that point the system is still single-threaded? Could be solved by moving that out of __free_pages_core(). But do we care about readers potentially seeing a store tear? If yes then maybe these counters should be converted to atomics... > -static void generic_online_page(struct page *page) > +static int generic_online_page(struct page *page, unsigned int order) > { > - __online_page_set_limits(page); > - __online_page_increment_counters(page); > - __online_page_free(page); > + unsigned long nr_pages = 1 << order; > + struct page *p = page; > + > + for (loop = 0 ; loop < nr_pages ; loop++, p++) { > + __ClearPageReserved(p); > + set_page_count(p, 0); > + } > + > + adjust_managed_page_count(page, nr_pages); > + set_page_refcounted(page); > + __free_pages(page, order); > + > + return 0; > +} > > > Regards, > Arun >
On Thu 11-10-18 10:07:02, Vlastimil Babka wrote: > On 10/10/18 6:56 PM, Arun KS wrote: > > On 2018-10-10 21:00, Vlastimil Babka wrote: > >> On 10/5/18 10:10 AM, Arun KS wrote: > >>> When free pages are done with higher order, time spend on > >>> coalescing pages by buddy allocator can be reduced. With > >>> section size of 256MB, hot add latency of a single section > >>> shows improvement from 50-60 ms to less than 1 ms, hence > >>> improving the hot add latency by 60%. Modify external > >>> providers of online callback to align with the change. > >>> > >>> Signed-off-by: Arun KS <arunks@codeaurora.org> > >> > >> [...] > >> > >>> @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) > >>> } > >>> EXPORT_SYMBOL_GPL(__online_page_free); > >>> > >>> -static void generic_online_page(struct page *page) > >>> +static int generic_online_page(struct page *page, unsigned int order) > >>> { > >>> - __online_page_set_limits(page); > >> > >> This is now not called anymore, although the xen/hv variants still do > >> it. The function seems empty these days, maybe remove it as a followup > >> cleanup? > >> > >>> - __online_page_increment_counters(page); > >>> - __online_page_free(page); > >>> + __free_pages_core(page, order); > >>> + totalram_pages += (1UL << order); > >>> +#ifdef CONFIG_HIGHMEM > >>> + if (PageHighMem(page)) > >>> + totalhigh_pages += (1UL << order); > >>> +#endif > >> > >> __online_page_increment_counters() would have used > >> adjust_managed_page_count() which would do the changes under > >> managed_page_count_lock. Are we safe without the lock? If yes, there > >> should perhaps be a comment explaining why. > > > > Looks unsafe without managed_page_count_lock. I think better have a > > similar implementation of free_boot_core() in memory_hotplug.c like we > > had in version 1 of patch. And use adjust_managed_page_count() instead > > of page_zone(page)->managed_pages += nr_pages; > > > > https://lore.kernel.org/patchwork/patch/989445/ > > Looks like deferred_free_range() has the same problem calling > __free_pages_core() to adjust zone->managed_pages. deferred initialization has one thread per node AFAIR so we cannot race on managed_pages updates. Well, unless some of the mentioned can run that early which I dunno. > __free_pages_bootmem() is OK because at that point the system is still > single-threaded? > Could be solved by moving that out of __free_pages_core(). > > But do we care about readers potentially seeing a store tear? If yes > then maybe these counters should be converted to atomics... I wanted to suggest that already but I have no idea whether the lock instructions would cause more overhead. -- Michal Hocko SUSE Labs
On Thu, 11 Oct 2018 09:55:03 +0200 Michal Hocko <mhocko@kernel.org> wrote:
> > > > > This is now not called anymore, although the xen/hv variants still do
> > > > > it. The function seems empty these days, maybe remove it as a followup
> > > > > cleanup?
> > > > >
> > > > > > - __online_page_increment_counters(page);
> > > > > > - __online_page_free(page);
> > > > > > + __free_pages_core(page, order);
> > > > > > + totalram_pages += (1UL << order);
> > > > > > +#ifdef CONFIG_HIGHMEM
> > > > > > + if (PageHighMem(page))
> > > > > > + totalhigh_pages += (1UL << order);
> > > > > > +#endif
> > > > >
> > > > > __online_page_increment_counters() would have used
> > > > > adjust_managed_page_count() which would do the changes under
> > > > > managed_page_count_lock. Are we safe without the lock? If yes, there
> > > > > should perhaps be a comment explaining why.
> > > >
> > > > Looks unsafe without managed_page_count_lock.
> > >
> > > Why does it matter actually? We cannot online/offline memory in
> > > parallel. This is not the case for the boot where we initialize memory
> > > in parallel on multiple nodes. So this seems to be safe currently unless
> > > I am missing something. A comment explaining that would be helpful
> > > though.
> >
> > Other main callers of adjust_manage_page_count(),
> >
> > static inline void free_reserved_page(struct page *page)
> > {
> > __free_reserved_page(page);
> > adjust_managed_page_count(page, 1);
> > }
> >
> > static inline void mark_page_reserved(struct page *page)
> > {
> > SetPageReserved(page);
> > adjust_managed_page_count(page, -1);
> > }
> >
> > Won't they race with memory hotplug?
> >
> > Few more,
> > ./drivers/xen/balloon.c:519: adjust_managed_page_count(page, -1);
> > ./drivers/virtio/virtio_balloon.c:175: adjust_managed_page_count(page, -1);
> > ./drivers/virtio/virtio_balloon.c:196: adjust_managed_page_count(page, 1);
> > ./mm/hugetlb.c:2158: adjust_managed_page_count(page, 1 <<
> > h->order);
>
> They can, and I have missed those.
So this patch needs more work, yes?
On Thu 18-10-18 19:18:25, Andrew Morton wrote:
[...]
> So this patch needs more work, yes?
Yes, I've talked to Arun (he is offline until next week) offlist and he
will play with this some more.
--
Michal Hocko
SUSE Labs
On 2018-10-19 13:37, Michal Hocko wrote:
> On Thu 18-10-18 19:18:25, Andrew Morton wrote:
> [...]
>> So this patch needs more work, yes?
>
> Yes, I've talked to Arun (he is offline until next week) offlist and he
> will play with this some more.
Converted totalhigh_pages, totalram_pages and zone->managed_page to
atomic and tested hot add. Latency is not effected with this change.
Will send out a separate patch on top of this one.
Regards,
Arun
On 2018-10-22 16:03, Arun KS wrote:
> On 2018-10-19 13:37, Michal Hocko wrote:
>> On Thu 18-10-18 19:18:25, Andrew Morton wrote:
>> [...]
>>> So this patch needs more work, yes?
>>
>> Yes, I've talked to Arun (he is offline until next week) offlist and
>> he
>> will play with this some more.
>
> Converted totalhigh_pages, totalram_pages and zone->managed_page to
> atomic and tested hot add. Latency is not effected with this change.
> Will send out a separate patch on top of this one.
Hello Andrew/Michal,
Will this be going in subsequent -rcs?
Regards,
Arun
On Mon, 05 Nov 2018 15:12:27 +0530 Arun KS <arunks@codeaurora.org> wrote:
> On 2018-10-22 16:03, Arun KS wrote:
> > On 2018-10-19 13:37, Michal Hocko wrote:
> >> On Thu 18-10-18 19:18:25, Andrew Morton wrote:
> >> [...]
> >>> So this patch needs more work, yes?
> >>
> >> Yes, I've talked to Arun (he is offline until next week) offlist and
> >> he
> >> will play with this some more.
> >
> > Converted totalhigh_pages, totalram_pages and zone->managed_page to
> > atomic and tested hot add. Latency is not effected with this change.
> > Will send out a separate patch on top of this one.
> Hello Andrew/Michal,
>
> Will this be going in subsequent -rcs?
I thought were awaiting a new version? "Will send out a separate patch
on top of this one"?
I do think a resend would be useful, please. Ensure the changelog is
updated to capture the above info and any other worthy issues which
arose during review.
On 2018-11-06 03:14, Andrew Morton wrote: > On Mon, 05 Nov 2018 15:12:27 +0530 Arun KS <arunks@codeaurora.org> > wrote: > >> On 2018-10-22 16:03, Arun KS wrote: >> > On 2018-10-19 13:37, Michal Hocko wrote: >> >> On Thu 18-10-18 19:18:25, Andrew Morton wrote: >> >> [...] >> >>> So this patch needs more work, yes? >> >> >> >> Yes, I've talked to Arun (he is offline until next week) offlist and >> >> he >> >> will play with this some more. >> > >> > Converted totalhigh_pages, totalram_pages and zone->managed_page to >> > atomic and tested hot add. Latency is not effected with this change. >> > Will send out a separate patch on top of this one. >> Hello Andrew/Michal, >> >> Will this be going in subsequent -rcs? > > I thought were awaiting a new version? "Will send out a separate patch > on top of this one"? Sorry for confusion. I sent out an incremental patch converting counters to atomics. https://patchwork.kernel.org/cover/10657217/ > > I do think a resend would be useful, please. Ensure the changelog is > updated to capture the above info and any other worthy issues which > arose during review. Will do that. Regards, Arun
On Thu, Oct 11, 2018 at 6:05 PM Vlastimil Babka <vbabka@suse.cz> wrote: > > On 10/10/18 6:56 PM, Arun KS wrote: > > On 2018-10-10 21:00, Vlastimil Babka wrote: > >> On 10/5/18 10:10 AM, Arun KS wrote: > >>> When free pages are done with higher order, time spend on > >>> coalescing pages by buddy allocator can be reduced. With > >>> section size of 256MB, hot add latency of a single section > >>> shows improvement from 50-60 ms to less than 1 ms, hence > >>> improving the hot add latency by 60%. Modify external > >>> providers of online callback to align with the change. > >>> > >>> Signed-off-by: Arun KS <arunks@codeaurora.org> > >> > >> [...] > >> > >>> @@ -655,26 +655,44 @@ void __online_page_free(struct page *page) > >>> } > >>> EXPORT_SYMBOL_GPL(__online_page_free); > >>> > >>> -static void generic_online_page(struct page *page) > >>> +static int generic_online_page(struct page *page, unsigned int order) > >>> { > >>> - __online_page_set_limits(page); > >> > >> This is now not called anymore, although the xen/hv variants still do > >> it. The function seems empty these days, maybe remove it as a followup > >> cleanup? > >> > >>> - __online_page_increment_counters(page); > >>> - __online_page_free(page); > >>> + __free_pages_core(page, order); > >>> + totalram_pages += (1UL << order); > >>> +#ifdef CONFIG_HIGHMEM > >>> + if (PageHighMem(page)) > >>> + totalhigh_pages += (1UL << order); > >>> +#endif > >> > >> __online_page_increment_counters() would have used > >> adjust_managed_page_count() which would do the changes under > >> managed_page_count_lock. Are we safe without the lock? If yes, there > >> should perhaps be a comment explaining why. > > > > Looks unsafe without managed_page_count_lock. I think better have a > > similar implementation of free_boot_core() in memory_hotplug.c like we > > had in version 1 of patch. And use adjust_managed_page_count() instead > > of page_zone(page)->managed_pages += nr_pages; > > > > https://lore.kernel.org/patchwork/patch/989445/ > > Looks like deferred_free_range() has the same problem calling > __free_pages_core() to adjust zone->managed_pages. I expect > __free_pages_bootmem() is OK because at that point the system is still > single-threaded? > Could be solved by moving that out of __free_pages_core(). > Seems deferred_free_range() is protected by pgdat_resize_lock()/pgdat_resize_unlock(). Which protects pgdat's zones, if I am right. > But do we care about readers potentially seeing a store tear? If yes > then maybe these counters should be converted to atomics... > > > -static void generic_online_page(struct page *page) > > +static int generic_online_page(struct page *page, unsigned int order) > > { > > - __online_page_set_limits(page); > > - __online_page_increment_counters(page); > > - __online_page_free(page); > > + unsigned long nr_pages = 1 << order; > > + struct page *p = page; > > + > > + for (loop = 0 ; loop < nr_pages ; loop++, p++) { > > + __ClearPageReserved(p); > > + set_page_count(p, 0); > > + } > > + > > + adjust_managed_page_count(page, nr_pages); > > + set_page_refcounted(page); > > + __free_pages(page, order); > > + > > + return 0; > > +} > > > > > > Regards, > > Arun > > >