[v3] mm: fix tick timer stall during deferred page init
diff mbox series

Message ID 20200311123848.118638-1-shile.zhang@linux.alibaba.com
State In Next
Commit 13c1c35f61122cc5edfe17e8963b607d498d86de
Headers show
Series
  • [v3] mm: fix tick timer stall during deferred page init
Related show

Commit Message

Shile Zhang March 11, 2020, 12:38 p.m. UTC
When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
initialise the deferred pages with local interrupts disabled. It is
introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
initializing deferred pages").

On machines with NCPUS <= 2, the 'pgdatinit' kthread can be bound to
the boot CPU, which can cause the tick timer to stall for a long time,
so the system jiffies are not updated in time.

The dmesg shows:

    [    0.197975] node 0 initialised, 32170688 pages in 1ms

Obviously, 1ms is unreasonable.

Now, fix it by restoring pending interrupts for every 32*1024 pages
(128MB with 4K pages) initialized, giving the tick timer a chance to
update the system jiffies. The resulting, reasonable dmesg shows:

    [    1.069306] node 0 initialised, 32203456 pages in 894ms

Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages")

Co-developed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
---
 mm/page_alloc.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

Comments

Pavel Tatashin March 11, 2020, 5:45 p.m. UTC | #1
On Wed, Mar 11, 2020 at 8:39 AM Shile Zhang
<shile.zhang@linux.alibaba.com> wrote:
>
> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> initialise the deferred pages with local interrupts disabled. It is
> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> initializing deferred pages").
>
> On machine with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> the boot CPU, which could caused the tick timer long time stall, system
> jiffies not be updated in time.
>
> The dmesg shown that:
>
>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
>
> Obviously, 1ms is unreasonable.
>
> Now, fix it by restore in the pending interrupts for every 32*1204 pages
> (128MB) initialized, give the chance to update the systemd jiffies.
> The reasonable demsg shown likes:
>
>     [    1.069306] node 0 initialised, 32203456 pages in 894ms

Sorry for joining late to this thread. I wonder if we could use
sched_clock() to print this statistics. Or not to print statistics at
all?

==============
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..5958f599aced 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1770,7 +1770,7 @@ static int __init deferred_init_memmap(void *data)
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
        unsigned long spfn = 0, epfn = 0, nr_pages = 0;
        unsigned long first_init_pfn, flags;
-       unsigned long start = jiffies;
+       unsigned long start = sched_clock();
        struct zone *zone;
        int zid;
        u64 i;
@@ -1817,8 +1817,8 @@ static int __init deferred_init_memmap(void *data)
        /* Sanity check that the next zone really is unpopulated */
        WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));

-       pr_info("node %d initialised, %lu pages in %ums\n",
-               pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
+       pr_info("node %d initialised, %lu pages in %lldns\n",
+               pgdat->node_id, nr_pages, sched_clock() - start);

        pgdat_init_report_one_done();
        return 0;
==============

[    1.245331] node 0 initialised, 10256176 pages in 373565742ns

Pasha



> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
>
> Co-developed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
> ---
>  mm/page_alloc.c | 25 ++++++++++++++++++++++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3c4eb750a199..a3a47845e150 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1763,12 +1763,17 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
>         return nr_pages;
>  }
>
> +/*
> + * Release the pending interrupts for every TICK_PAGE_COUNT pages.
> + */
> +#define TICK_PAGE_COUNT        (32 * 1024)
> +
>  /* Initialise remaining memory on a node */
>  static int __init deferred_init_memmap(void *data)
>  {
>         pg_data_t *pgdat = data;
>         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> -       unsigned long spfn = 0, epfn = 0, nr_pages = 0;
> +       unsigned long spfn = 0, epfn = 0, nr_pages = 0, prev_nr_pages = 0;
>         unsigned long first_init_pfn, flags;
>         unsigned long start = jiffies;
>         struct zone *zone;
> @@ -1779,6 +1784,7 @@ static int __init deferred_init_memmap(void *data)
>         if (!cpumask_empty(cpumask))
>                 set_cpus_allowed_ptr(current, cpumask);
>
> +again:
>         pgdat_resize_lock(pgdat, &flags);
>         first_init_pfn = pgdat->first_deferred_pfn;
>         if (first_init_pfn == ULONG_MAX) {
> @@ -1790,7 +1796,6 @@ static int __init deferred_init_memmap(void *data)
>         /* Sanity check boundaries */
>         BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
>         BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
> -       pgdat->first_deferred_pfn = ULONG_MAX;
>
>         /* Only the highest zone is deferred so find it */
>         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> @@ -1809,9 +1814,23 @@ static int __init deferred_init_memmap(void *data)
>          * that we can avoid introducing any issues with the buddy
>          * allocator.
>          */
> -       while (spfn < epfn)
> +       while (spfn < epfn) {
>                 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> +               /*
> +                * Release the interrupts for every TICK_PAGE_COUNT pages
> +                * (128MB) to give tick timer the chance to update the
> +                * system jiffies.
> +                */
> +               if ((nr_pages - prev_nr_pages) > TICK_PAGE_COUNT) {
> +                       prev_nr_pages = nr_pages;
> +                       pgdat->first_deferred_pfn = spfn;
> +                       pgdat_resize_unlock(pgdat, &flags);
> +                       goto again;
> +               }
> +       }
> +
>  zone_empty:
> +       pgdat->first_deferred_pfn = ULONG_MAX;
>         pgdat_resize_unlock(pgdat, &flags);
>
>         /* Sanity check that the next zone really is unpopulated */
> --
> 2.24.0.rc2
>
Kirill Tkhai March 11, 2020, 8:16 p.m. UTC | #2
Hi, Pasha,

On 11.03.2020 20:45, Pavel Tatashin wrote:
> On Wed, Mar 11, 2020 at 8:39 AM Shile Zhang
> <shile.zhang@linux.alibaba.com> wrote:
>>
>> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
>> initialise the deferred pages with local interrupts disabled. It is
>> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
>> initializing deferred pages").
>>
>> On machine with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
>> the boot CPU, which could caused the tick timer long time stall, system
>> jiffies not be updated in time.
>>
>> The dmesg shown that:
>>
>>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
>>
>> Obviously, 1ms is unreasonable.
>>
>> Now, fix it by restore in the pending interrupts for every 32*1204 pages
>> (128MB) initialized, give the chance to update the systemd jiffies.
>> The reasonable demsg shown likes:
>>
>>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> 
> Sorry for joining late to this thread. I wonder if we could use
> sched_clock() to print this statistics. Or not to print statistics at
> all?

This won't work for all cases since sched_clock() may fall back to jiffies-based
implementation, which gives wrong result, when interrupts are disabled.

And a bigger problem is not a statistics, but it's advancing jiffies. Some parallel
thread may expect jiffies are incrementing, and this will be a surprise for that
another component.

So, this fix is more about modularity and about not introduction a new corner case.

> ==============
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3c4eb750a199..5958f599aced 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1770,7 +1770,7 @@ static int __init deferred_init_memmap(void *data)
>         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>         unsigned long spfn = 0, epfn = 0, nr_pages = 0;
>         unsigned long first_init_pfn, flags;
> -       unsigned long start = jiffies;
> +       unsigned long start = sched_clock();
>         struct zone *zone;
>         int zid;
>         u64 i;
> @@ -1817,8 +1817,8 @@ static int __init deferred_init_memmap(void *data)
>         /* Sanity check that the next zone really is unpopulated */
>         WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
> 
> -       pr_info("node %d initialised, %lu pages in %ums\n",
> -               pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
> +       pr_info("node %d initialised, %lu pages in %lldns\n",
> +               pgdat->node_id, nr_pages, sched_clock() - start);
> 
>         pgdat_init_report_one_done();
>         return 0;
> ==============
> 
> [    1.245331] node 0 initialised, 10256176 pages in 373565742ns
> 
> Pasha
> 
> 
> 
>> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
>>
>> Co-developed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
>> Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
>> ---
>>  mm/page_alloc.c | 25 ++++++++++++++++++++++---
>>  1 file changed, 22 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 3c4eb750a199..a3a47845e150 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1763,12 +1763,17 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
>>         return nr_pages;
>>  }
>>
>> +/*
>> + * Release the pending interrupts for every TICK_PAGE_COUNT pages.
>> + */
>> +#define TICK_PAGE_COUNT        (32 * 1024)
>> +
>>  /* Initialise remaining memory on a node */
>>  static int __init deferred_init_memmap(void *data)
>>  {
>>         pg_data_t *pgdat = data;
>>         const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>> -       unsigned long spfn = 0, epfn = 0, nr_pages = 0;
>> +       unsigned long spfn = 0, epfn = 0, nr_pages = 0, prev_nr_pages = 0;
>>         unsigned long first_init_pfn, flags;
>>         unsigned long start = jiffies;
>>         struct zone *zone;
>> @@ -1779,6 +1784,7 @@ static int __init deferred_init_memmap(void *data)
>>         if (!cpumask_empty(cpumask))
>>                 set_cpus_allowed_ptr(current, cpumask);
>>
>> +again:
>>         pgdat_resize_lock(pgdat, &flags);
>>         first_init_pfn = pgdat->first_deferred_pfn;
>>         if (first_init_pfn == ULONG_MAX) {
>> @@ -1790,7 +1796,6 @@ static int __init deferred_init_memmap(void *data)
>>         /* Sanity check boundaries */
>>         BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
>>         BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
>> -       pgdat->first_deferred_pfn = ULONG_MAX;
>>
>>         /* Only the highest zone is deferred so find it */
>>         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
>> @@ -1809,9 +1814,23 @@ static int __init deferred_init_memmap(void *data)
>>          * that we can avoid introducing any issues with the buddy
>>          * allocator.
>>          */
>> -       while (spfn < epfn)
>> +       while (spfn < epfn) {
>>                 nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
>> +               /*
>> +                * Release the interrupts for every TICK_PAGE_COUNT pages
>> +                * (128MB) to give tick timer the chance to update the
>> +                * system jiffies.
>> +                */
>> +               if ((nr_pages - prev_nr_pages) > TICK_PAGE_COUNT) {
>> +                       prev_nr_pages = nr_pages;
>> +                       pgdat->first_deferred_pfn = spfn;
>> +                       pgdat_resize_unlock(pgdat, &flags);
>> +                       goto again;
>> +               }
>> +       }
>> +
>>  zone_empty:
>> +       pgdat->first_deferred_pfn = ULONG_MAX;
>>         pgdat_resize_unlock(pgdat, &flags);
>>
>>         /* Sanity check that the next zone really is unpopulated */
>> --
>> 2.24.0.rc2
>>
Pavel Tatashin March 11, 2020, 8:25 p.m. UTC | #3
On Wed, Mar 11, 2020 at 4:17 PM Kirill Tkhai <ktkhai@virtuozzo.com> wrote:
>
> Hi, Pasha,
>
> On 11.03.2020 20:45, Pavel Tatashin wrote:
> > On Wed, Mar 11, 2020 at 8:39 AM Shile Zhang
> > <shile.zhang@linux.alibaba.com> wrote:
> >>
> >> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> >> initialise the deferred pages with local interrupts disabled. It is
> >> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> >> initializing deferred pages").
> >>
> >> On machine with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> >> the boot CPU, which could caused the tick timer long time stall, system
> >> jiffies not be updated in time.
> >>
> >> The dmesg shown that:
> >>
> >>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
> >>
> >> Obviously, 1ms is unreasonable.
> >>
> >> Now, fix it by restore in the pending interrupts for every 32*1204 pages
> >> (128MB) initialized, give the chance to update the systemd jiffies.
> >> The reasonable demsg shown likes:
> >>
> >>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> >
> > Sorry for joining late to this thread. I wonder if we could use
> > sched_clock() to print this statistics. Or not to print statistics at
> > all?
>
> This won't work for all cases since sched_clock() may fall back to jiffies-based
> implementation, which gives wrong result, when interrupts are disabled.
>
> And a bigger problem is not a statistics, but it's advancing jiffies. Some parallel
> thread may expect jiffies are incrementing, and this will be a surprise for that
> another component.
>
> So, this fix is more about modularity and about not introduction a new corner case.

Makes sense.

Reviewed-by: Pavel Tatashin <pasha.tatashin@soleen.com>

Thank you,
Pasha
Daniel Jordan March 19, 2020, 7:05 p.m. UTC | #4
On Wed, Mar 11, 2020 at 08:38:48PM +0800, Shile Zhang wrote:

Sorry, I'm late to this.

I don't have a better solution, but I did try to find a way to stop holding the
resize lock during (most of) page init, which would make this fix unnecessary
and the deferred_init_memmap context less strange.  Here are some ideas that
didn't work out in case someone sees a different way forward.

One thought is to unify the common parts of deferred_init_memmap and
deferred_grow_zone and have callers grab chunks of pages to initialize and note
the next available page to initialize for the next caller.  Interrupt handlers
participate in page init while it's happening rather than having to wait until
it's finished.  But what if a partially completed chunk is interrupted midway
through and the interrupt handler needs to allocate those in-progress pages?
May be possible to guarantee some memory is available if some minimum number of
chunks have been completed already, but it's hard to say what that number is if
the amount of memory handlers might allocate is unbounded.

Given that large allocations from interrupt handlers is a theoretical issue,
another thought is to reserve one section for deferred_grow_zone, should it be
called during page init, and if not then the pgdatinit thread could initialize
it with the resize lock held after the rest of page init is finished.
Meanwhile regular page init need not hold the resize lock.  If interrupt
handlers try to allocate more than a section during this time, trigger a
warning so we know the issue isn't theoretical.  The downside is that it's
possible this may not fix it for good.

> @@ -1811,9 +1816,23 @@ static int __init deferred_init_memmap(v
>  	 * that we can avoid introducing any issues with the buddy
>  	 * allocator.
>  	 */
> -	while (spfn < epfn)
> +	while (spfn < epfn) {
>  		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> +		/*
> +		 * Release the interrupts for every TICK_PAGE_COUNT pages
> +		 * (128MB) to give tick timer the chance to update the
> +		 * system jiffies.
> +		 */
> +		if ((nr_pages - prev_nr_pages) > TICK_PAGE_COUNT) {
> +			prev_nr_pages = nr_pages;
> +			pgdat->first_deferred_pfn = spfn;
> +			pgdat_resize_unlock(pgdat, &flags);
> +			goto again;
> +		}
> +	}
> +

Nits only:

 - s/Release the interrupts/Enable interrupts/
 - take out 128MB, that assumes PAGE_SIZE is 4k

I considered saving i, spfn, and epfn in pgdat to avoid having to rerun
deferred_init_mem_pfn_range_in_zone every retry, but it'd enlarge pgdat for
short-lived data and the function probably isn't expensive.

Regardless,
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Daniel Jordan March 26, 2020, 6:58 p.m. UTC | #5
On Thu, Mar 19, 2020 at 03:05:12PM -0400, Daniel Jordan wrote:
> Regardless,
> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>

Darn, I spoke too soon.

On a two-socket Xeon, smaller values of TICK_PAGE_COUNT caused the deferred
init timestamp to grow by over 25%.  This was with pgdatinit0 bound to the
timer interrupt CPU to make sure the issue always reproduces.

               TICK_PAGE_COUNT     node 0 deferred
                                   init time (ms)
               ---------------     ---------------
                          4096                 610
                          8192                 587
                         16384                 487
                         32768                 480    // used in the patch

Instead of trying to find a constant that lets the timer interrupt run often
enough, I think a better way forward is to reconsider how we handle the resize
lock.  I plan to prototype something and reply back with what I get.
Pavel Tatashin March 26, 2020, 7:36 p.m. UTC | #6
I agree with Daniel, we should look into approach where
pgdat_resize_lock is taken only for the duration of updating tracking
values such as pgdat->first_deferred_pfn (perhaps we would need to add
another tracker that would show chunks that are currently being worked
on).

The vast duration of struct page initialization process should happen
outside of this lock, and only be taken when we update globally seen
data structures: lists, tracking variables. This way we can solve
several problems: 1. allow interrupt threads to grow zones if
required. 2. keep jiffies happy. 3. allow future scaling when we will
add inner node threads to initialize struct pages (i.e. ktasks from
Daniel).

Pasha

On Thu, Mar 26, 2020 at 2:58 PM Daniel Jordan
<daniel.m.jordan@oracle.com> wrote:
>
> On Thu, Mar 19, 2020 at 03:05:12PM -0400, Daniel Jordan wrote:
> > Regardless,
> > Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
>
> Darn, I spoke too soon.
>
> On a two-socket Xeon, smaller values of TICK_PAGE_COUNT caused the deferred
> init timestamp to grow by over 25%.  This was with pgdatinit0 bound to the
> timer interrupt CPU to make sure the issue always reproduces.
>
>                TICK_PAGE_COUNT     node 0 deferred
>                                    init time (ms)
>                ---------------     ---------------
>                           4096                 610
>                           8192                 587
>                          16384                 487
>                          32768                 480    // used in the patch
>
> Instead of trying to find a constant that lets the timer interrupt run often
> enough, I think a better way forward is to reconsider how we handle the resize
> lock.  I plan to prototype something and reply back with what I get.
Shile Zhang March 27, 2020, 4:39 a.m. UTC | #7
On 2020/3/27 03:36, Pavel Tatashin wrote:
> I agree with Daniel, we should look into approach where
> pgdat_resize_lock is taken only for the duration of updating tracking
> values such as pgdat->first_deferred_pfn (perhaps we would need to add
> another tracker that would show chunks that are currently being worked
> on).
>
> The vast duration of struct page initialization process should happen
> outside of this lock, and only be taken when we update globally seen
> data structures: lists, tracking variables. This way we can solve
> several problems: 1. allow interrupt threads to grow zones if
> required. 2. keep jiffies happy. 3. allow future scaling when we will
> add inner node threads to initialize struct pages (i.e. ktasks from
> Daniel).

It makes sense; looking forward to the inner-node parallel init.

@Daniel
Is there schedule about ktasks?

Thanks!

>
> Pasha
>
> On Thu, Mar 26, 2020 at 2:58 PM Daniel Jordan
> <daniel.m.jordan@oracle.com> wrote:
>> On Thu, Mar 19, 2020 at 03:05:12PM -0400, Daniel Jordan wrote:
>>> Regardless,
>>> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
>> Darn, I spoke too soon.
>>
>> On a two-socket Xeon, smaller values of TICK_PAGE_COUNT caused the deferred
>> init timestamp to grow by over 25%.  This was with pgdatinit0 bound to the
>> timer interrupt CPU to make sure the issue always reproduces.
>>
>>                 TICK_PAGE_COUNT     node 0 deferred
>>                                     init time (ms)
>>                 ---------------     ---------------
>>                            4096                 610
>>                            8192                 587
>>                           16384                 487
>>                           32768                 480    // used in the patch
>>
>> Instead of trying to find a constant that lets the timer interrupt run often
>> enough, I think a better way forward is to reconsider how we handle the resize
>> lock.  I plan to prototype something and reply back with what I get.

Yes, the time spent on page init depends on the CPU frequency,
and the jiffies update is controlled by HZ, so it's hard to find a
general constant.

It seems we need a bigger refactor of the lock handling to get a better solution.
Daniel Jordan March 28, 2020, 12:17 a.m. UTC | #8
On Fri, Mar 27, 2020 at 12:39:18PM +0800, Shile Zhang wrote:
> On 2020/3/27 03:36, Pavel Tatashin wrote:
> > I agree with Daniel, we should look into approach where
> > pgdat_resize_lock is taken only for the duration of updating tracking
> > values such as pgdat->first_deferred_pfn (perhaps we would need to add
> > another tracker that would show chunks that are currently being worked
> > on).
> > 
> > The vast duration of struct page initialization process should happen
> > outside of this lock, and only be taken when we update globally seen
> > data structures: lists, tracking variables. This way we can solve
> > several problems: 1. allow interrupt threads to grow zones if
> > required. 2. keep jiffies happy. 3. allow future scaling when we will
> > add inner node threads to initialize struct pages (i.e. ktasks from
> > Daniel).
> 
> It make sense, looking forward to the inner node parallel init.
> 
> @Daniel
> Is there schedule about ktasks?

Yep, and it's now padata multithreading instead of ktask since we already have
'task' in the kernel.

Current plan is to start with users in the system context, that is those that
don't require userland resource controls such as cgroup.  So I'll post a new
version of this timestamp fix pretty soon and then likely post a series that
multithreads page init.

Future work is tentatively doing other system users, remote charging for the
CPU controller, and then users that can be accounted with cgroup etc.
David Hildenbrand April 1, 2020, 2:26 p.m. UTC | #9
On 11.03.20 13:38, Shile Zhang wrote:
> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> initialise the deferred pages with local interrupts disabled. It is
> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> initializing deferred pages").
> 
> On machine with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> the boot CPU, which could caused the tick timer long time stall, system
> jiffies not be updated in time.
> 
> The dmesg shown that:
> 
>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
> 
> Obviously, 1ms is unreasonable.
> 
> Now, fix it by restore in the pending interrupts for every 32*1204 pages
> (128MB) initialized, give the chance to update the systemd jiffies.
> The reasonable demsg shown likes:
> 
>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> 
> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
> 
> Co-developed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
> ---
>  mm/page_alloc.c | 25 ++++++++++++++++++++++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3c4eb750a199..a3a47845e150 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1763,12 +1763,17 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
>  	return nr_pages;
>  }
>  
> +/*
> + * Release the pending interrupts for every TICK_PAGE_COUNT pages.
> + */
> +#define TICK_PAGE_COUNT	(32 * 1024)
> +
>  /* Initialise remaining memory on a node */
>  static int __init deferred_init_memmap(void *data)
>  {
>  	pg_data_t *pgdat = data;
>  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> -	unsigned long spfn = 0, epfn = 0, nr_pages = 0;
> +	unsigned long spfn = 0, epfn = 0, nr_pages = 0, prev_nr_pages = 0;
>  	unsigned long first_init_pfn, flags;
>  	unsigned long start = jiffies;
>  	struct zone *zone;
> @@ -1779,6 +1784,7 @@ static int __init deferred_init_memmap(void *data)
>  	if (!cpumask_empty(cpumask))
>  		set_cpus_allowed_ptr(current, cpumask);
>  
> +again:
>  	pgdat_resize_lock(pgdat, &flags);
>  	first_init_pfn = pgdat->first_deferred_pfn;
>  	if (first_init_pfn == ULONG_MAX) {
> @@ -1790,7 +1796,6 @@ static int __init deferred_init_memmap(void *data)
>  	/* Sanity check boundaries */
>  	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
>  	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
> -	pgdat->first_deferred_pfn = ULONG_MAX;
>  
>  	/* Only the highest zone is deferred so find it */
>  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> @@ -1809,9 +1814,23 @@ static int __init deferred_init_memmap(void *data)
>  	 * that we can avoid introducing any issues with the buddy
>  	 * allocator.
>  	 */
> -	while (spfn < epfn)
> +	while (spfn < epfn) {
>  		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> +		/*
> +		 * Release the interrupts for every TICK_PAGE_COUNT pages
> +		 * (128MB) to give tick timer the chance to update the

Nit: 128MB is only true for 4k pages.

> +		 * system jiffies.
> +		 */
> +		if ((nr_pages - prev_nr_pages) > TICK_PAGE_COUNT) {
> +			prev_nr_pages = nr_pages;
> +			pgdat->first_deferred_pfn = spfn;
> +			pgdat_resize_unlock(pgdat, &flags);
> +			goto again;
> +		}
> +	}
> +

I find that deferred page init code horribly complicated to understand,
but that's a different story.

Your change looks sane to me and survives my basic testing. Thanks!

Acked-by: David Hildenbrand <david@redhat.com>
Michal Hocko April 1, 2020, 3:42 p.m. UTC | #10
I am sorry but I have completely missed this patch.

On Wed 11-03-20 20:38:48, Shile Zhang wrote:
> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> initialise the deferred pages with local interrupts disabled. It is
> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> initializing deferred pages").
> 
> On machine with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> the boot CPU, which could caused the tick timer long time stall, system
> jiffies not be updated in time.
> 
> The dmesg shown that:
> 
>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
> 
> Obviously, 1ms is unreasonable.
> 
> Now, fix it by restore in the pending interrupts for every 32*1204 pages
> (128MB) initialized, give the chance to update the systemd jiffies.
> The reasonable demsg shown likes:
> 
>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> 
> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").

I dislike this solution TBH. It effectivelly conserves the current code
and just works around the problem. Why do we hold the IRQ lock here in
the first place? This is an early init code and a very limited code is
running at this stage. Certainly nothing memory hotplug related which
should be the only path really interested in the resize lock AFAIR.

This needs a double checking but I strongly believe that the lock can be
simply dropped in this path.
 
> Co-developed-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
> Signed-off-by: Shile Zhang <shile.zhang@linux.alibaba.com>
> ---
>  mm/page_alloc.c | 25 ++++++++++++++++++++++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 3c4eb750a199..a3a47845e150 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1763,12 +1763,17 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
>  	return nr_pages;
>  }
>  
> +/*
> + * Release the pending interrupts for every TICK_PAGE_COUNT pages.
> + */
> +#define TICK_PAGE_COUNT	(32 * 1024)
> +
>  /* Initialise remaining memory on a node */
>  static int __init deferred_init_memmap(void *data)
>  {
>  	pg_data_t *pgdat = data;
>  	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> -	unsigned long spfn = 0, epfn = 0, nr_pages = 0;
> +	unsigned long spfn = 0, epfn = 0, nr_pages = 0, prev_nr_pages = 0;
>  	unsigned long first_init_pfn, flags;
>  	unsigned long start = jiffies;
>  	struct zone *zone;
> @@ -1779,6 +1784,7 @@ static int __init deferred_init_memmap(void *data)
>  	if (!cpumask_empty(cpumask))
>  		set_cpus_allowed_ptr(current, cpumask);
>  
> +again:
>  	pgdat_resize_lock(pgdat, &flags);
>  	first_init_pfn = pgdat->first_deferred_pfn;
>  	if (first_init_pfn == ULONG_MAX) {
> @@ -1790,7 +1796,6 @@ static int __init deferred_init_memmap(void *data)
>  	/* Sanity check boundaries */
>  	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
>  	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
> -	pgdat->first_deferred_pfn = ULONG_MAX;
>  
>  	/* Only the highest zone is deferred so find it */
>  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> @@ -1809,9 +1814,23 @@ static int __init deferred_init_memmap(void *data)
>  	 * that we can avoid introducing any issues with the buddy
>  	 * allocator.
>  	 */
> -	while (spfn < epfn)
> +	while (spfn < epfn) {
>  		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> +		/*
> +		 * Release the interrupts for every TICK_PAGE_COUNT pages
> +		 * (128MB) to give tick timer the chance to update the
> +		 * system jiffies.
> +		 */
> +		if ((nr_pages - prev_nr_pages) > TICK_PAGE_COUNT) {
> +			prev_nr_pages = nr_pages;
> +			pgdat->first_deferred_pfn = spfn;
> +			pgdat_resize_unlock(pgdat, &flags);
> +			goto again;
> +		}
> +	}
> +
>  zone_empty:
> +	pgdat->first_deferred_pfn = ULONG_MAX;
>  	pgdat_resize_unlock(pgdat, &flags);
>  
>  	/* Sanity check that the next zone really is unpopulated */
> -- 
> 2.24.0.rc2
David Hildenbrand April 1, 2020, 3:50 p.m. UTC | #11
On 01.04.20 17:42, Michal Hocko wrote:
> I am sorry but I have completely missed this patch.
> 
> On Wed 11-03-20 20:38:48, Shile Zhang wrote:
>> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
>> initialise the deferred pages with local interrupts disabled. It is
>> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
>> initializing deferred pages").
>>
>> On machines with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
>> the boot CPU, which could cause the tick timer to stall for a long time,
>> leaving the system jiffies not updated in time.
>>
>> The dmesg shown that:
>>
>>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
>>
>> Obviously, 1ms is unreasonable.
>>
>> Now, fix it by restoring the pending interrupts for every 32*1024 pages
>> (128MB) initialized, giving the tick timer the chance to update the
>> system jiffies. The reasonable dmesg output looks like:
>>
>>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
>>
>> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
> 
> I dislike this solution TBH. It effectivelly conserves the current code
> and just works around the problem. Why do we hold the IRQ lock here in
> the first place? This is an early init code and a very limited code is
> running at this stage. Certainly nothing memory hotplug related which
> should be the only path really interested in the resize lock AFAIR.

Yeah, I don't think ACPI and friends are up yet.
Michal Hocko April 1, 2020, 4 p.m. UTC | #12
On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> On 01.04.20 17:42, Michal Hocko wrote:
> > I am sorry but I have completely missed this patch.
> > 
> > On Wed 11-03-20 20:38:48, Shile Zhang wrote:
> >> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> >> initialise the deferred pages with local interrupts disabled. It is
> >> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> >> initializing deferred pages").
> >>
> >> On machines with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> >> the boot CPU, which could cause the tick timer to stall for a long time,
> >> leaving the system jiffies not updated in time.
> >>
> >> The dmesg shown that:
> >>
> >>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
> >>
> >> Obviously, 1ms is unreasonable.
> >>
> >> Now, fix it by restoring the pending interrupts for every 32*1024 pages
> >> (128MB) initialized, giving the tick timer the chance to update the
> >> system jiffies. The reasonable dmesg output looks like:
> >>
> >>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> >>
> >> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
> > 
> > I dislike this solution TBH. It effectivelly conserves the current code
> > and just works around the problem. Why do we hold the IRQ lock here in
> > the first place? This is an early init code and a very limited code is
> > running at this stage. Certainly nothing memory hotplug related which
> > should be the only path really interested in the resize lock AFAIR.
> 
> Yeah, I don't think ACPI and friends are up yet.

Just to save somebody time to check. The deferred initialization blocks
> the further boot until all workers are done - see page_alloc_init_late
(kernel_init path).
Daniel Jordan April 1, 2020, 4:09 p.m. UTC | #13
On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
> On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> > On 01.04.20 17:42, Michal Hocko wrote:
> > > I am sorry but I have completely missed this patch.
> > > 
> > > On Wed 11-03-20 20:38:48, Shile Zhang wrote:
> > >> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> > >> initialise the deferred pages with local interrupts disabled. It is
> > >> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> > >> initializing deferred pages").
> > >>
> > >> On machines with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> > >> the boot CPU, which could cause the tick timer to stall for a long time,
> > >> leaving the system jiffies not updated in time.
> > >>
> > >> The dmesg shown that:
> > >>
> > >>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
> > >>
> > >> Obviously, 1ms is unreasonable.
> > >>
> > >> Now, fix it by restoring the pending interrupts for every 32*1024 pages
> > >> (128MB) initialized, giving the tick timer the chance to update the
> > >> system jiffies. The reasonable dmesg output looks like:
> > >>
> > >>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> > >>
> > >> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
> > > 
> > > I dislike this solution TBH. It effectivelly conserves the current code
> > > and just works around the problem. Why do we hold the IRQ lock here in
> > > the first place? This is an early init code and a very limited code is
> > > running at this stage. Certainly nothing memory hotplug related which
> > > should be the only path really interested in the resize lock AFAIR.
> > 
> > Yeah, I don't think ACPI and friends are up yet.
> 
> Just to save somebody time to check. The deferred initialization blocks
> > the further boot until all workers are done - see page_alloc_init_late
> (kernel_init path).

Ha, I just finished following all the hotplug paths to check this out, and as
you all know there are a *lot* :-) Well at least we're in agreement.

> > > This needs a double checking but I strongly believe that the lock can be
> > > simply dropped in this path.

This is what my fix does, it limits the time the resize lock is held.
Michal Hocko April 1, 2020, 4:12 p.m. UTC | #14
On Wed 01-04-20 12:09:29, Daniel Jordan wrote:
> On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
> > On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> > > On 01.04.20 17:42, Michal Hocko wrote:
> > > > I am sorry but I have completely missed this patch.
> > > > 
> > > > On Wed 11-03-20 20:38:48, Shile Zhang wrote:
> > > >> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
> > > >> initialise the deferred pages with local interrupts disabled. It is
> > > >> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
> > > >> initializing deferred pages").
> > > >>
> > > >> On machines with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
> > > >> the boot CPU, which could cause the tick timer to stall for a long time,
> > > >> leaving the system jiffies not updated in time.
> > > >>
> > > >> The dmesg shown that:
> > > >>
> > > >>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
> > > >>
> > > >> Obviously, 1ms is unreasonable.
> > > >>
> > > >> Now, fix it by restoring the pending interrupts for every 32*1024 pages
> > > >> (128MB) initialized, giving the tick timer the chance to update the
> > > >> system jiffies. The reasonable dmesg output looks like:
> > > >>
> > > >>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
> > > >>
> > > >> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
> > > > 
> > > > I dislike this solution TBH. It effectivelly conserves the current code
> > > > and just works around the problem. Why do we hold the IRQ lock here in
> > > > the first place? This is an early init code and a very limited code is
> > > > running at this stage. Certainly nothing memory hotplug related which
> > > > should be the only path really interested in the resize lock AFAIR.
> > > 
> > > Yeah, I don't think ACPI and friends are up yet.
> > 
> > Just to save somebody time to check. The deferred initialization blocks
> > > the further boot until all workers are done - see page_alloc_init_late
> > (kernel_init path).
> 
> Ha, I just finished following all the hotplug paths to check this out, and as
> you all know there are a *lot* :-) Well at least we're in agreement.

Good to have it double checked!
 
> > > > This needs a double checking but I strongly believe that the lock can be
> > > > simply dropped in this path.
> 
> This is what my fix does, it limits the time the resize lock is held.

Just remove it from the deferred initialization and add a comment that we
deliberately not taking the lock here because abc
David Hildenbrand April 1, 2020, 4:15 p.m. UTC | #15
On 01.04.20 18:12, Michal Hocko wrote:
> On Wed 01-04-20 12:09:29, Daniel Jordan wrote:
>> On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
>>> On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
>>>> On 01.04.20 17:42, Michal Hocko wrote:
>>>>> I am sorry but I have completely missed this patch.
>>>>>
>>>>> On Wed 11-03-20 20:38:48, Shile Zhang wrote:
>>>>>> When 'CONFIG_DEFERRED_STRUCT_PAGE_INIT' is set, 'pgdatinit' kthread will
>>>>>> initialise the deferred pages with local interrupts disabled. It is
>>>>>> introduced by commit 3a2d7fa8a3d5 ("mm: disable interrupts while
>>>>>> initializing deferred pages").
>>>>>>
>>>>>> On machines with NCPUS <= 2, the 'pgdatinit' kthread could be bound to
>>>>>> the boot CPU, which could cause the tick timer to stall for a long time,
>>>>>> leaving the system jiffies not updated in time.
>>>>>>
>>>>>> The dmesg shown that:
>>>>>>
>>>>>>     [    0.197975] node 0 initialised, 32170688 pages in 1ms
>>>>>>
>>>>>> Obviously, 1ms is unreasonable.
>>>>>>
>>>>>> Now, fix it by restoring the pending interrupts for every 32*1024 pages
>>>>>> (128MB) initialized, giving the tick timer the chance to update the
>>>>>> system jiffies. The reasonable dmesg output looks like:
>>>>>>
>>>>>>     [    1.069306] node 0 initialised, 32203456 pages in 894ms
>>>>>>
>>>>>> Fixes: 3a2d7fa8a3d5 ("mm: disable interrupts while initializing deferred pages").
>>>>>
>>>>> I dislike this solution TBH. It effectivelly conserves the current code
>>>>> and just works around the problem. Why do we hold the IRQ lock here in
>>>>> the first place? This is an early init code and a very limited code is
>>>>> running at this stage. Certainly nothing memory hotplug related which
>>>>> should be the only path really interested in the resize lock AFAIR.
>>>>
>>>> Yeah, I don't think ACPI and friends are up yet.
>>>
>>> Just to save somebody time to check. The deferred initialization blocks
>>> the further boot until all workers are done - see page_alloc_init_late
>>> (kernel_init path).
>>
>> Ha, I just finished following all the hotplug paths to check this out, and as
>> you all know there are a *lot* :-) Well at least we're in agreement.
> 
> Good to have it double checked!
>  
>>>>> This needs a double checking but I strongly believe that the lock can be
>>>>> simply dropped in this path.
>>
>> This is what my fix does, it limits the time the resize lock is held.
> 
> Just remove it from the deferred initialization and add a comment that we
> deliberately not taking the lock here because abc
> 

We should just double check what

commit 3a2d7fa8a3d5ae740bd0c21d933acc6220857ed0
Author: Pavel Tatashin <pasha.tatashin@oracle.com>
Date:   Thu Apr 5 16:22:27 2018 -0700

    mm: disable interrupts while initializing deferred pages
    
    Vlastimil Babka reported about a window issue during which when deferred
    pages are initialized, and the current version of on-demand
    initialization is finished, allocations may fail.  While this is highly
    unlikely scenario, since this kind of allocation request must be large,
    and must come from interrupt handler, we still want to cover it.
    
    We solve this by initializing deferred pages with interrupts disabled,
    and holding node_size_lock spin lock while pages in the node are being
    initialized.  The on-demand deferred page initialization that comes
    later will use the same lock, and thus synchronize with
    deferred_init_memmap().
    
    It is unlikely for threads that initialize deferred pages to be
    interrupted.  They run soon after smp_init(), but before modules are
    initialized, and long before user space programs.  This is why there is
    no adverse effect of having these threads running with interrupts
    disabled.

tried to solve does not apply.
Daniel Jordan April 1, 2020, 4:18 p.m. UTC | #16
On Wed, Apr 01, 2020 at 06:12:43PM +0200, Michal Hocko wrote:
> On Wed 01-04-20 12:09:29, Daniel Jordan wrote:
> > On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
> > > On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> > > > On 01.04.20 17:42, Michal Hocko wrote:
> > > > > This needs a double checking but I strongly believe that the lock can be
> > > > > simply dropped in this path.
> > 
> > This is what my fix does, it limits the time the resize lock is held.
> 
> Just remove it from the deferred initialization and add a comment that we
> deliberately not taking the lock here because abc

I think it has to be a little more involved because of the window where
interrupts might allocate during deferred init, as Vlastimil pointed out a few
years ago when the change was made.  I'll explain myself in the changelog.
Michal Hocko April 1, 2020, 4:26 p.m. UTC | #17
On Wed 01-04-20 12:18:10, Daniel Jordan wrote:
> On Wed, Apr 01, 2020 at 06:12:43PM +0200, Michal Hocko wrote:
> > On Wed 01-04-20 12:09:29, Daniel Jordan wrote:
> > > On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
> > > > On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> > > > > On 01.04.20 17:42, Michal Hocko wrote:
> > > > > > This needs a double checking but I strongly believe that the lock can be
> > > > > > simply dropped in this path.
> > > 
> > > This is what my fix does, it limits the time the resize lock is held.
> > 
> > Just remove it from the deferred initialization and add a comment that we
> > deliberately not taking the lock here because abc
> 
> I think it has to be a little more involved because of the window where
> interrupts might allocate during deferred init, as Vlastimil pointed out a few
> years ago when the change was made.

I do not remember any details but do we have any actual real allocation
failure or was this mostly a theoretical concern. Vlastimil? For your
context we are talking about 3a2d7fa8a3d5 ("mm: disable interrupts while
initializing deferred pages")

> I'll explain myself in the changelog.

OK, I will wait for the patch.
Pavel Tatashin April 1, 2020, 4:41 p.m. UTC | #18
On Wed, Apr 1, 2020 at 12:26 PM Michal Hocko <mhocko@kernel.org> wrote:
>
> On Wed 01-04-20 12:18:10, Daniel Jordan wrote:
> > On Wed, Apr 01, 2020 at 06:12:43PM +0200, Michal Hocko wrote:
> > > On Wed 01-04-20 12:09:29, Daniel Jordan wrote:
> > > > On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
> > > > > On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> > > > > > On 01.04.20 17:42, Michal Hocko wrote:
> > > > > > > This needs a double checking but I strongly believe that the lock can be
> > > > > > > simply dropped in this path.
> > > >
> > > > This is what my fix does, it limits the time the resize lock is held.
> > >
> > > Just remove it from the deferred initialization and add a comment that we
> > > deliberately not taking the lock here because abc
> >
> > I think it has to be a little more involved because of the window where
> > interrupts might allocate during deferred init, as Vlastimil pointed out a few
> > years ago when the change was made.
>
> I do not remember any details but do we have any actual real allocation
> failure or was this mostly a theoretical concern. Vlastimil? For your
> context we are talking about 3a2d7fa8a3d5 ("mm: disable interrupts while
> initializing deferred pages")

I do not remember seeing any real failures, this was a theoretical
window. So, we could potentially simply remove these locks until we
see a real boot failure in some interrupt thread. The allocation has
to be rather large as well.

Pasha
Michal Hocko April 1, 2020, 4:46 p.m. UTC | #19
On Wed 01-04-20 12:41:13, Pavel Tatashin wrote:
> On Wed, Apr 1, 2020 at 12:26 PM Michal Hocko <mhocko@kernel.org> wrote:
> >
> > On Wed 01-04-20 12:18:10, Daniel Jordan wrote:
> > > On Wed, Apr 01, 2020 at 06:12:43PM +0200, Michal Hocko wrote:
> > > > On Wed 01-04-20 12:09:29, Daniel Jordan wrote:
> > > > > On Wed, Apr 01, 2020 at 06:00:48PM +0200, Michal Hocko wrote:
> > > > > > On Wed 01-04-20 17:50:22, David Hildenbrand wrote:
> > > > > > > On 01.04.20 17:42, Michal Hocko wrote:
> > > > > > > > This needs a double checking but I strongly believe that the lock can be
> > > > > > > > simply dropped in this path.
> > > > >
> > > > > This is what my fix does, it limits the time the resize lock is held.
> > > >
> > > > Just remove it from the deferred initialization and add a comment that we
> > > > deliberately not taking the lock here because abc
> > >
> > > I think it has to be a little more involved because of the window where
> > > interrupts might allocate during deferred init, as Vlastimil pointed out a few
> > > years ago when the change was made.
> >
> > I do not remember any details but do we have any actual real allocation
> > failure or was this mostly a theoretical concern. Vlastimil? For your
> > context we are talking about 3a2d7fa8a3d5 ("mm: disable interrupts while
> > initializing deferred pages")
> 
> I do not remember seeing any real failures, this was a theoretical
> window. So, we could potentially simply remove these locks until we
> see a real boot failure in some interrupt thread. The allocation has
> to be rather large as well.

Yes please! We are really great at over complicating and over
engineering stuff based on theoretical issues and build on top of that
and make the code even more complex because nobody dares to re-evaluate
and so on.
Pavel Tatashin April 1, 2020, 4:51 p.m. UTC | #20
> > I do not remember seeing any real failures, this was a theoretical
> > window. So, we could potentially simply remove these locks until we
> > see a real boot failure in some interrupt thread. The allocation has
> > to be rather large as well.
>
> Yes please! We are really great at over complicating and over
> engineering stuff based on theoretical issues and build on top of that
> and make the code even more complex because nobody dares to re-evaluate
> and so on.

I will submit a patch (or revert) whichever is cleaner.

Pasha
Daniel Jordan April 1, 2020, 5:13 p.m. UTC | #21
On Wed, Apr 01, 2020 at 12:51:56PM -0400, Pavel Tatashin wrote:
> > > I do not remember seeing any real failures, this was a theoretical
> > > window. So, we could potentially simply remove these locks until we
> > > see a real boot failure in some interrupt thread. The allocation has
> > > to be rather large as well.
> >
> > Yes please! We are really great at over complicating and over
> > engineering stuff based on theoretical issues and build on top of that
> > and make the code even more complex because nobody dares to re-evaluate
> > and so on.
> 
> I will submit a patch (or revert) whichever is cleaner.

I had thought people would be concerned about that window.  I believe the quote
from the time it was being discussed was "rare failures suck."  They very much
do :) but in this case I think any failure would be relatively easy to
diagnose.

So great, simpler is always better, and I'll wait for what you send, Pasha.

Alternatively, if you won't be able to get to it for a while, I can write it
up, having thoroughly paged all this in over the past week.

Patch
diff mbox series

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..a3a47845e150 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1763,12 +1763,17 @@  deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
 	return nr_pages;
 }
 
+/*
+ * Release the pending interrupts for every TICK_PAGE_COUNT pages.
+ */
+#define TICK_PAGE_COUNT	(32 * 1024)
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
 	pg_data_t *pgdat = data;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-	unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+	unsigned long spfn = 0, epfn = 0, nr_pages = 0, prev_nr_pages = 0;
 	unsigned long first_init_pfn, flags;
 	unsigned long start = jiffies;
 	struct zone *zone;
@@ -1779,6 +1784,7 @@  static int __init deferred_init_memmap(void *data)
 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(current, cpumask);
 
+again:
 	pgdat_resize_lock(pgdat, &flags);
 	first_init_pfn = pgdat->first_deferred_pfn;
 	if (first_init_pfn == ULONG_MAX) {
@@ -1790,7 +1796,6 @@  static int __init deferred_init_memmap(void *data)
 	/* Sanity check boundaries */
 	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
 	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
-	pgdat->first_deferred_pfn = ULONG_MAX;
 
 	/* Only the highest zone is deferred so find it */
 	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
@@ -1809,9 +1814,23 @@  static int __init deferred_init_memmap(void *data)
 	 * that we can avoid introducing any issues with the buddy
 	 * allocator.
 	 */
-	while (spfn < epfn)
+	while (spfn < epfn) {
 		nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+		/*
+		 * Release the interrupts for every TICK_PAGE_COUNT pages
+		 * (128MB) to give tick timer the chance to update the
+		 * system jiffies.
+		 */
+		if ((nr_pages - prev_nr_pages) > TICK_PAGE_COUNT) {
+			prev_nr_pages = nr_pages;
+			pgdat->first_deferred_pfn = spfn;
+			pgdat_resize_unlock(pgdat, &flags);
+			goto again;
+		}
+	}
+
 zone_empty:
+	pgdat->first_deferred_pfn = ULONG_MAX;
 	pgdat_resize_unlock(pgdat, &flags);
 
 	/* Sanity check that the next zone really is unpopulated */