Hi Oscar, On Wed, 10 Mar 2021 at 03:11, Oscar Salvador wrote: > > When sizeof(struct page) is not a power of 2, sections do not span > a PMD anymore and so when populating them some parts of the PMD will > remain unused. > Because of this, PMDs will be left behind when depopulating sections > since remove_pmd_table() thinks that those unused parts are still in > use. > > Fix this by marking the unused parts with PAGE_UNUSED, so memchr_inv() > will do the right thing and will let us free the PMD when the last user > of it is gone. > > This patch is based on a similar patch by David Hildenbrand: > > https://lore.kernel.org/linux-mm/20200722094558.9828-9-david@redhat.com/ > > Signed-off-by: Oscar Salvador > Reviewed-by: David Hildenbrand > Acked-by: Dave Hansen > --- > arch/x86/mm/init_64.c | 65 +++++++++++++++++++++++++++++++++++++++++---------- > 1 file changed, 53 insertions(+), 12 deletions(-) > > diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c > index 9ecb3c488ac8..d93b36856ed3 100644 > --- a/arch/x86/mm/init_64.c > +++ b/arch/x86/mm/init_64.c > @@ -826,6 +826,51 @@ void __init paging_init(void) > zone_sizes_init(); > } > > +#ifdef CONFIG_SPARSEMEM_VMEMMAP > +#define PAGE_UNUSED 0xFD > + > +/* Returns true if the PMD is completely unused and thus it can be freed */ > +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) > +{ > + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); > + > + memset((void *)addr, PAGE_UNUSED, end - addr); > + > + return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE); > +} > + > +static void __meminit vmemmap_use_sub_pmd(unsigned long start) > +{ > + /* > + * As we expect to add in the same granularity as we remove, it's > + * sufficient to mark only some piece used to block the memmap page from > + * getting removed when removing some other adjacent memmap (just in > + * case the first memmap never gets initialized e.g., because the memory > + * block never gets onlined). > + */ > + memset((void *)start, 0, sizeof(struct page)); > +} > + > +static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) > +{ > + /* > + * Could be our memmap page is filled with PAGE_UNUSED already from a > + * previous remove. Make sure to reset it. > + */ > + vmemmap_use_sub_pmd(start); > + > + /* > + * Mark with PAGE_UNUSED the unused parts of the new memmap range > + */ > + if (!IS_ALIGNED(start, PMD_SIZE)) > + memset((void *)start, PAGE_UNUSED, > + start - ALIGN_DOWN(start, PMD_SIZE)); > + if (!IS_ALIGNED(end, PMD_SIZE)) > + memset((void *)end, PAGE_UNUSED, > + ALIGN(end, PMD_SIZE) - end); > +} > +#endif > + > /* > * Memory hotplug specific functions > */ > @@ -871,8 +916,6 @@ int arch_add_memory(int nid, u64 start, u64 size, > return add_pages(nid, start_pfn, nr_pages, params); > } > > -#define PAGE_INUSE 0xFD > - > static void __meminit free_pagetable(struct page *page, int order) > { > unsigned long magic; > @@ -1006,7 +1049,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, > unsigned long next, pages = 0; > pte_t *pte_base; > pmd_t *pmd; > - void *page_addr; > > pmd = pmd_start + pmd_index(addr); > for (; addr < end; addr = next, pmd++) { > @@ -1026,20 +1068,13 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, > pmd_clear(pmd); > spin_unlock(&init_mm.page_table_lock); > pages++; > - } else { > - /* If here, we are freeing vmemmap pages. */ > - memset((void *)addr, PAGE_INUSE, next - addr); > - > - page_addr = page_address(pmd_page(*pmd)); > - if (!memchr_inv(page_addr, PAGE_INUSE, > - PMD_SIZE)) { > + } else if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP) && > + vmemmap_pmd_is_unused(addr, next)) { > free_hugepage_table(pmd_page(*pmd), > altmap); > - > spin_lock(&init_mm.page_table_lock); > pmd_clear(pmd); > spin_unlock(&init_mm.page_table_lock); > - } > } > > continue; > @@ -1492,11 +1527,17 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, > > addr_end = addr + PMD_SIZE; > p_end = p + PMD_SIZE; > + > + if (!IS_ALIGNED(addr, PMD_SIZE) || > + !IS_ALIGNED(next, PMD_SIZE)) > + vmemmap_use_new_sub_pmd(addr, next); > + > continue; > } else if (altmap) > return -ENOMEM; /* no fallback */ > } else if (pmd_large(*pmd)) { > vmemmap_verify((pte_t *)pmd, node, addr, next); > + vmemmap_use_sub_pmd(addr); While building the linux next 20210310 tag for x86_64 architecture with clang-12 and gcc-9 the following warnings / errors were noticed. arch/x86/mm/init_64.c:1585:6: error: implicit declaration of function 'vmemmap_use_new_sub_pmd' [-Werror,-Wimplicit-function-declaration] vmemmap_use_new_sub_pmd(addr, next); ^ arch/x86/mm/init_64.c:1591:4: error: implicit declaration of function 'vmemmap_use_sub_pmd' [-Werror,-Wimplicit-function-declaration] vmemmap_use_sub_pmd(addr, next); ^ 2 errors generated. make[3]: *** [scripts/Makefile.build:271: arch/x86/mm/init_64.o] Error 1 Reported-by: Naresh Kamboju Steps to reproduce: ------------------- # TuxMake is a command line tool and Python library that provides # portable and repeatable Linux kernel builds across a variety of # architectures, toolchains, kernel configurations, and make targets. # # TuxMake supports the concept of runtimes. # See https://docs.tuxmake.org/runtimes/, for that to work it requires # that you install podman or docker on your system. # # To install tuxmake on your system globally: # sudo pip3 install -U tuxmake # # See https://docs.tuxmake.org/ for complete documentation. tuxmake --runtime podman --target-arch x86_64 --toolchain clang-12 --kconfig defconfig --kconfig-add https://builds.tuxbuild.com/1pYCPt4WlgSfSdv1BULm6ABINeJ/config Build pipeline error link, https://gitlab.com/Linaro/lkft/mirrors/next/linux-next/-/jobs/1085496613#L428 -- Linaro LKFT https://lkft.linaro.org