Hi all, Today's linux-next merge of the tip tree got a conflict in mm/huge_memory.c between commits d516904bd239 ("thp: merge page pre-alloc in khugepaged_loop into khugepaged_do_scan"), e3ebcf643811 ("thp: remove assumptions on pgtable_t type") and 46dcde735c9d ("thp: introduce pmdp_invalidate()") from Linus' tree and commit 93c9d633bd9e ("mm/thp: Preserve pgprot across huge page split") from the tip tree. I fixed it up (I think - see below) and can carry the fix as necessary (no action is required). -- Cheers, Stephen Rothwell sfr@canb.auug.org.au diff --cc mm/huge_memory.c index a863af2,5b9ab25..0000000 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@@ -1346,59 -1428,55 +1417,54 @@@ static int __split_huge_page_map(struc spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); - if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm); - pmd_populate(mm, &_pmd, pgtable); - - haddr = address; - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - BUG_ON(PageCompound(page+i)); - entry = mk_pte(page + i, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (!pmd_write(*pmd)) - entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); - if (!pmd_young(*pmd)) - entry = pte_mkold(entry); - pte = pte_offset_map(&_pmd, haddr); - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); - } + if (!pmd) + goto unlock; - smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and - * userland has the whole access to the hugepage - * during the split (which happens in place). If we - * overwrite the pmd with the not-huge version - * pointing to the pte here (which of course we could - * if all CPUs were bug free), userland could trigger - * a small page size TLB miss on the small sized TLB - * while the hugepage TLB entry is still established - * in the huge TLB. Some CPU doesn't like that. See - * http://support.amd.com/us/Processor_TechDocs/41322.pdf, - * Erratum 383 on page 93. Intel should be safe but is - * also warns that it's only safe if the permission - * and cache attributes of the two entries loaded in - * the two TLB is identical (which should be the case - * here). But it is generally safer to never allow - * small and huge TLB entries for the same virtual - * address to be loaded simultaneously. So instead of - * doing "pmd_populate(); flush_tlb_range();" we first - * mark the current pmd notpresent (atomically because - * here the pmd_trans_huge and pmd_trans_splitting - * must remain set at all times on the pmd until the - * split is complete for this pmd), then we flush the - * SMP TLB and finally we write the non-huge version - * of the pmd entry with pmd_populate. - */ - pmdp_invalidate(vma, address, pmd); - pmd_populate(mm, pmd, pgtable); - ret = 1; + prot = pmd_pgprot(*pmd); - pgtable = get_pmd_huge_pte(mm); ++ pgtable = pgtable_trans_huge_withdraw(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, prot); + entry = pte_mkdirty(entry); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); } + + smp_wmb(); /* make ptes visible before pmd, see __pte_alloc */ + /* + * Up to this point the pmd is present and huge. + * + * If we overwrite the pmd with the not-huge version, we could trigger + * a small page size TLB miss on the small sized TLB while the hugepage + * TLB entry is still established in the huge TLB. + * + * Some CPUs don't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 383 + * on page 93. + * + * Thus it is generally safer to never allow small and huge TLB entries + * for overlapping virtual addresses to be loaded. So we first mark the + * current pmd not present, then we flush the TLB and finally we write + * the non-huge version of the pmd entry with pmd_populate. + * + * The above needs to be done under the ptl because pmd_trans_huge and + * pmd_trans_splitting must remain set on the pmd until the split is + * complete. The ptl also protects against concurrent faults due to + * making the pmd not-present. + */ - set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); ++ pmdp_invalidate(vma, address, pmd); + pmd_populate(mm, pmd, pgtable); + ret = 1; + + unlock: spin_unlock(&mm->page_table_lock); return ret; @@@ -2279,23 -2300,30 +2345,21 @@@ static int khugepaged_has_work(void static int khugepaged_wait_event(void) { return !list_empty(&khugepaged_scan.mm_head) || - !khugepaged_enabled(); + kthread_should_stop(); } -static void khugepaged_do_scan(struct page **hpage) +static void khugepaged_do_scan(void) { + struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; + unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan); + bool wait = true; - barrier(); /* write khugepaged_pages_to_scan to local stack */ - while (progress < pages) { - cond_resched(); - -#ifndef CONFIG_NUMA - if (!*hpage) { - *hpage = alloc_hugepage(khugepaged_defrag()); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - break; - } - count_vm_event(THP_COLLAPSE_ALLOC); - } -#else - if (IS_ERR(*hpage)) + if (!khugepaged_prealloc_page(&hpage, &wait)) break; -#endif + + cond_resched(); if (unlikely(kthread_should_stop() || freezing(current))) break;