 arch/x86/include/asm/pgtable.h       |  31 +++++--
 arch/x86/include/asm/pgtable_types.h |  27 +-----
 arch/x86/mm/gup.c                    |   4 +-
 include/asm-generic/pgtable.h        | 159 ++---------------------------------
 include/linux/huge_mm.h              |   3 +-
 init/Kconfig                         |  11 ---
 mm/gup.c                             |   4 +-
 mm/huge_memory.c                     |  42 +++------
 mm/memory.c                          |  16 ++--
 mm/mempolicy.c                       |   2 +-
 mm/migrate.c                         |   1 -
 mm/mprotect.c                        |  40 +++------
 mm/pgtable-generic.c                 |   2 -
 13 files changed, 74 insertions(+), 268 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index aa97a070f09f..0d4ec3a62fed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -299,7 +299,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+	return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -457,8 +457,7 @@ static inline int pte_same(pte_t a, pte_t b)
 
 static inline int pte_present(pte_t a)
 {
-	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
-			       _PAGE_NUMA);
+	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #define pte_present_nonuma pte_present_nonuma
@@ -473,7 +472,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 	if (pte_flags(a) & _PAGE_PRESENT)
 		return true;
 
-	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+	if ((pte_flags(a) & _PAGE_PROTNONE) &&
 			mm_tlb_flush_pending(mm))
 		return true;
 
@@ -493,10 +492,26 @@ static inline int pmd_present(pmd_t pmd)
 	 * the _PAGE_PSE flag will remain set at all times while the
 	 * _PAGE_PRESENT bit is clear).
 	 */
-	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
-				 _PAGE_NUMA);
+	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Technically these work even when not doing NUMA
+ * balancing, but we don't care and have simpler
+ * "always false" versions for that case
+ */
+static inline int pte_protnone(pte_t pte)
+{
+	return pte_flags(pte) & _PAGE_PROTNONE;
+}
+
+static inline int pmd_protnone(pmd_t pmd)
+{
+	return pmd_flags(pmd) & _PAGE_PROTNONE;
+}
+#endif
+
 static inline int pmd_none(pmd_t pmd)
 {
 	/* Only check low word on 32-bit platforms, since it might be
@@ -554,8 +569,8 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
 static inline int pmd_bad(pmd_t pmd)
 {
 #ifdef CONFIG_NUMA_BALANCING
-	/* pmd_numa check */
-	if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
+	/* A protnone pmd left by NUMA balancing is not a bad pmd */
+	if ((pmd_flags(pmd) & (_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_PROTNONE)
 		return 0;
 #endif
 	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
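The net effect of the pgtable.h changes above: a NUMA hinting entry is now just a
PROT_NONE-style entry, i.e. _PAGE_PRESENT clear and _PAGE_PROTNONE (the global bit)
set. A minimal standalone sketch of the resulting predicates, using invented bit
values rather than the real x86 layout:

/* Sketch only: illustrative bit positions, not the real x86 page-table layout. */
#include <assert.h>
#include <stdint.h>

#define X_PRESENT  (1u << 0)	/* stands in for _PAGE_PRESENT */
#define X_PROTNONE (1u << 8)	/* stands in for _PAGE_PROTNONE (the global bit) */

static int x_pte_present(uint32_t flags)
{
	/* as in the patched pte_present(): protnone still counts as present */
	return flags & (X_PRESENT | X_PROTNONE);
}

static int x_pte_protnone(uint32_t flags)
{
	/* as in the new pte_protnone() */
	return flags & X_PROTNONE;
}

int main(void)
{
	uint32_t numa_hint = X_PROTNONE;	/* present bit clear */

	assert(x_pte_present(numa_hint));	/* still "present" to the VM */
	assert(x_pte_protnone(numa_hint));	/* but any access faults */
	return 0;
}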
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f216963760e5..5438e96b2915 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -28,14 +28,6 @@
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX		63 /* No execute: only valid after cpuid check */
 
-/*
- * Swap offsets on configurations that allow automatic NUMA balancing use the
- * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
- * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
- * maximum possible swap space from 16TB to 8TB.
- */
-#define _PAGE_BIT_NUMA		(_PAGE_BIT_GLOBAL+1)
-
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
@@ -79,21 +71,6 @@
 #endif
 
 /*
- * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
- * that is not present. The hinting fault gathers numa placement statistics
- * (see pte_numa()). The bit is always zero when the PTE is not present.
- *
- * The bit picked must be always zero when the pmd is present and not
- * present, so that we don't lose information when we set it while
- * atomically clearing the present bit.
- */
-#ifdef CONFIG_NUMA_BALANCING
-#define _PAGE_NUMA	(_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
-#else
-#define _PAGE_NUMA	(_AT(pteval_t, 0))
-#endif
-
-/*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
  * with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -126,8 +103,8 @@
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK	(PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT |		\
			 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY |	\
-			 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
-#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
+			 _PAGE_SOFT_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
 #define _PAGE_CACHE_MASK	(_PAGE_PCD | _PAGE_PWT)
 #define _PAGE_CACHE_WB		(0)
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 207d9aef662d..f32e12c44b23 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		struct page *page;
 
 		/* Similar to the PMD case, NUMA hinting must take slow path */
-		if (pte_numa(pte)) {
+		if (pte_protnone(pte)) {
 			pte_unmap(ptep);
 			return 0;
 		}
@@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 			 * slowpath for accounting purposes and so that they
 			 * can be serialised against THP migration.
 			 */
-			if (pmd_numa(pmd))
+			if (pmd_protnone(pmd))
 				return 0;
 			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
 				return 0;
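Dropping _PAGE_NUMA from _PAGE_CHG_MASK above is what lets a plain pte_modify()
clear the hinting state: the protection bits now come entirely from the new
protection, while the pfn survives the modify. A toy model of that masking, again
with invented bit values (the real mask preserves more, e.g. accessed/dirty):

/* Toy model of pte_modify() masking; bit values are invented. */
#include <assert.h>
#include <stdint.h>

#define T_PRESENT  (1u << 0)
#define T_RW       (1u << 1)
#define T_PROTNONE (1u << 8)
#define T_PFN_MASK 0xfffff000u
#define T_CHG_MASK (T_PFN_MASK)	/* bits pte_modify() preserves */

static uint32_t t_pte_modify(uint32_t pte, uint32_t newprot)
{
	return (pte & T_CHG_MASK) | (newprot & ~T_CHG_MASK);
}

int main(void)
{
	uint32_t hint = 0x1000u | T_PROTNONE;	/* pfn + protnone, not present */
	uint32_t fixed = t_pte_modify(hint, T_PRESENT | T_RW);

	assert(fixed & T_PRESENT);		/* present again */
	assert(!(fixed & T_PROTNONE));		/* hint cleared by the modify */
	assert((fixed & T_PFN_MASK) == 0x1000u);	/* pfn preserved */
	return 0;
}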
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 53b2acc38213..a7c08d9ddeda 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -659,165 +659,24 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifndef CONFIG_NUMA_BALANCING
 /*
- * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
- * same bit too). It's set only when _PAGE_PRESET is not set and it's
- * never set if _PAGE_PRESENT is set.
- *
- * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
- * fault triggers on those regions if pte/pmd_numa returns true
- * (because _PAGE_PRESENT is not set).
- */
-#ifndef pte_numa
-static inline int pte_numa(pte_t pte)
-{
-	return (pte_flags(pte) &
-		(_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
-}
-#endif
-
-#ifndef pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
-	return (pmd_flags(pmd) &
-		(_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
-}
-#endif
-
-/*
- * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
- * because they're called by the NUMA hinting minor page fault. If we
- * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
- * would be forced to set it later while filling the TLB after we
- * return to userland. That would trigger a second write to memory
- * that we optimize away by setting _PAGE_ACCESSED here.
+ * Technically a pte can be PROTNONE even when not
+ * doing NUMA balancing, but the only case where the
+ * kernel actually _cares_ is for the numa balancing
+ * case, so by default we just implement the simpler
+ * "always no" helper function.
  */
-#ifndef pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
-	pteval_t val = pte_val(pte);
-
-	val &= ~_PAGE_NUMA;
-	val |= (_PAGE_PRESENT|_PAGE_ACCESSED);
-	return __pte(val);
-}
-#endif
-
-#ifndef pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
-	pmdval_t val = pmd_val(pmd);
-
-	val &= ~_PAGE_NUMA;
-	val |= (_PAGE_PRESENT|_PAGE_ACCESSED);
-
-	return __pmd(val);
-}
-#endif
-
-#ifndef pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
-	pteval_t val = pte_val(pte);
-
-	val &= ~_PAGE_PRESENT;
-	val |= _PAGE_NUMA;
-
-	return __pte(val);
-}
-#endif
-
-#ifndef ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep)
-{
-	pte_t ptent = *ptep;
-
-	ptent = pte_mknuma(ptent);
-	set_pte_at(mm, addr, ptep, ptent);
-	return;
-}
-#endif
-
-#ifndef pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
-	pmdval_t val = pmd_val(pmd);
-
-	val &= ~_PAGE_PRESENT;
-	val |= _PAGE_NUMA;
-
-	return __pmd(val);
-}
-#endif
-
-#ifndef pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pmd_t *pmdp)
-{
-	pmd_t pmd = *pmdp;
-
-	pmd = pmd_mknuma(pmd);
-	set_pmd_at(mm, addr, pmdp, pmd);
-	return;
-}
-#endif
-#else
-extern int pte_numa(pte_t pte);
-extern int pmd_numa(pmd_t pmd);
-extern pte_t pte_mknonnuma(pte_t pte);
-extern pmd_t pmd_mknonnuma(pmd_t pmd);
-extern pte_t pte_mknuma(pte_t pte);
-extern pmd_t pmd_mknuma(pmd_t pmd);
-extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp);
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
-#else
-static inline int pmd_numa(pmd_t pmd)
+static inline int pte_protnone(pte_t pte)
 {
 	return 0;
 }
 
-static inline int pte_numa(pte_t pte)
+static inline int pmd_protnone(pmd_t pmd)
 {
 	return 0;
 }
-
-static inline pte_t pte_mknonnuma(pte_t pte)
-{
-	return pte;
-}
-
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
-	return pmd;
-}
-
-static inline pte_t pte_mknuma(pte_t pte)
-{
-	return pte;
-}
-
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pte_t *ptep)
-{
-	return;
-}
-
-
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
-	return pmd;
-}
-
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-				 pmd_t *pmdp)
-{
-	return ;
-}
-#endif /* CONFIG_NUMA_BALANCING */
+#endif
 
 #endif /* CONFIG_MMU */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 63579cb8d3dc..2a2ac91eafde 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,8 +31,7 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
			 unsigned long new_addr, unsigned long old_end,
			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-			unsigned long addr, pgprot_t newprot,
-			int prot_numa);
+			unsigned long addr, pgprot_t newprot);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
diff --git a/init/Kconfig b/init/Kconfig
index e84c6423a2e5..f5c6bb56617a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -887,17 +887,6 @@ config ARCH_SUPPORTS_INT128
 config ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	bool
 
-#
-# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
-config ARCH_WANTS_PROT_NUMA_PROT_NONE
-	bool
-
-config ARCH_USES_NUMA_PROT_NONE
-	bool
-	default y
-	depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
-	depends on NUMA_BALANCING
-
 config NUMA_BALANCING_DEFAULT_ENABLED
 	bool "Automatically enable NUMA aware memory/task placement"
 	default y
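The asm-generic block above is the usual arch-override pattern: with NUMA balancing
configured, the architecture supplies real pte_protnone()/pmd_protnone(); everyone
else transparently gets the "always false" stubs, so no #ifdefs leak into common
code. Reduced to a self-contained sketch (the config macro here is hypothetical):

/* Standalone sketch of the "arch overrides, generic stub" pattern. */
#include <stdio.h>

#ifdef EXAMPLE_CONFIG_NUMA_BALANCING
/* an "arch header" would provide the real bit test */
static inline int example_pte_protnone(unsigned long flags)
{
	return flags & 0x100;	/* made-up protnone bit */
}
#else
/* generic fallback: NUMA balancing off, so the answer is always no */
static inline int example_pte_protnone(unsigned long flags)
{
	(void)flags;
	return 0;
}
#endif

int main(void)
{
	/* callers stay identical whichever definition is compiled in */
	printf("%d\n", example_pte_protnone(0x100));
	return 0;
}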
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b1600d..7a7419da2da9 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -60,7 +60,7 @@ retry:
 		migration_entry_wait(mm, pmd, address);
 		goto retry;
 	}
-	if ((flags & FOLL_NUMA) && pte_numa(pte))
+	if ((flags & FOLL_NUMA) && pte_protnone(pte))
 		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte)) {
 		pte_unmap_unlock(ptep, ptl);
@@ -189,7 +189,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
 		}
 		return page;
 	}
-	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		return no_page_table(vma, flags);
 	if (pmd_trans_huge(*pmd)) {
 		if (flags & FOLL_SPLIT) {
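FOLL_NUMA keeps its meaning across the conversion: unless a get_user_pages() caller
explicitly opts out, a protnone-hinted page is treated as not present, forcing the
access through the fault path where it can be accounted. A condensed model of that
gate (hypothetical types and names, not the kernel API):

/* Condensed model of the FOLL_NUMA gate in follow_page(); names are invented. */
#include <stdbool.h>
#include <stddef.h>

#define F_NUMA 0x1	/* stands in for FOLL_NUMA */

struct fake_pte { bool present; bool protnone; };

/* Returns NULL to push the caller into the page-fault slow path. */
static const struct fake_pte *follow_page_model(const struct fake_pte *pte,
						unsigned int flags)
{
	if (!pte->present && !pte->protnone)
		return NULL;			/* genuinely not present */
	if ((flags & F_NUMA) && pte->protnone)
		return NULL;			/* NUMA hint: take the fault */
	return pte;				/* fast path may proceed */
}

int main(void)
{
	struct fake_pte hint = { .present = false, .protnone = true };

	/* with F_NUMA set, the hinted page must go via the fault path */
	return follow_page_model(&hint, F_NUMA) == NULL ? 0 : 1;
}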
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..14de54af6c38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1223,7 +1223,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		return ERR_PTR(-EFAULT);
 
 	/* Full NUMA hinting faults to serialise migration in fault paths */
-	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+	if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
 		goto out;
 
 	page = pmd_page(*pmd);
@@ -1366,9 +1366,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	goto out;
 clear_pmdnuma:
 	BUG_ON(!PageLocked(page));
-	pmd = pmd_mknonnuma(pmd);
+	pmd = pmd_modify(pmd, vma->vm_page_prot);
 	set_pmd_at(mm, haddr, pmdp, pmd);
-	VM_BUG_ON(pmd_numa(*pmdp));
 	update_mmu_cache_pmd(vma, addr, pmdp);
 	unlock_page(page);
 out_unlock:
@@ -1502,7 +1501,7 @@ out:
  *  - HPAGE_PMD_NR is protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, pgprot_t newprot, int prot_numa)
+		unsigned long addr, pgprot_t newprot)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *ptl;
@@ -1511,29 +1510,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
 		pmd_t entry;
 		ret = 1;
-		if (!prot_numa) {
-			entry = pmdp_get_and_clear(mm, addr, pmd);
-			if (pmd_numa(entry))
-				entry = pmd_mknonnuma(entry);
-			entry = pmd_modify(entry, newprot);
-			ret = HPAGE_PMD_NR;
-			set_pmd_at(mm, addr, pmd, entry);
-			BUG_ON(pmd_write(entry));
-		} else {
-			struct page *page = pmd_page(*pmd);
-
-			/*
-			 * Do not trap faults against the zero page. The
-			 * read-only data is likely to be read-cached on the
-			 * local CPU cache and it is less useful to know about
-			 * local vs remote hits on the zero page.
-			 */
-			if (!is_huge_zero_page(page) &&
-			    !pmd_numa(*pmd)) {
-				pmdp_set_numa(mm, addr, pmd);
-				ret = HPAGE_PMD_NR;
-			}
-		}
+		entry = pmdp_get_and_clear(mm, addr, pmd);
+		entry = pmd_modify(entry, newprot);
+		ret = HPAGE_PMD_NR;
+		set_pmd_at(mm, addr, pmd, entry);
+		BUG_ON(pmd_write(entry));
 		spin_unlock(ptl);
 	}
 
@@ -1794,15 +1776,17 @@ static int __split_huge_page_map(struct page *page,
 	haddr = address;
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
 		pte_t *pte, entry;
+		pgprot_t newprot = vma->vm_page_prot;
+
 		BUG_ON(PageCompound(page+i));
-		entry = mk_pte(page + i, vma->vm_page_prot);
+		if (pmd_protnone(*pmd))
+			newprot = PAGE_NONE;
+		entry = mk_pte(page + i, newprot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		if (!pmd_write(*pmd))
 			entry = pte_wrprotect(entry);
 		if (!pmd_young(*pmd))
 			entry = pte_mkold(entry);
-		if (pmd_numa(*pmd))
-			entry = pte_mknuma(entry);
 		pte = pte_offset_map(&_pmd, haddr);
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, haddr, pte, entry);
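When a protnone huge pmd is split, the child ptes must inherit the hinting state or
pending NUMA faults would be silently lost; that is what the PAGE_NONE propagation
in __split_huge_page_map() above preserves. The same invariant in miniature
(invented types, same idea):

/* Miniature model of propagating prot-none from a huge pmd to its ptes. */
#include <assert.h>
#include <stdbool.h>

#define SUBPAGES 512	/* HPAGE_PMD_NR on x86-64 */

struct mini_ent { bool present; bool protnone; };

static void split_model(const struct mini_ent *pmd,
			struct mini_ent pte[SUBPAGES])
{
	for (int i = 0; i < SUBPAGES; i++) {
		/* inherit prot-none so each subpage still hint-faults */
		pte[i].protnone = pmd->protnone;
		pte[i].present = !pmd->protnone;
	}
}

int main(void)
{
	struct mini_ent pmd = { .present = false, .protnone = true };
	struct mini_ent ptes[SUBPAGES];

	split_model(&pmd, ptes);
	assert(ptes[0].protnone && !ptes[0].present);
	return 0;
}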
diff --git a/mm/memory.c b/mm/memory.c
index e229970e4223..62f1b7b82aef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3118,9 +3118,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
	 * validation through pte_unmap_same(). It's of NUMA type but
	 * the pfn may be screwed if the read is non atomic.
	 *
-	 * ptep_modify_prot_start is not called as this is clearing
-	 * the _PAGE_NUMA bit and it is not really expected that there
-	 * would be concurrent hardware modifications to the PTE.
+	 * We can safely just do a "set_pte_at()", because the old
+	 * page table entry is not accessible, so there would be no
+	 * concurrent hardware modifications to the PTE.
	 */
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
@@ -3129,7 +3129,11 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	pte = pte_mknonnuma(pte);
+	/* Make it present again */
+	/* Should we make it writable? We don't have the fault flags */
+	pte = pte_modify(pte, vma->vm_page_prot);
+	pte = pte_mkyoung(pte);
+
 	set_pte_at(mm, addr, ptep, pte);
 	update_mmu_cache(vma, addr, ptep);
 
@@ -3218,7 +3222,7 @@ static int handle_pte_fault(struct mm_struct *mm,
					pte, pmd, flags, entry);
 	}
 
-	if (pte_numa(entry))
+	if (pte_protnone(entry))
 		return do_numa_page(mm, vma, address, entry, pte, pmd);
 
 	ptl = pte_lockptr(mm, pmd);
@@ -3296,7 +3300,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			if (pmd_trans_splitting(orig_pmd))
				return 0;
 
-			if (pmd_numa(orig_pmd))
+			if (pmd_protnone(orig_pmd))
				return do_huge_pmd_numa_page(mm, vma, address,
							     orig_pmd, pmd);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f47..5e22f1ff4e33 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -635,7 +635,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 {
 	int nr_updated;
 
-	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
 	if (nr_updated)
 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
diff --git a/mm/migrate.c b/mm/migrate.c
index f78ec9bd454d..2600f33416b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1901,7 +1901,6 @@ out_fail:
 out_dropref:
 	ptl = pmd_lock(mm, pmd);
 	if (pmd_same(*pmd, entry)) {
-		entry = pmd_mknonnuma(entry);
 		set_pmd_at(mm, mmun_start, pmd, entry);
 		update_mmu_cache_pmd(vma, address, &entry);
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c43d557941f8..aef6fe6aba36 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -82,34 +82,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		oldpte = *pte;
 		if (pte_present(oldpte)) {
 			pte_t ptent;
-			bool updated = false;
 
-			if (!prot_numa) {
-				ptent = ptep_modify_prot_start(mm, addr, pte);
-				if (pte_numa(ptent))
-					ptent = pte_mknonnuma(ptent);
-				ptent = pte_modify(ptent, newprot);
-				/*
-				 * Avoid taking write faults for pages we
-				 * know to be dirty.
-				 */
-				if (dirty_accountable && pte_dirty(ptent))
-					ptent = pte_mkwrite(ptent);
-				ptep_modify_prot_commit(mm, addr, pte, ptent);
-				updated = true;
-			} else {
-				struct page *page;
-
-				page = vm_normal_page(vma, addr, oldpte);
-				if (page && !PageKsm(page)) {
-					if (!pte_numa(oldpte)) {
-						ptep_set_numa(mm, addr, pte);
-						updated = true;
-					}
-				}
-			}
-			if (updated)
-				pages++;
+			ptent = ptep_modify_prot_start(mm, addr, pte);
+			ptent = pte_modify(ptent, newprot);
+			/*
+			 * Avoid taking write faults for pages we
+			 * know to be dirty.
+			 */
+			if (dirty_accountable && pte_dirty(ptent))
+				ptent = pte_mkwrite(ptent);
+			ptep_modify_prot_commit(mm, addr, pte, ptent);
+			pages++;
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -164,8 +147,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
			if (next - addr != HPAGE_PMD_SIZE)
				split_huge_page_pmd(vma, addr, pmd);
			else {
-				int nr_ptes = change_huge_pmd(vma, pmd, addr,
-						newprot, prot_numa);
+				int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot);
 
				if (nr_ptes) {
					if (nr_ptes == HPAGE_PMD_NR) {
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e028ecb..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
 {
	pmd_t entry = *pmdp;
-	if (pmd_numa(entry))
-		entry = pmd_mknonnuma(entry);
	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
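Seen end to end, the series turns NUMA hinting into exactly the PROT_NONE mechanism
userspace already knows: change_prot_numa() installs PAGE_NONE, the next touch
faults, and do_numa_page() restores vma->vm_page_prot. The same round trip can be
sketched from userspace with mprotect() and a SIGSEGV handler (illustrative analogue
only; the kernel side is what the diff above implements):

/* Userspace analogue of the protnone hinting round trip. */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static char *page;
static size_t pagesz;

static void on_segv(int sig, siginfo_t *si, void *uc)
{
	(void)sig; (void)si; (void)uc;
	/* the "hinting fault": note the access, then make it present again */
	write(2, "hint fault\n", 11);		/* async-signal-safe */
	mprotect(page, pagesz, PROT_READ | PROT_WRITE);
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = on_segv, .sa_flags = SA_SIGINFO };

	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);

	pagesz = (size_t)sysconf(_SC_PAGESIZE);
	page = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED)
		return 1;
	memset(page, 0, pagesz);

	mprotect(page, pagesz, PROT_NONE);	/* change_prot_numa() analogue */
	page[0] = 1;				/* triggers the "hinting" fault */
	printf("recovered: %d\n", page[0]);	/* do_numa_page() analogue ran */
	return 0;
}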