 arch/x86/mm/fault.c | 243 +++++++++++++---------------------------------------
 1 file changed, 58 insertions(+), 185 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d973e61e450d..4b0a1b9404b1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -42,6 +42,64 @@ enum x86_pf_error_code {
 };
 
 /*
+ * Handle a possible vmalloc fault. We just copy the
+ * top-level page table entry if necessary.
+ *
+ * With PAE, the top-most pgd entry is always shared,
+ * and that's where the vmalloc area is. So PAE had
+ * better never have any vmalloc faults.
+ *
+ * NOTE! This on purpose does *NOT* use pgd_present()
+ * and such generic accessor functions, because
+ * the pgd may contain a folded pud/pmd, and is thus
+ * always "present". We access the actual hardware
+ * state directly, except for the final "set_pgd()"
+ * that may go through a paravirtualization layer.
+ *
+ * Also note the disgusting hackery for the whole
+ * paravirtualization case. Since PAE isn't an issue,
+ * we know that the pmd is the top level, and we just
+ * short-circuit it all.
+ *
+ * We *seriously* need to get rid of the crazy
+ * paravirtualization crud.
+ */
+static nokprobe_inline int vmalloc_fault(unsigned long address)
+{
+#ifdef CONFIG_X86_PAE
+	return -1;
+#else
+	pgd_t *pgd_dst, pgd_entry;
+	unsigned index = pgd_index(address);
+
+	if (index < KERNEL_PGD_BOUNDARY)
+		return -1;
+
+	pgd_entry = init_mm.pgd[index];
+	if (!(pgd_entry.pgd & 1))
+		return -1;
+
+	pgd_dst = __va(PAGE_MASK & read_cr3());
+	pgd_dst += index;
+
+	if (pgd_dst->pgd)
+		return -1;
+
+#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_X86_32
+	set_pmd((pmd_t *)pgd_dst, (pmd_t){(pud_t){pgd_entry}});
+#else
+	set_pgd(pgd_dst, pgd_entry);
+	arch_flush_lazy_mmu_mode(); // WTF?
+#endif
+#else
+	*pgd_dst = pgd_entry;
+#endif
+	return 0;
+#endif
+}
+
+/*
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
@@ -189,110 +247,6 @@ DEFINE_SPINLOCK(pgd_lock);
 LIST_HEAD(pgd_list);
 
 #ifdef CONFIG_X86_32
-static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-{
-	unsigned index = pgd_index(address);
-	pgd_t *pgd_k;
-	pud_t *pud, *pud_k;
-	pmd_t *pmd, *pmd_k;
-
-	pgd += index;
-	pgd_k = init_mm.pgd + index;
-
-	if (!pgd_present(*pgd_k))
-		return NULL;
-
-	/*
-	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
-	 * and redundant with the set_pmd() on non-PAE. As would
-	 * set_pud.
-	 */
-	pud = pud_offset(pgd, address);
-	pud_k = pud_offset(pgd_k, address);
-	if (!pud_present(*pud_k))
-		return NULL;
-
-	pmd = pmd_offset(pud, address);
-	pmd_k = pmd_offset(pud_k, address);
-	if (!pmd_present(*pmd_k))
-		return NULL;
-
-	if (!pmd_present(*pmd))
-		set_pmd(pmd, *pmd_k);
-	else
-		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-
-	return pmd_k;
-}
-
-void vmalloc_sync_all(void)
-{
-	unsigned long address;
-
-	if (SHARED_KERNEL_PMD)
-		return;
-
-	for (address = VMALLOC_START & PMD_MASK;
-	     address >= TASK_SIZE && address < FIXADDR_TOP;
-	     address += PMD_SIZE) {
-		struct page *page;
-
-		spin_lock(&pgd_lock);
-		list_for_each_entry(page, &pgd_list, lru) {
-			spinlock_t *pgt_lock;
-			pmd_t *ret;
-
-			/* the pgt_lock only for Xen */
-			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-
-			spin_lock(pgt_lock);
-			ret = vmalloc_sync_one(page_address(page), address);
-			spin_unlock(pgt_lock);
-
-			if (!ret)
-				break;
-		}
-		spin_unlock(&pgd_lock);
-	}
-}
-
-/*
- * 32-bit:
- *
- * Handle a fault on the vmalloc or module mapping area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-	unsigned long pgd_paddr;
-	pmd_t *pmd_k;
-	pte_t *pte_k;
-
-	/* Make sure we are in vmalloc area: */
-	if (!(address >= VMALLOC_START && address < VMALLOC_END))
-		return -1;
-
-	WARN_ON_ONCE(in_nmi());
-
-	/*
-	 * Synchronize this task's top level page-table
-	 * with the 'reference' page table.
-	 *
-	 * Do _not_ use "current" here. We might be inside
-	 * an interrupt in the middle of a task switch..
-	 */
-	pgd_paddr = read_cr3();
-	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-	if (!pmd_k)
-		return -1;
-
-	pte_k = pte_offset_kernel(pmd_k, address);
-	if (!pte_present(*pte_k))
-		return -1;
-
-	return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
 /*
  * Did it hit the DOS screen memory VA from vm86 mode?
  */
@@ -347,87 +301,6 @@ out:
 #else /* CONFIG_X86_64: */
 
-void vmalloc_sync_all(void)
-{
-	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
-}
-
-/*
- * 64-bit:
- *
- * Handle a fault on the vmalloc area
- *
- * This assumes no large pages in there.
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-	pgd_t *pgd, *pgd_ref;
-	pud_t *pud, *pud_ref;
-	pmd_t *pmd, *pmd_ref;
-	pte_t *pte, *pte_ref;
-
-	/* Make sure we are in vmalloc area: */
-	if (!(address >= VMALLOC_START && address < VMALLOC_END))
-		return -1;
-
-	WARN_ON_ONCE(in_nmi());
-
-	/*
-	 * Copy kernel mappings over when needed. This can also
-	 * happen within a race in page table update. In the later
-	 * case just flush:
-	 */
-	pgd = pgd_offset(current->active_mm, address);
-	pgd_ref = pgd_offset_k(address);
-	if (pgd_none(*pgd_ref))
-		return -1;
-
-	if (pgd_none(*pgd)) {
-		set_pgd(pgd, *pgd_ref);
-		arch_flush_lazy_mmu_mode();
-	} else {
-		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
-	}
-
-	/*
-	 * Below here mismatches are bugs because these lower tables
-	 * are shared:
-	 */
-
-	pud = pud_offset(pgd, address);
-	pud_ref = pud_offset(pgd_ref, address);
-	if (pud_none(*pud_ref))
-		return -1;
-
-	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
-		BUG();
-
-	pmd = pmd_offset(pud, address);
-	pmd_ref = pmd_offset(pud_ref, address);
-	if (pmd_none(*pmd_ref))
-		return -1;
-
-	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
-		BUG();
-
-	pte_ref = pte_offset_kernel(pmd_ref, address);
-	if (!pte_present(*pte_ref))
-		return -1;
-
-	pte = pte_offset_kernel(pmd, address);
-
-	/*
-	 * Don't use pte_page here, because the mappings can point
-	 * outside mem_map, and the NUMA hash lookup cannot handle
-	 * that:
-	 */
-	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
-		BUG();
-
-	return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
 #ifdef CONFIG_CPU_SUP_AMD
 static const char errata93_warning[] = KERN_ERR
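
To make the control flow of the new handler easier to follow outside the kernel tree, here is a minimal user-space sketch of the same idea: on a fault in the kernel half of the address space, copy the single missing top-level entry from the reference page table into the faulting task's table, and bail out in every other case so the normal fault path can decide what to do. Everything in it (mock_vmalloc_fault, reference_pgd, MOCK_KERNEL_BOUNDARY, the 512-entry table) is an invented stand-in for illustration, not kernel code and not part of the patch.

/*
 * Mock model of the lazy top-level sync that the new vmalloc_fault()
 * performs: copy one missing top-level entry from a reference table
 * into the faulting task's table. All names and sizes are invented.
 */
#include <stdio.h>
#include <stdint.h>

#define MOCK_PTRS_PER_PGD	512	/* entries per top-level table */
#define MOCK_KERNEL_BOUNDARY	256	/* first index of the kernel half */
#define MOCK_PRESENT		0x1UL	/* low bit = "present", as in the patch */

typedef struct { uintptr_t val; } mock_pgd_t;

static mock_pgd_t reference_pgd[MOCK_PTRS_PER_PGD];	/* plays the role of init_mm.pgd */

/* Returns 0 if the fault was handled by copying one entry, -1 otherwise. */
static int mock_vmalloc_fault(mock_pgd_t *task_pgd, unsigned index)
{
	mock_pgd_t entry;

	if (index < MOCK_KERNEL_BOUNDARY)	/* user half: not our problem */
		return -1;

	entry = reference_pgd[index];
	if (!(entry.val & MOCK_PRESENT))	/* reference entry not populated */
		return -1;

	if (task_pgd[index].val)		/* already synced: a real fault */
		return -1;

	task_pgd[index] = entry;		/* the one-entry copy */
	return 0;
}

int main(void)
{
	mock_pgd_t task_pgd[MOCK_PTRS_PER_PGD] = { { 0 } };

	reference_pgd[300].val = 0xabc000 | MOCK_PRESENT;

	printf("first fault:  %d\n", mock_vmalloc_fault(task_pgd, 300));	/* 0: entry copied */
	printf("second fault: %d\n", mock_vmalloc_fault(task_pgd, 300));	/* -1: already there */
	printf("user fault:   %d\n", mock_vmalloc_fault(task_pgd, 10));		/* -1: below boundary */
	return 0;
}

The second call returning -1 mirrors the real handler's behaviour: once the entry has been copied (or was never missing in the first place), the fault is not a vmalloc-sync problem and has to be handled as a genuine fault by the rest of the page-fault path.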