From: Nicholas Piggin <npiggin@gmail.com> To: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org> Cc: Nicholas Piggin <npiggin@gmail.com>, linux-kernel@vger.kernel.org, linux-arch@vger.kernel.org, linuxppc-dev@lists.ozlabs.org, Zefan Li <lizefan@huawei.com>, Jonathan Cameron <Jonathan.Cameron@Huawei.com>, Christoph Hellwig <hch@infradead.org>, Christophe Leroy <christophe.leroy@csgroup.eu>, Rick Edgecombe <rick.p.edgecombe@intel.com>, Ding Tianhong <dingtianhong@huawei.com> Subject: [PATCH v10 11/12] mm/vmalloc: Hugepage vmalloc mappings Date: Sun, 24 Jan 2021 18:22:29 +1000 [thread overview] Message-ID: <20210124082230.2118861-12-npiggin@gmail.com> (raw) In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC enables support on architectures that define HAVE_ARCH_HUGE_VMAP and supports PMD sized vmap mappings. vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or larger, and fall back to small pages if that was unsuccessful. Architectures must ensure that any arch specific vmalloc allocations that require PAGE_SIZE mappings (e.g., module allocations vs strict module rwx) use the VM_NOHUGE flag to inhibit larger mappings. When hugepage vmalloc mappings are enabled in the next patch, this reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%. This can result in more internal fragmentation and memory overhead for a given allocation, an option nohugevmalloc is added to disable at boot. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- arch/Kconfig | 10 +++ include/linux/vmalloc.h | 18 ++++ mm/page_alloc.c | 5 +- mm/vmalloc.c | 192 ++++++++++++++++++++++++++++++---------- 4 files changed, 177 insertions(+), 48 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 24862d15f3a3..f87feb616184 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -724,6 +724,16 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD config HAVE_ARCH_HUGE_VMAP bool +config HAVE_ARCH_HUGE_VMALLOC + depends on HAVE_ARCH_HUGE_VMAP + bool + help + Archs that select this would be capable of PMD-sized vmaps (i.e., + arch_vmap_pmd_supported() returns true), and they must make no + assumptions that vmalloc memory is mapped with PAGE_SIZE ptes. The + VM_NOHUGE flag can be used to prohibit arch-specific allocations from + using hugepages to help with this (e.g., modules may require it). + config ARCH_WANT_HUGE_PMD_SHARE bool diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 40649c4bb5a2..2ba023daf188 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -25,6 +25,7 @@ struct notifier_block; /* in notifier.h */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */ +#define VM_NOHUGE 0x00000200 /* force PAGE_SIZE pte mapping */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. @@ -59,6 +60,7 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; + unsigned int page_order; unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; @@ -194,6 +196,18 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } + +static inline bool is_vm_area_hugepages(const void *addr) +{ + /* + * This may not 100% tell if the area is mapped with > PAGE_SIZE + * page table entries, if for some reason the architecture indicates + * larger sizes are available but decides not to use them, nothing + * prevents that. This only indicates the size of the physical page + * allocated in the vmalloc layer. + */ + return (find_vm_area(addr)->page_order > 0); +} #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, @@ -210,6 +224,10 @@ unmap_kernel_range_noflush(unsigned long addr, unsigned long size) static inline void set_vm_flush_reset_perms(void *addr) { } +static inline bool is_vm_area_hugepages(const void *addr) +{ + return false; +} #endif /* for /dev/kmem */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027f6481ba59..b7a9661fa232 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> +#include <linux/vmalloc.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -8238,6 +8239,7 @@ void *__init alloc_large_system_hash(const char *tablename, void *table = NULL; gfp_t gfp_flags; bool virt; + bool huge; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8305,6 +8307,7 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -8321,7 +8324,7 @@ void *__init alloc_large_system_hash(const char *tablename, pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, - virt ? "vmalloc" : "linear"); + virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) *_hash_shift = log2qty; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0377e1d059e5..eef61e0f5170 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -42,6 +42,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)x; @@ -477,31 +490,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { unsigned long start = addr; - unsigned long end = addr + size; - unsigned long next; pgd_t *pgd; + unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; @@ -523,6 +517,65 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT); +} + int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages) { @@ -2416,6 +2469,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, { int i; + /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); @@ -2449,11 +2503,12 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. */ - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { + unsigned long page_size = PAGE_SIZE << area->page_order; start = min(addr, start); - end = max(addr + PAGE_SIZE, end); + end = max(addr + page_size, end); flush_dmap = 1; } } @@ -2496,11 +2551,11 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, area->page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2691,15 +2746,18 @@ EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, + int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; - unsigned long array_size; - unsigned int i; + unsigned int page_order = page_shift - PAGE_SHIFT; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); + unsigned int nr_small_pages = size >> PAGE_SHIFT; struct page **pages; + unsigned int i; - array_size = (unsigned long)nr_pages * sizeof(struct page *); + array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2718,30 +2776,35 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } area->pages = pages; - area->nr_pages = nr_pages; + area->nr_pages = nr_small_pages; + area->page_order = page_order; - for (i = 0; i < area->nr_pages; i++) { + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page; + int p; - if (node == NUMA_NO_NODE) - page = alloc_page(gfp_mask); - else - page = alloc_pages_node(node, gfp_mask, 0); - + page = alloc_pages_node(node, gfp_mask, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) goto fail; return area->addr; @@ -2749,7 +2812,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); + (area->nr_pages*PAGE_SIZE), size); __vfree(area->addr); return NULL; } @@ -2780,19 +2843,44 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT; - size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | + if (vmap_allow_huge && !(vm_flags & VM_NOHUGE) && + arch_vmap_pmd_supported(prot) && + (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) { + unsigned long size_per_node; + + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2806,8 +2894,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, NULL, + if (shift > PAGE_SHIFT) { + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + + if (!area) { + /* Warn for area allocation, page allocations already warn */ + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); + } return NULL; } -- 2.23.0
WARNING: multiple messages have this Message-ID (diff)
From: Nicholas Piggin <npiggin@gmail.com> To: linux-mm@kvack.org, Andrew Morton <akpm@linux-foundation.org> Cc: linux-arch@vger.kernel.org, Ding Tianhong <dingtianhong@huawei.com>, linux-kernel@vger.kernel.org, Nicholas Piggin <npiggin@gmail.com>, Christoph Hellwig <hch@infradead.org>, Zefan Li <lizefan@huawei.com>, Jonathan Cameron <Jonathan.Cameron@Huawei.com>, Rick Edgecombe <rick.p.edgecombe@intel.com>, linuxppc-dev@lists.ozlabs.org Subject: [PATCH v10 11/12] mm/vmalloc: Hugepage vmalloc mappings Date: Sun, 24 Jan 2021 18:22:29 +1000 [thread overview] Message-ID: <20210124082230.2118861-12-npiggin@gmail.com> (raw) In-Reply-To: <20210124082230.2118861-1-npiggin@gmail.com> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC enables support on architectures that define HAVE_ARCH_HUGE_VMAP and supports PMD sized vmap mappings. vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or larger, and fall back to small pages if that was unsuccessful. Architectures must ensure that any arch specific vmalloc allocations that require PAGE_SIZE mappings (e.g., module allocations vs strict module rwx) use the VM_NOHUGE flag to inhibit larger mappings. When hugepage vmalloc mappings are enabled in the next patch, this reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%. This can result in more internal fragmentation and memory overhead for a given allocation, an option nohugevmalloc is added to disable at boot. Signed-off-by: Nicholas Piggin <npiggin@gmail.com> --- arch/Kconfig | 10 +++ include/linux/vmalloc.h | 18 ++++ mm/page_alloc.c | 5 +- mm/vmalloc.c | 192 ++++++++++++++++++++++++++++++---------- 4 files changed, 177 insertions(+), 48 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 24862d15f3a3..f87feb616184 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -724,6 +724,16 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD config HAVE_ARCH_HUGE_VMAP bool +config HAVE_ARCH_HUGE_VMALLOC + depends on HAVE_ARCH_HUGE_VMAP + bool + help + Archs that select this would be capable of PMD-sized vmaps (i.e., + arch_vmap_pmd_supported() returns true), and they must make no + assumptions that vmalloc memory is mapped with PAGE_SIZE ptes. The + VM_NOHUGE flag can be used to prohibit arch-specific allocations from + using hugepages to help with this (e.g., modules may require it). + config ARCH_WANT_HUGE_PMD_SHARE bool diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 40649c4bb5a2..2ba023daf188 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -25,6 +25,7 @@ struct notifier_block; /* in notifier.h */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ #define VM_MAP_PUT_PAGES 0x00000100 /* put pages and free array in vfree */ +#define VM_NOHUGE 0x00000200 /* force PAGE_SIZE pte mapping */ /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. @@ -59,6 +60,7 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; + unsigned int page_order; unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; @@ -194,6 +196,18 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } + +static inline bool is_vm_area_hugepages(const void *addr) +{ + /* + * This may not 100% tell if the area is mapped with > PAGE_SIZE + * page table entries, if for some reason the architecture indicates + * larger sizes are available but decides not to use them, nothing + * prevents that. This only indicates the size of the physical page + * allocated in the vmalloc layer. + */ + return (find_vm_area(addr)->page_order > 0); +} #else static inline int map_kernel_range_noflush(unsigned long start, unsigned long size, @@ -210,6 +224,10 @@ unmap_kernel_range_noflush(unsigned long addr, unsigned long size) static inline void set_vm_flush_reset_perms(void *addr) { } +static inline bool is_vm_area_hugepages(const void *addr) +{ + return false; +} #endif /* for /dev/kmem */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027f6481ba59..b7a9661fa232 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include <linux/padata.h> #include <linux/khugepaged.h> #include <linux/buffer_head.h> +#include <linux/vmalloc.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -8238,6 +8239,7 @@ void *__init alloc_large_system_hash(const char *tablename, void *table = NULL; gfp_t gfp_flags; bool virt; + bool huge; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -8305,6 +8307,7 @@ void *__init alloc_large_system_hash(const char *tablename, } else if (get_order(size) >= MAX_ORDER || hashdist) { table = __vmalloc(size, gfp_flags); virt = true; + huge = is_vm_area_hugepages(table); } else { /* * If bucketsize is not a power-of-two, we may free @@ -8321,7 +8324,7 @@ void *__init alloc_large_system_hash(const char *tablename, pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n", tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size, - virt ? "vmalloc" : "linear"); + virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear"); if (_hash_shift) *_hash_shift = log2qty; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0377e1d059e5..eef61e0f5170 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -42,6 +42,19 @@ #include "internal.h" #include "pgalloc-track.h" +#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + bool is_vmalloc_addr(const void *x) { unsigned long addr = (unsigned long)x; @@ -477,31 +490,12 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -/** - * map_kernel_range_noflush - map kernel VM area with the specified pages - * @addr: start of the VM area to map - * @size: size of the VM area to map - * @prot: page protection flags to use - * @pages: pages to map - * - * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should - * have been allocated using get_vm_area() and its friends. - * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible for - * calling flush_cache_vmap() on to-be-mapped areas before calling this - * function. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int map_kernel_range_noflush(unsigned long addr, unsigned long size, - pgprot_t prot, struct page **pages) +static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { unsigned long start = addr; - unsigned long end = addr + size; - unsigned long next; pgd_t *pgd; + unsigned long next; int err = 0; int nr = 0; pgtbl_mod_mask mask = 0; @@ -523,6 +517,65 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + +/** + * map_kernel_range_noflush - map kernel VM area with the specified pages + * @addr: start of the VM area to map + * @size: size of the VM area to map + * @prot: page protection flags to use + * @pages: pages to map + * + * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should + * have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible for + * calling flush_cache_vmap() on to-be-mapped areas before calling this + * function. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int map_kernel_range_noflush(unsigned long addr, unsigned long size, + pgprot_t prot, struct page **pages) +{ + return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT); +} + int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages) { @@ -2416,6 +2469,7 @@ static inline void set_area_direct_map(const struct vm_struct *area, { int i; + /* HUGE_VMALLOC passes small pages to set_direct_map */ for (i = 0; i < area->nr_pages; i++) if (page_address(area->pages[i])) set_direct_map(area->pages[i]); @@ -2449,11 +2503,12 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) * map. Find the start and end range of the direct mappings to make sure * the vm_unmap_aliases() flush includes the direct map. */ - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); if (addr) { + unsigned long page_size = PAGE_SIZE << area->page_order; start = min(addr, start); - end = max(addr + PAGE_SIZE, end); + end = max(addr + page_size, end); flush_dmap = 1; } } @@ -2496,11 +2551,11 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i; - for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { struct page *page = area->pages[i]; BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, area->page_order); } atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); @@ -2691,15 +2746,18 @@ EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, + int node) { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; - unsigned long array_size; - unsigned int i; + unsigned int page_order = page_shift - PAGE_SHIFT; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); + unsigned int nr_small_pages = size >> PAGE_SHIFT; struct page **pages; + unsigned int i; - array_size = (unsigned long)nr_pages * sizeof(struct page *); + array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2718,30 +2776,35 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } area->pages = pages; - area->nr_pages = nr_pages; + area->nr_pages = nr_small_pages; + area->page_order = page_order; - for (i = 0; i < area->nr_pages; i++) { + /* + * Careful, we allocate and map page_order pages, but tracking is done + * per PAGE_SIZE page so as to keep the vm_struct APIs independent of + * the physical/mapped size. + */ + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page; + int p; - if (node == NUMA_NO_NODE) - page = alloc_page(gfp_mask); - else - page = alloc_pages_node(node, gfp_mask, 0); - + page = alloc_pages_node(node, gfp_mask, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vfree() */ area->nr_pages = i; atomic_long_add(area->nr_pages, &nr_vmalloc_pages); goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) goto fail; return area->addr; @@ -2749,7 +2812,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); + (area->nr_pages*PAGE_SIZE), size); __vfree(area->addr); return NULL; } @@ -2780,19 +2843,44 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT; - size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; - area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | + if (vmap_allow_huge && !(vm_flags & VM_NOHUGE) && + arch_vmap_pmd_supported(prot) && + (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) { + unsigned long size_per_node; + + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -2806,8 +2894,18 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr; fail: - warn_alloc(gfp_mask, NULL, + if (shift > PAGE_SHIFT) { + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + + if (!area) { + /* Warn for area allocation, page allocations already warn */ + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); + } return NULL; } -- 2.23.0
next prev parent reply other threads:[~2021-01-24 8:28 UTC|newest] Thread overview: 73+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-01-24 8:22 [PATCH v10 00/12] huge vmalloc mappings Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 8:22 ` [PATCH v10 01/12] mm/vmalloc: fix vmalloc_to_page for huge vmap mappings Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 11:31 ` Christoph Hellwig 2021-01-24 11:31 ` Christoph Hellwig 2021-01-24 8:22 ` [PATCH v10 02/12] mm: apply_to_pte_range warn and fail if a large pte is encountered Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 11:32 ` Christoph Hellwig 2021-01-24 11:32 ` Christoph Hellwig 2021-01-24 8:22 ` [PATCH v10 03/12] mm/vmalloc: rename vmap_*_range vmap_pages_*_range Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 11:34 ` Christoph Hellwig 2021-01-24 11:34 ` Christoph Hellwig 2021-01-24 8:22 ` [PATCH v10 04/12] mm/ioremap: rename ioremap_*_range to vmap_*_range Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 11:36 ` Christoph Hellwig 2021-01-24 11:36 ` Christoph Hellwig 2021-01-24 12:04 ` Nicholas Piggin 2021-01-24 12:04 ` Nicholas Piggin 2021-01-24 8:22 ` [PATCH v10 05/12] mm: HUGE_VMAP arch support cleanup Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 11:40 ` Christoph Hellwig 2021-01-24 11:40 ` Christoph Hellwig 2021-01-24 11:40 ` Christoph Hellwig 2021-01-24 12:22 ` Nicholas Piggin 2021-01-24 12:22 ` Nicholas Piggin 2021-01-24 12:22 ` Nicholas Piggin 2021-01-25 8:19 ` Christophe Leroy 2021-01-25 8:19 ` Christophe Leroy 2021-01-25 8:19 ` Christophe Leroy 2021-01-25 8:40 ` Christophe Leroy 2021-01-25 8:40 ` Christophe Leroy 2021-01-25 8:40 ` Christophe Leroy 2021-01-24 8:22 ` [PATCH v10 06/12] powerpc: inline huge vmap supported functions Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-25 8:42 ` Christophe Leroy 2021-01-25 8:42 ` Christophe Leroy 2021-01-25 11:37 ` Nicholas Piggin 2021-01-25 11:37 ` Nicholas Piggin 2021-01-24 8:22 ` [PATCH v10 07/12] arm64: " Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 8:22 ` [PATCH v10 08/12] x86: " Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 8:22 ` [PATCH v10 09/12] mm: Move vmap_range from mm/ioremap.c to mm/vmalloc.c Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 14:49 ` Christoph Hellwig 2021-01-24 14:49 ` Christoph Hellwig 2021-01-24 8:22 ` [PATCH v10 10/12] mm/vmalloc: add vmap_range_noflush variant Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin 2021-01-24 14:51 ` Christoph Hellwig 2021-01-24 14:51 ` Christoph Hellwig 2021-01-24 8:22 ` Nicholas Piggin [this message] 2021-01-24 8:22 ` [PATCH v10 11/12] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin 2021-01-24 15:07 ` Christoph Hellwig 2021-01-24 15:07 ` Christoph Hellwig 2021-01-24 18:06 ` Randy Dunlap 2021-01-24 18:06 ` Randy Dunlap 2021-01-24 23:17 ` Nicholas Piggin 2021-01-24 23:17 ` Nicholas Piggin 2021-01-25 9:14 ` Christophe Leroy 2021-01-25 9:14 ` Christophe Leroy 2021-01-25 11:37 ` Nicholas Piggin 2021-01-25 11:37 ` Nicholas Piggin 2021-01-25 12:13 ` Christophe Leroy 2021-01-25 12:13 ` Christophe Leroy 2021-01-25 12:24 ` David Laight 2021-01-26 9:50 ` Nicholas Piggin 2021-01-26 9:50 ` Nicholas Piggin 2021-01-24 8:22 ` [PATCH v10 12/12] powerpc/64s/radix: Enable huge " Nicholas Piggin 2021-01-24 8:22 ` Nicholas Piggin
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20210124082230.2118861-12-npiggin@gmail.com \ --to=npiggin@gmail.com \ --cc=Jonathan.Cameron@Huawei.com \ --cc=akpm@linux-foundation.org \ --cc=christophe.leroy@csgroup.eu \ --cc=dingtianhong@huawei.com \ --cc=hch@infradead.org \ --cc=linux-arch@vger.kernel.org \ --cc=linux-kernel@vger.kernel.org \ --cc=linux-mm@kvack.org \ --cc=linuxppc-dev@lists.ozlabs.org \ --cc=lizefan@huawei.com \ --cc=rick.p.edgecombe@intel.com \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: linkBe sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.