linux-arm-kernel.lists.infradead.org archive mirror
* [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
@ 2019-06-10  4:38 Nicholas Piggin
  2019-06-10  4:38 ` [PATCH 2/4] arm64: support huge vmap vmalloc Nicholas Piggin
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  4:38 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel, Nicholas Piggin

ioremap_page_range is a generic function to create a kernel virtual
mapping, move it to mm/vmalloc.c and rename it vmap_range.

For clarity with this move, also:
- Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
- Rename vmap_page_range (which takes a page array) to vmap_pages.
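
For reference, the resulting prototypes in mm/vmalloc.c (taken from the
diff below) are:

  int vmap_range(unsigned long addr, unsigned long end,
		 phys_addr_t phys_addr, pgprot_t prot,
		 unsigned int max_page_shift);
  static void vunmap_range(unsigned long addr, unsigned long end);
  static int vmap_pages_range(unsigned long start, unsigned long end,
			      pgprot_t prot, struct page **pages);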

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---

Fixed up the arm64 compile errors, fixed a few bugs, and tidied
things up a bit more.

Have tested powerpc and x86 but not arm64, so would appreciate a review
and test of the arm64 patch if possible.

 include/linux/vmalloc.h |   3 +
 lib/ioremap.c           | 173 +++---------------------------
 mm/vmalloc.c            | 228 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 229 insertions(+), 175 deletions(-)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 51e131245379..812bea5866d6 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -147,6 +147,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
 			struct page **pages);
 #ifdef CONFIG_MMU
+extern int vmap_range(unsigned long addr,
+		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+		       unsigned int max_page_shift);
 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 063213685563..e13946da8ec3 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -58,165 +58,24 @@ static inline int ioremap_pud_enabled(void) { return 0; }
 static inline int ioremap_pmd_enabled(void) { return 0; }
 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
 
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
-	pte_t *pte;
-	u64 pfn;
-
-	pfn = phys_addr >> PAGE_SHIFT;
-	pte = pte_alloc_kernel(pmd, addr);
-	if (!pte)
-		return -ENOMEM;
-	do {
-		BUG_ON(!pte_none(*pte));
-		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
-		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-	return 0;
-}
-
-static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
-				unsigned long end, phys_addr_t phys_addr,
-				pgprot_t prot)
-{
-	if (!ioremap_pmd_enabled())
-		return 0;
-
-	if ((end - addr) != PMD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
-		return 0;
-
-	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
-		return 0;
-
-	return pmd_set_huge(pmd, phys_addr, prot);
-}
-
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
-	pmd_t *pmd;
-	unsigned long next;
-
-	pmd = pmd_alloc(&init_mm, pud, addr);
-	if (!pmd)
-		return -ENOMEM;
-	do {
-		next = pmd_addr_end(addr, end);
-
-		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
-			continue;
-
-		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
-			return -ENOMEM;
-	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
-				unsigned long end, phys_addr_t phys_addr,
-				pgprot_t prot)
-{
-	if (!ioremap_pud_enabled())
-		return 0;
-
-	if ((end - addr) != PUD_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
-		return 0;
-
-	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
-		return 0;
-
-	return pud_set_huge(pud, phys_addr, prot);
-}
-
-static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
-	pud_t *pud;
-	unsigned long next;
-
-	pud = pud_alloc(&init_mm, p4d, addr);
-	if (!pud)
-		return -ENOMEM;
-	do {
-		next = pud_addr_end(addr, end);
-
-		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
-			continue;
-
-		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
-			return -ENOMEM;
-	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
-static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
-				unsigned long end, phys_addr_t phys_addr,
-				pgprot_t prot)
-{
-	if (!ioremap_p4d_enabled())
-		return 0;
-
-	if ((end - addr) != P4D_SIZE)
-		return 0;
-
-	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
-		return 0;
-
-	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
-		return 0;
-
-	return p4d_set_huge(p4d, phys_addr, prot);
-}
-
-static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
-		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
-{
-	p4d_t *p4d;
-	unsigned long next;
-
-	p4d = p4d_alloc(&init_mm, pgd, addr);
-	if (!p4d)
-		return -ENOMEM;
-	do {
-		next = p4d_addr_end(addr, end);
-
-		if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot))
-			continue;
-
-		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
-			return -ENOMEM;
-	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
-	return 0;
-}
-
 int ioremap_page_range(unsigned long addr,
 		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
 {
-	pgd_t *pgd;
-	unsigned long start;
-	unsigned long next;
-	int err;
-
-	might_sleep();
-	BUG_ON(addr >= end);
-
-	start = addr;
-	pgd = pgd_offset_k(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
-		if (err)
-			break;
-	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
-
-	flush_cache_vmap(start, end);
+	unsigned int max_page_shift = PAGE_SHIFT;
+
+	/*
+	 * Due to the max_page_shift parameter to vmap_range, platforms must
+	 * enable all smaller sizes to take advantage of a given size,
+	 * otherwise fall back to small pages.
+	 */
+	if (ioremap_pmd_enabled()) {
+		max_page_shift = PMD_SHIFT;
+		if (ioremap_pud_enabled()) {
+			max_page_shift = PUD_SHIFT;
+			if (ioremap_p4d_enabled())
+				max_page_shift = P4D_SHIFT;
+		}
+	}
 
-	return err;
+	return vmap_range(addr, end, phys_addr, prot, max_page_shift);
 }
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 233af6936c93..dd27cfb29b10 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -119,7 +119,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
 	} while (p4d++, addr = next, addr != end);
 }
 
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+static void vunmap_range(unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	unsigned long next;
@@ -135,6 +135,198 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
 }
 
 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+	pte_t *pte;
+	u64 pfn;
+
+	pfn = phys_addr >> PAGE_SHIFT;
+	pte = pte_alloc_kernel(pmd, addr);
+	if (!pte)
+		return -ENOMEM;
+	do {
+		BUG_ON(!pte_none(*pte));
+		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+		pfn++;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+		return 0;
+
+	if (max_page_shift < PMD_SHIFT)
+		return 0;
+
+	if ((end - addr) != PMD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+		return 0;
+
+	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+		return 0;
+
+	return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pmd_t *pmd;
+	unsigned long next;
+
+	pmd = pmd_alloc(&init_mm, pud, addr);
+	if (!pmd)
+		return -ENOMEM;
+	do {
+		next = pmd_addr_end(addr, end);
+
+		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
+					max_page_shift))
+			continue;
+
+		if (vmap_pte_range(pmd, addr, next, phys_addr, prot))
+			return -ENOMEM;
+	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_pud(pud_t *pud, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+		return 0;
+
+	if (max_page_shift < PUD_SHIFT)
+		return 0;
+
+	if ((end - addr) != PUD_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+		return 0;
+
+	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+		return 0;
+
+	return pud_set_huge(pud, phys_addr, prot);
+}
+
+static inline int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pud_t *pud;
+	unsigned long next;
+
+	pud = pud_alloc(&init_mm, p4d, addr);
+	if (!pud)
+		return -ENOMEM;
+	do {
+		next = pud_addr_end(addr, end);
+
+		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
+					max_page_shift))
+			continue;
+
+		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
+					max_page_shift))
+			return -ENOMEM;
+	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+		return 0;
+
+	if (max_page_shift < P4D_SHIFT)
+		return 0;
+
+	if ((end - addr) != P4D_SIZE)
+		return 0;
+
+	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+		return 0;
+
+	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+		return 0;
+
+	return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static inline int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	p4d_t *p4d;
+	unsigned long next;
+
+	p4d = p4d_alloc(&init_mm, pgd, addr);
+	if (!p4d)
+		return -ENOMEM;
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
+					max_page_shift))
+			continue;
+
+		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
+					max_page_shift))
+			return -ENOMEM;
+	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+	return 0;
+}
+
+static int vmap_range_noflush(unsigned long addr,
+			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+			unsigned int max_page_shift)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+
+	might_sleep();
+	BUG_ON(addr >= end);
+
+	start = addr;
+	pgd = pgd_offset_k(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+					max_page_shift);
+		if (err)
+			break;
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+	return err;
+}
+
+int vmap_range(unsigned long addr,
+		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+		       unsigned int max_page_shift)
+{
+	int ret;
+
+	ret = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
+	flush_cache_vmap(addr, end);
+
+	return ret;
+}
+
+static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pte_t *pte;
@@ -160,7 +352,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-static int vmap_pmd_range(pud_t *pud, unsigned long addr,
+static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pmd_t *pmd;
@@ -171,13 +363,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pmd_addr_end(addr, end);
-		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
+static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	pud_t *pud;
@@ -188,13 +380,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = pud_addr_end(addr, end);
-		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
+static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
 {
 	p4d_t *p4d;
@@ -205,7 +397,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
 		return -ENOMEM;
 	do {
 		next = p4d_addr_end(addr, end);
-		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr))
 			return -ENOMEM;
 	} while (p4d++, addr = next, addr != end);
 	return 0;
@@ -217,7 +409,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
  *
  * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
  */
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
+static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
 				   pgprot_t prot, struct page **pages)
 {
 	pgd_t *pgd;
@@ -230,7 +422,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
+		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr);
 		if (err)
 			return err;
 	} while (pgd++, addr = next, addr != end);
@@ -238,12 +430,12 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
 	return nr;
 }
 
-static int vmap_page_range(unsigned long start, unsigned long end,
+static int vmap_pages_range(unsigned long start, unsigned long end,
 			   pgprot_t prot, struct page **pages)
 {
 	int ret;
 
-	ret = vmap_page_range_noflush(start, end, prot, pages);
+	ret = vmap_pages_range_noflush(start, end, prot, pages);
 	flush_cache_vmap(start, end);
 	return ret;
 }
@@ -1148,7 +1340,7 @@ static void free_vmap_area(struct vmap_area *va)
  */
 static void unmap_vmap_area(struct vmap_area *va)
 {
-	vunmap_page_range(va->va_start, va->va_end);
+	vunmap_range(va->va_start, va->va_end);
 }
 
 /*
@@ -1586,7 +1778,7 @@ static void vb_free(const void *addr, unsigned long size)
 	rcu_read_unlock();
 	BUG_ON(!vb);
 
-	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+	vunmap_range((unsigned long)addr, (unsigned long)addr + size);
 
 	if (debug_pagealloc_enabled())
 		flush_tlb_kernel_range((unsigned long)addr,
@@ -1736,7 +1928,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
 		addr = va->va_start;
 		mem = (void *)addr;
 	}
-	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+	if (vmap_pages_range(addr, addr + size, prot, pages) < 0) {
 		vm_unmap_ram(mem, count);
 		return NULL;
 	}
@@ -1903,7 +2095,7 @@ void __init vmalloc_init(void)
 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
 			     pgprot_t prot, struct page **pages)
 {
-	return vmap_page_range_noflush(addr, addr + size, prot, pages);
+	return vmap_pages_range_noflush(addr, addr + size, prot, pages);
 }
 
 /**
@@ -1922,7 +2114,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
  */
 void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
-	vunmap_page_range(addr, addr + size);
+	vunmap_range(addr, addr + size);
 }
 EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 
@@ -1939,7 +2131,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 	unsigned long end = addr + size;
 
 	flush_cache_vunmap(addr, end);
-	vunmap_page_range(addr, end);
+	vunmap_range(addr, end);
 	flush_tlb_kernel_range(addr, end);
 }
 EXPORT_SYMBOL_GPL(unmap_kernel_range);
@@ -1950,7 +2142,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
 	unsigned long end = addr + get_vm_area_size(area);
 	int err;
 
-	err = vmap_page_range(addr, end, prot, pages);
+	err = vmap_pages_range(addr, end, prot, pages);
 
 	return err > 0 ? 0 : err;
 }
-- 
2.20.1



* [PATCH 2/4] arm64: support huge vmap vmalloc
  2019-06-10  4:38 [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Nicholas Piggin
@ 2019-06-10  4:38 ` Nicholas Piggin
  2019-06-10  5:47   ` Anshuman Khandual
  2019-06-10  4:38 ` [PATCH 3/4] powerpc/64s/radix: " Nicholas Piggin
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  4:38 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel, Nicholas Piggin

Applying huge vmap to vmalloc requires vmalloc_to_page to walk huge
pages. Define pud_large and pmd_large to support this.
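
These are used by the generic vmalloc_to_page() walk later in the series,
e.g. (from the mm/vmalloc.c hunk in patch 4):

	if (pud_large(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	...
	if (pmd_large(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);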

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/arm64/include/asm/pgtable.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2c41b04708fe..30fe7b344bf7 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -428,6 +428,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 				 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
 				 PMD_TYPE_SECT)
+#define pmd_large(pmd)		pmd_sect(pmd)
 
 #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
 #define pud_sect(pud)		(0)
@@ -438,6 +439,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 #define pud_table(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
 				 PUD_TYPE_TABLE)
 #endif
+#define pud_large(pud)		pud_sect(pud)
 
 extern pgd_t init_pg_dir[PTRS_PER_PGD];
 extern pgd_t init_pg_end[];
-- 
2.20.1



* [PATCH 3/4] powerpc/64s/radix: support huge vmap vmalloc
  2019-06-10  4:38 [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Nicholas Piggin
  2019-06-10  4:38 ` [PATCH 2/4] arm64: support huge vmap vmalloc Nicholas Piggin
@ 2019-06-10  4:38 ` Nicholas Piggin
  2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  4:38 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel, Nicholas Piggin

Applying huge vmap to vmalloc requires vmalloc_to_page to walk huge
pages. Define pud_large and pmd_large to support this.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 24 ++++++++++++--------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 5faceeefd9f9..8e02077b11fb 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -923,6 +923,11 @@ static inline int pud_present(pud_t pud)
 	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT));
 }
 
+static inline int pud_large(pud_t pud)
+{
+	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
+}
+
 extern struct page *pud_page(pud_t pud);
 extern struct page *pmd_page(pmd_t pmd);
 static inline pte_t pud_pte(pud_t pud)
@@ -966,6 +971,11 @@ static inline int pgd_present(pgd_t pgd)
 	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
 }
 
+static inline int pgd_large(pgd_t pgd)
+{
+	return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
+}
+
 static inline pte_t pgd_pte(pgd_t pgd)
 {
 	return __pte_raw(pgd_raw(pgd));
@@ -1091,6 +1101,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd)
 #define pmd_mk_savedwrite(pmd)	pte_pmd(pte_mk_savedwrite(pmd_pte(pmd)))
 #define pmd_clear_savedwrite(pmd)	pte_pmd(pte_clear_savedwrite(pmd_pte(pmd)))
 
+static inline int pmd_large(pmd_t pmd)
+{
+	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
+}
+
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
 #define pmd_soft_dirty(pmd)    pte_soft_dirty(pmd_pte(pmd))
 #define pmd_mksoft_dirty(pmd)  pte_pmd(pte_mksoft_dirty(pmd_pte(pmd)))
@@ -1159,15 +1174,6 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
 	return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
 }
 
-/*
- * returns true for pmd migration entries, THP, devmap, hugetlb
- * But compile time dependent on THP config
- */
-static inline int pmd_large(pmd_t pmd)
-{
-	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-}
-
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
 	return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
-- 
2.20.1



* [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  4:38 [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Nicholas Piggin
  2019-06-10  4:38 ` [PATCH 2/4] arm64: support huge vmap vmalloc Nicholas Piggin
  2019-06-10  4:38 ` [PATCH 3/4] powerpc/64s/radix: " Nicholas Piggin
@ 2019-06-10  4:38 ` Nicholas Piggin
  2019-06-10  5:49   ` Nicholas Piggin
                     ` (3 more replies)
  2019-06-10  5:42 ` [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Anshuman Khandual
  2019-06-11  5:24 ` Christophe Leroy
  4 siblings, 4 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  4:38 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel, Nicholas Piggin

For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
allocate huge pages and map them.

This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
(performance is in the noise, under 1% difference, page tables are likely
to be well cached for this workload). Similar numbers are seen on POWER9.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 include/asm-generic/4level-fixup.h |   1 +
 include/asm-generic/5level-fixup.h |   1 +
 include/linux/vmalloc.h            |   1 +
 mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
 4 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
index e3667c9a33a5..3cc65a4dd093 100644
--- a/include/asm-generic/4level-fixup.h
+++ b/include/asm-generic/4level-fixup.h
@@ -20,6 +20,7 @@
 #define pud_none(pud)			0
 #define pud_bad(pud)			0
 #define pud_present(pud)		1
+#define pud_large(pud)			0
 #define pud_ERROR(pud)			do { } while (0)
 #define pud_clear(pud)			pgd_clear(pud)
 #define pud_val(pud)			pgd_val(pud)
diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
index bb6cb347018c..c4377db09a4f 100644
--- a/include/asm-generic/5level-fixup.h
+++ b/include/asm-generic/5level-fixup.h
@@ -22,6 +22,7 @@
 #define p4d_none(p4d)			0
 #define p4d_bad(p4d)			0
 #define p4d_present(p4d)		1
+#define p4d_large(p4d)			0
 #define p4d_ERROR(p4d)			do { } while (0)
 #define p4d_clear(p4d)			pgd_clear(p4d)
 #define p4d_val(p4d)			pgd_val(p4d)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 812bea5866d6..4c92dc608928 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -42,6 +42,7 @@ struct vm_struct {
 	unsigned long		size;
 	unsigned long		flags;
 	struct page		**pages;
+	unsigned int		page_shift;
 	unsigned int		nr_pages;
 	phys_addr_t		phys_addr;
 	const void		*caller;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dd27cfb29b10..0cf8e861caeb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -36,6 +36,7 @@
 #include <linux/rbtree_augmented.h>
 
 #include <linux/uaccess.h>
+#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
 
@@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
 	return ret;
 }
 
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int vmap_hpages_range(unsigned long start, unsigned long end,
+				   pgprot_t prot, struct page **pages,
+				   unsigned int page_shift)
+{
+	unsigned long addr = start;
+	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
+
+	for (i = 0; i < nr; i++) {
+		int err;
+
+		err = vmap_range_noflush(addr,
+					addr + (PAGE_SIZE << page_shift),
+					__pa(page_address(pages[i])), prot,
+					PAGE_SHIFT + page_shift);
+		if (err)
+			return err;
+
+		addr += PAGE_SIZE << page_shift;
+	}
+	flush_cache_vmap(start, end);
+
+	return nr;
+}
+#else
+static int vmap_hpages_range(unsigned long start, unsigned long end,
+			   pgprot_t prot, struct page **pages,
+			   unsigned int page_shift)
+{
+	BUG_ON(page_shift != PAGE_SIZE);
+	return vmap_pages_range(start, end, prot, pages);
+}
+#endif
+
+
 int is_vmalloc_or_module_addr(const void *x)
 {
 	/*
@@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
 	unsigned long addr = (unsigned long) vmalloc_addr;
 	struct page *page = NULL;
-	pgd_t *pgd = pgd_offset_k(addr);
+	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
@@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	 */
 	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
 
+	pgd = pgd_offset_k(addr);
 	if (pgd_none(*pgd))
 		return NULL;
+
 	p4d = p4d_offset(pgd, addr);
 	if (p4d_none(*p4d))
 		return NULL;
-	pud = pud_offset(p4d, addr);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+	if (p4d_large(*p4d))
+		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+#endif
+	if (WARN_ON_ONCE(p4d_bad(*p4d)))
+		return NULL;
 
-	/*
-	 * Don't dereference bad PUD or PMD (below) entries. This will also
-	 * identify huge mappings, which we may encounter on architectures
-	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
-	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
-	 * not [unambiguously] associated with a struct page, so there is
-	 * no correct value to return for them.
-	 */
-	WARN_ON_ONCE(pud_bad(*pud));
-	if (pud_none(*pud) || pud_bad(*pud))
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+	if (pud_large(*pud))
+		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+#endif
+	if (WARN_ON_ONCE(pud_bad(*pud)))
 		return NULL;
+
 	pmd = pmd_offset(pud, addr);
-	WARN_ON_ONCE(pmd_bad(*pmd));
-	if (pmd_none(*pmd) || pmd_bad(*pmd))
+	if (pmd_none(*pmd))
+		return NULL;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+	if (pmd_large(*pmd))
+		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+#endif
+	if (WARN_ON_ONCE(pmd_bad(*pmd)))
 		return NULL;
 
 	ptep = pte_offset_map(pmd, addr);
@@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
 	if (pte_present(pte))
 		page = pte_page(pte);
 	pte_unmap(ptep);
+
 	return page;
 }
 EXPORT_SYMBOL(vmalloc_to_page);
@@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 		return NULL;
 
 	if (flags & VM_IOREMAP)
-		align = 1ul << clamp_t(int, get_count_order_long(size),
-				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
+		align = max(align,
+				1ul << clamp_t(int, get_count_order_long(size),
+				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
 
 	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
 	if (unlikely(!area))
@@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
 			struct page *page = area->pages[i];
 
 			BUG_ON(!page);
-			__free_pages(page, 0);
+			__free_pages(page, area->page_shift);
 		}
 
 		kvfree(area->pages);
@@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, int node)
 {
 	struct page **pages;
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long size = get_vm_area_size(area);
+	unsigned int page_shift = area->page_shift;
+	unsigned int shift = page_shift + PAGE_SHIFT;
 	unsigned int nr_pages, array_size, i;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
 	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
 	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
-					0 :
-					__GFP_HIGHMEM;
+					0 : __GFP_HIGHMEM;
 
-	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+	nr_pages = size >> shift;
 	array_size = (nr_pages * sizeof(struct page *));
 
 	area->nr_pages = nr_pages;
@@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	for (i = 0; i < area->nr_pages; i++) {
 		struct page *page;
 
-		if (node == NUMA_NO_NODE)
-			page = alloc_page(alloc_mask|highmem_mask);
-		else
-			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+		page = alloc_pages_node(node,
+				alloc_mask|highmem_mask, page_shift);
 
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
@@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			cond_resched();
 	}
 
-	if (map_vm_area(area, prot, pages))
+	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
 		goto fail;
+
 	return area->addr;
 
 fail:
@@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller)
 {
-	struct vm_struct *area;
+	struct vm_struct *area = NULL;
 	void *addr;
 	unsigned long real_size = size;
+	unsigned long real_align = align;
+	unsigned int shift = PAGE_SHIFT;
 
 	size = PAGE_ALIGN(size);
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
 		goto fail;
 
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
+		unsigned long size_per_node;
+
+		size_per_node = size;
+		if (node == NUMA_NO_NODE)
+			size_per_node /= num_online_nodes();
+		if (size_per_node >= PMD_SIZE)
+			shift = PMD_SHIFT;
+	}
+again:
+	align = max(real_align, 1UL << shift);
+	size = ALIGN(real_size, align);
+
 	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
 				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
+	area->page_shift = shift - PAGE_SHIFT;
+
 	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
 	if (!addr)
-		return NULL;
+		goto fail;
 
 	/*
 	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	return addr;
 
 fail:
-	warn_alloc(gfp_mask, NULL,
+	if (shift == PMD_SHIFT) {
+		shift = PAGE_SHIFT;
+		goto again;
+	}
+
+	if (!area) {
+		/* Warn for area allocation, page allocations already warn */
+		warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure: %lu bytes", real_size);
+	}
 	return NULL;
 }
 
-- 
2.20.1



* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
  2019-06-10  4:38 [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Nicholas Piggin
                   ` (2 preceding siblings ...)
  2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
@ 2019-06-10  5:42 ` Anshuman Khandual
  2019-06-10  6:21   ` Nicholas Piggin
  2019-06-11  5:24 ` Christophe Leroy
  4 siblings, 1 reply; 24+ messages in thread
From: Anshuman Khandual @ 2019-06-10  5:42 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel



On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
> ioremap_page_range is a generic function to create a kernel virtual
> mapping, move it to mm/vmalloc.c and rename it vmap_range.

Absolutely. It belongs in mm/vmalloc.c as it's a kernel virtual range.
But what is the rationale for changing the name to vmap_range ?
 
> 
> For clarity with this move, also:
> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.

Will it be the inverse for both vmap_range() and vmap_page[s]_range() ?

> - Rename vmap_page_range (which takes a page array) to vmap_pages.

s/vmap_pages/vmap_pages_range instead here ................^^^^^^

This deviates from the subject of this patch, which suggests it is related to
ioremap only. I believe what this patch intends is to create:

- vunmap_range() takes [VA range]

	This will be the common kernel virtual range tear down
	function for ranges created either with vmap_range() or
	vmap_pages_range(). Is that correct ?

- vmap_range() takes [VA range, PA range, prot]
- vmap_pages_range() takes [VA range, struct pages, prot] 

Can we re-order the arguments (pages <--> prot) for vmap_pages_range()
just to keep it in sync with vmap_range() ?

static int vmap_pages_range(unsigned long start, unsigned long end,
 			   pgprot_t prot, struct page **pages)
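
i.e. presumably swapping them so that what is being mapped comes before
prot, as in vmap_range():

static int vmap_pages_range(unsigned long start, unsigned long end,
			   struct page **pages, pgprot_t prot)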


* Re: [PATCH 2/4] arm64: support huge vmap vmalloc
  2019-06-10  4:38 ` [PATCH 2/4] arm64: support huge vmap vmalloc Nicholas Piggin
@ 2019-06-10  5:47   ` Anshuman Khandual
  2019-06-10  6:14     ` Nicholas Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Anshuman Khandual @ 2019-06-10  5:47 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel



On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
> Applying huge vmap to vmalloc requires vmalloc_to_page to walk huge
> pages. Define pud_large and pmd_large to support this.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  arch/arm64/include/asm/pgtable.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 2c41b04708fe..30fe7b344bf7 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -428,6 +428,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>  				 PMD_TYPE_TABLE)
>  #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
>  				 PMD_TYPE_SECT)
> +#define pmd_large(pmd)		pmd_sect(pmd)
>  
>  #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
>  #define pud_sect(pud)		(0)
> @@ -438,6 +439,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>  #define pud_table(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
>  				 PUD_TYPE_TABLE)
>  #endif
> +#define pud_large(pud)		pud_sect(pud)
>  
>  extern pgd_t init_pg_dir[PTRS_PER_PGD];
>  extern pgd_t init_pg_end[];

Another series (I guess not merged yet) is trying to add these wrappers
on arm64 (https://patchwork.kernel.org/patch/10883887/).


* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
@ 2019-06-10  5:49   ` Nicholas Piggin
  2019-06-10  8:08     ` Satheesh Rajendran
  2019-06-10  8:53   ` Anshuman Khandual
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  5:49 UTC (permalink / raw)
  To: linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

Nicholas Piggin's on June 10, 2019 2:38 pm:
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);
> +	return vmap_pages_range(start, end, prot, pages);
> +}

That's a false positive BUG_ON for !HUGE_VMAP configs. I'll fix that
and repost after a round of feedback.
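
The !HUGE_VMAP fallback gets passed area->page_shift, which is the shift
above PAGE_SHIFT and is always 0 there, so the fix is presumably just
something like (untested):

	BUG_ON(page_shift != 0);
	return vmap_pages_range(start, end, prot, pages);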

Thanks,
Nick



* Re: [PATCH 2/4] arm64: support huge vmap vmalloc
  2019-06-10  5:47   ` Anshuman Khandual
@ 2019-06-10  6:14     ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  6:14 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

Anshuman Khandual's on June 10, 2019 3:47 pm:
> 
> 
> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>> Applying huge vmap to vmalloc requires vmalloc_to_page to walk huge
>> pages. Define pud_large and pmd_large to support this.
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  arch/arm64/include/asm/pgtable.h | 2 ++
>>  1 file changed, 2 insertions(+)
>> 
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 2c41b04708fe..30fe7b344bf7 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -428,6 +428,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>  				 PMD_TYPE_TABLE)
>>  #define pmd_sect(pmd)		((pmd_val(pmd) & PMD_TYPE_MASK) == \
>>  				 PMD_TYPE_SECT)
>> +#define pmd_large(pmd)		pmd_sect(pmd)
>>  
>>  #if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
>>  #define pud_sect(pud)		(0)
>> @@ -438,6 +439,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
>>  #define pud_table(pud)		((pud_val(pud) & PUD_TYPE_MASK) == \
>>  				 PUD_TYPE_TABLE)
>>  #endif
>> +#define pud_large(pud)		pud_sect(pud)
>>  
>>  extern pgd_t init_pg_dir[PTRS_PER_PGD];
>>  extern pgd_t init_pg_end[];
> 
> Another series (I guess not merged yet) is trying to add these wrappers
> on arm64 (https://patchwork.kernel.org/patch/10883887/).
> 

Okay good, I'll just cherry pick it for the series.

Thanks,
Nick



* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
  2019-06-10  5:42 ` [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Anshuman Khandual
@ 2019-06-10  6:21   ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10  6:21 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

Anshuman Khandual's on June 10, 2019 3:42 pm:
> 
> 
> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>> ioremap_page_range is a generic function to create a kernel virtual
>> mapping, move it to mm/vmalloc.c and rename it vmap_range.
> 
> Absolutely. It belongs in mm/vmalloc.c as its a kernel virtual range.
> But what is the rationale of changing the name to vmap_range ?

Well, it doesn't just map IO. It's for arbitrary kernel virtual mappings
(including ioremap). The last patch uses it to map regular cacheable
memory.
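
E.g. vmap_hpages_range() in the last patch maps the vmalloc backing pages
with:

	err = vmap_range_noflush(addr,
				addr + (PAGE_SIZE << page_shift),
				__pa(page_address(pages[i])), prot,
				PAGE_SHIFT + page_shift);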

>> For clarity with this move, also:
>> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
> 
> Will be inverse for both vmap_range() and vmap_page[s]_range() ?

Yes.

> 
>> - Rename vmap_page_range (which takes a page array) to vmap_pages.
> 
> s/vmap_pages/vmap_pages_range instead here ................^^^^^^

Yes.

> This deviates from the subject of this patch that it is related to
> ioremap only. I believe what this patch intends is to create
> 
> - vunmap_range() takes [VA range]
> 
> 	This will be the common kernel virtual range tear down
> 	function for ranges created either with vmap_range() or
> 	vmap_pages_range(). Is that correct ?
> - vmap_range() takes [VA range, PA range, prot]
> - vmap_pages_range() takes [VA range, struct pages, prot] 

That's right, although we already have all those functions, so I don't
create anything, only move and rename. I'm happy to change the
subject if you have a preference.

> Can we re-order the arguments (pages <--> prot) for vmap_pages_range()
> just to make it sync with vmap_range() ?
> 
> static int vmap_pages_range(unsigned long start, unsigned long end,
>  			   pgprot_t prot, struct page **pages)
> 

Sure, makes sense.

Thanks,
Nick



* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  5:49   ` Nicholas Piggin
@ 2019-06-10  8:08     ` Satheesh Rajendran
  0 siblings, 0 replies; 24+ messages in thread
From: Satheesh Rajendran @ 2019-06-10  8:08 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel

On Mon, Jun 10, 2019 at 03:49:48PM +1000, Nicholas Piggin wrote:
> Nicholas Piggin's on June 10, 2019 2:38 pm:
> > +static int vmap_hpages_range(unsigned long start, unsigned long end,
> > +			   pgprot_t prot, struct page **pages,
> > +			   unsigned int page_shift)
> > +{
> > +	BUG_ON(page_shift != PAGE_SIZE);
> > +	return vmap_pages_range(start, end, prot, pages);
> > +}
> 
> That's a false positive BUG_ON for !HUGE_VMAP configs. I'll fix that
> and repost after a round of feedback.

Sure. Here is the crash log for that false positive BUG_ON on a Power8 host.

[    0.001718] pid_max: default: 163840 minimum: 1280
[    0.010437] ------------[ cut here ]------------
[    0.010461] kernel BUG at mm/vmalloc.c:473!
[    0.010471] Oops: Exception in kernel mode, sig: 5 [#1]
[    0.010481] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
[    0.010491] Modules linked in:
[    0.010503] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0-rc3-ga7ee9421d #1
[    0.010515] NIP:  c00000000034dbd8 LR: c00000000034dc80 CTR: 0000000000000000
[    0.010527] REGS: c0000000015bf9a0 TRAP: 0700   Not tainted  (5.2.0-rc3-ga7ee9421d)
[    0.010537] MSR:  9000000002029033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE>  CR: 22022422  XER: 20000000
[    0.010559] CFAR: c00000000034dc88 IRQMASK: 0
[    0.010559] GPR00: c00000000034dc80 c0000000015bfc30 c0000000015c2f00 c00c000001fd0e00
[    0.010559] GPR04: 0000000000000000 0000000000002322 0000000000000000 0000000000000040
[    0.010559] GPR08: c000000ff9080000 0000000000000400 0000000000000400 0000000000000100
[    0.010559] GPR12: 0000000042022422 c0000000017a0000 00000001035ae7d8 0000000000000400
[    0.010559] GPR16: 0000000004000000 800000000000018e c000000000ee08c8 0000000000000000
[    0.010559] GPR20: 0000000000010000 0000000000002b22 0000000000000b20 0000000000000022
[    0.010559] GPR24: c0000007f92c7880 0000000000000b22 0000000000010000 c00a000000000000
[    0.010559] GPR28: c008000000000000 0000000004000000 ffffffffffffffff 0000000000000b20
[    0.010664] NIP [c00000000034dbd8] __vmalloc_node_range+0x1f8/0x410
[    0.010677] LR [c00000000034dc80] __vmalloc_node_range+0x2a0/0x410
[    0.010686] Call Trace:
[    0.010695] [c0000000015bfc30] [c00000000034dc80] __vmalloc_node_range+0x2a0/0x410 (unreliable)
[    0.010711] [c0000000015bfd30] [c00000000034de40] __vmalloc+0x50/0x60
[    0.010724] [c0000000015bfda0] [c00000000101e54c] alloc_large_system_hash+0x200/0x304
[    0.010738] [c0000000015bfe60] [c0000000010235bc] vfs_caches_init+0xd8/0x138
[    0.010752] [c0000000015bfee0] [c000000000fe428c] start_kernel+0x5c4/0x668
[    0.010767] [c0000000015bff90] [c00000000000b774] start_here_common+0x1c/0x528
[    0.010777] Instruction dump:
[    0.010785] 60000000 7c691b79 418200dc e9180020 79ea1f24 7d28512a 40920170 8138002c
[    0.010803] 394f0001 794f0020 7c095040 4181ffbc <0fe00000> 60000000 3f400001 4bfffedc
[    0.010826] ---[ end trace dd0217488686d653 ]---
[    0.010834]
[    1.010946] Kernel panic - not syncing: Attempted to kill the idle task!
[    1.011061] Rebooting in 10 seconds..

Regards,
-Satheesh.
> 
> Thanks,
> Nick
> 
> 



* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
  2019-06-10  5:49   ` Nicholas Piggin
@ 2019-06-10  8:53   ` Anshuman Khandual
  2019-06-11  0:16     ` Nicholas Piggin
  2019-06-10 14:10   ` Mark Rutland
  2019-06-11  5:39   ` Christophe Leroy
  3 siblings, 1 reply; 24+ messages in thread
From: Anshuman Khandual @ 2019-06-10  8:53 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
> allocate huge pages and map them.

IIUC that extends HAVE_ARCH_HUGE_VMAP from ioremap to vmalloc.

> 
> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
> (performance is in the noise, under 1% difference, page tables are likely
> to be well cached for this workload). Similar numbers are seen on POWER9.

Sure, will try this on arm64.

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  include/asm-generic/4level-fixup.h |   1 +
>  include/asm-generic/5level-fixup.h |   1 +
>  include/linux/vmalloc.h            |   1 +
>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>  4 files changed, 107 insertions(+), 28 deletions(-)
> 
> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
> index e3667c9a33a5..3cc65a4dd093 100644
> --- a/include/asm-generic/4level-fixup.h
> +++ b/include/asm-generic/4level-fixup.h
> @@ -20,6 +20,7 @@
>  #define pud_none(pud)			0
>  #define pud_bad(pud)			0
>  #define pud_present(pud)		1
> +#define pud_large(pud)			0
>  #define pud_ERROR(pud)			do { } while (0)
>  #define pud_clear(pud)			pgd_clear(pud)
>  #define pud_val(pud)			pgd_val(pud)
> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
> index bb6cb347018c..c4377db09a4f 100644
> --- a/include/asm-generic/5level-fixup.h
> +++ b/include/asm-generic/5level-fixup.h
> @@ -22,6 +22,7 @@
>  #define p4d_none(p4d)			0
>  #define p4d_bad(p4d)			0
>  #define p4d_present(p4d)		1
> +#define p4d_large(p4d)			0
>  #define p4d_ERROR(p4d)			do { } while (0)
>  #define p4d_clear(p4d)			pgd_clear(p4d)
>  #define p4d_val(p4d)			pgd_val(p4d)

Both of these are required by vmalloc_to_page(), which as per a later comment
should be part of a prerequisite patch before this series.

> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 812bea5866d6..4c92dc608928 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -42,6 +42,7 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> +	unsigned int		page_shift;

So the entire vm_struct will be mapped with a single page_shift. It cannot mix
and match mappings of PAGE_SIZE, PMD_SIZE, PUD_SIZE etc, for example when the
allocation fails for the larger sizes and has to fall back, or for other reasons.

>  	unsigned int		nr_pages;
>  	phys_addr_t		phys_addr;
>  	const void		*caller;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd27cfb29b10..0cf8e861caeb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -36,6 +36,7 @@
>  #include <linux/rbtree_augmented.h>
>  
>  #include <linux/uaccess.h>
> +#include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
>  #include <asm/shmparam.h>
>  
> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static int vmap_hpages_range(unsigned long start, unsigned long end,

A small nit (if you agree) s/hpages/huge_pages/

> +				   pgprot_t prot, struct page **pages,

Re-order (prot <---> pages) just to follow the standard like before.

> +				   unsigned int page_shift)
> +{
> +	unsigned long addr = start;
> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);

s/nr/nr_huge_pages ?

Also, should we not check the alignment of the range [start...end] with
respect to (1UL << (PAGE_SHIFT + page_shift)) ?
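
Something along these lines perhaps (untested):

	if (WARN_ON(!IS_ALIGNED(start, PAGE_SIZE << page_shift) ||
		    !IS_ALIGNED(end, PAGE_SIZE << page_shift)))
		return -EINVAL;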


> +
> +	for (i = 0; i < nr; i++) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr,
> +					addr + (PAGE_SIZE << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					PAGE_SHIFT + page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += PAGE_SIZE << page_shift;
> +	}
> +	flush_cache_vmap(start, end);
> +
> +	return nr;
> +}
> +#else
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);
> +	return vmap_pages_range(start, end, prot, pages);
> +}
> +#endif
> +
> +
>  int is_vmalloc_or_module_addr(const void *x)
>  {
>  	/*
> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  {
>  	unsigned long addr = (unsigned long) vmalloc_addr;
>  	struct page *page = NULL;
> -	pgd_t *pgd = pgd_offset_k(addr);
> +	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
>  	pmd_t *pmd;
> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	 */
>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>  
> +	pgd = pgd_offset_k(addr);
>  	if (pgd_none(*pgd))
>  		return NULL;
> +

Small nit. Stray line here.

'pgd' related changes here seem to be just cleanups and should not be part of this patch.

>  	p4d = p4d_offset(pgd, addr);
>  	if (p4d_none(*p4d))
>  		return NULL;
> -	pud = pud_offset(p4d, addr);
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (p4d_large(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>  
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */

What changed the situation so that we can now return a struct page for a huge
mapping ? AFAICT even after this patch, PUD/P4D level huge pages can only
be created with ioremap_page_range(), not with vmalloc(), which creates PMD
sized mappings only. Hence if it's okay to dereference the struct page of a huge
mapping (notwithstanding the comment here), it should be part of an earlier
patch fixing it first for existing ioremap_page_range() huge mappings.

> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pud_large(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>  		return NULL;
> +
>  	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pmd_large(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>  		return NULL;

At each page table level, we are checking in this order

pXX_none() --> pXX_large() --> pXX_bad()

Are not these alternative orders a bit better ?

pXX_bad() --> pXX_none() --> pXX_large()

Or

pXX_none() --> pXX_bad() --> pXX_large()

Checking for pXX_bad() at the end does not make much sense.

>  
>  	ptep = pte_offset_map(pmd, addr);
> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	if (pte_present(pte))
>  		page = pte_page(pte);
>  	pte_unmap(ptep);
> +

Small nit. Stray line here.

>  	return page;
>  }
>  EXPORT_SYMBOL(vmalloc_to_page);
> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>  		return NULL;
>  
>  	if (flags & VM_IOREMAP)
> -		align = 1ul << clamp_t(int, get_count_order_long(size),
> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
> +		align = max(align,
> +				1ul << clamp_t(int, get_count_order_long(size),
> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>  
>  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>  	if (unlikely(!area))
> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  			struct page *page = area->pages[i];
>  
>  			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, area->page_shift);

'area->page_shift' turns out to be an effective page order. I think the name here is a bit
misleading; s/page_shift/page_order/ or nr_pages would be better IMHO. page_shift is
not an actual shift (you cannot do 1UL << area->page_shift to get the size) nor does it
sound like a page 'order'.

>  		}
>  
>  		kvfree(area->pages);
> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  				 pgprot_t prot, int node)
>  {
>  	struct page **pages;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
> +	unsigned int page_shift = area->page_shift;
> +	unsigned int shift = page_shift + PAGE_SHIFT;
>  	unsigned int nr_pages, array_size, i;
>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> -					0 :
> -					__GFP_HIGHMEM;
> +					0 : __GFP_HIGHMEM;
>  
> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	nr_pages = size >> shift;
>  	array_size = (nr_pages * sizeof(struct page *));
>  
>  	area->nr_pages = nr_pages;
> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	for (i = 0; i < area->nr_pages; i++) {
>  		struct page *page;
>  
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(alloc_mask|highmem_mask);
> -		else
> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
> +		page = alloc_pages_node(node,
> +				alloc_mask|highmem_mask, page_shift);

alloc_mask remains exactly the same as before, even for these high order pages.

>  
>  		if (unlikely(!page)) {
>  			/* Successfully allocated i pages, free them in __vunmap() */
> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  			cond_resched();
>  	}
>  
> -	if (map_vm_area(area, prot, pages))
> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>  		goto fail;
> +
>  	return area->addr;
>  
>  fail:
> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  			pgprot_t prot, unsigned long vm_flags, int node,
>  			const void *caller)
>  {
> -	struct vm_struct *area;
> +	struct vm_struct *area = NULL;
>  	void *addr;
>  	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>  
>  	size = PAGE_ALIGN(size);
>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>  		goto fail;
>  
> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
> +		unsigned long size_per_node;
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE)
> +			shift = PMD_SHIFT;

There are a few problems here.

1. Should not size_per_node be aligned to PMD_SIZE, to avoid wasting memory later
   because of upwards alignment (making it worse for NUMA_NO_NODE) ?
2. What about PUD_SIZE, which is not considered here at all ?
3. We should have knobs similar to ioremap's for controlling the different huge mapping sizes

static int __read_mostly ioremap_p4d_capable;
static int __read_mostly ioremap_pud_capable;
static int __read_mostly ioremap_pmd_capable;
static int __read_mostly ioremap_huge_disabled;

while also giving the arch a chance to weigh in through overrides similar to these:

arch_ioremap_[pud|pmd]_supported() ---> probably unifying it for vmalloc() 
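
Something along these lines perhaps, in mm/vmalloc.c (untested, all names made
up, just to illustrate):

static int __read_mostly vmap_pud_capable;
static int __read_mostly vmap_pmd_capable;
static int __read_mostly vmap_huge_disabled;

static int __init set_nohugevmalloc(char *str)
{
	vmap_huge_disabled = 1;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);

static void __init vmap_huge_init(void)
{
	if (vmap_huge_disabled)
		return;
	/* hypothetical arch overrides, unified with the ioremap ones */
	if (arch_vmap_pud_supported())
		vmap_pud_capable = 1;
	if (arch_vmap_pmd_supported())
		vmap_pmd_capable = 1;
}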
 
> +	}
> +again:
> +	align = max(real_align, 1UL << shift);
> +	size = ALIGN(real_size, align);
> +
>  	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>  				vm_flags, start, end, node, gfp_mask, caller);
>  	if (!area)
>  		goto fail;
>  
> +	area->page_shift = shift - PAGE_SHIFT;
> +
>  	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>  	if (!addr)
> -		return NULL;
> +		goto fail;
>  
>  	/*
>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	return addr;
>  
>  fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift == PMD_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		goto again;
> +	}

PUD_SHIFT should be accommodated here as well when falling back to lower
mapping sizes after a previous allocation attempt fails.
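
A rough sketch of what the fallback chain could look like (untested):

fail:
	if (shift == PUD_SHIFT) {
		shift = PMD_SHIFT;
		goto again;
	} else if (shift == PMD_SHIFT) {
		shift = PAGE_SHIFT;
		goto again;
	}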

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
  2019-06-10  5:49   ` Nicholas Piggin
  2019-06-10  8:53   ` Anshuman Khandual
@ 2019-06-10 14:10   ` Mark Rutland
  2019-06-10 14:44     ` Nicholas Piggin
  2019-06-11  5:39   ` Christophe Leroy
  3 siblings, 1 reply; 24+ messages in thread
From: Mark Rutland @ 2019-06-10 14:10 UTC (permalink / raw)
  To: Nicholas Piggin; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel

Hi,

On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
> allocate huge pages and map them
> 
> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
> (performance is in the noise, under 1% difference, page tables are likely
> to be well cached for this workload). Similar numbers are seen on POWER9.

Do you happen to know which vmalloc mappings these get used for in the
above case? Where do we see vmalloc mappings that large?

I'm worried as to how this would interact with the set_memory_*()
functions, as on arm64 those can only operate on page-granular mappings.
Those may need fixing up to handle huge mappings; certainly if the above
is all for modules.

Thanks,
Mark.

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>  include/asm-generic/4level-fixup.h |   1 +
>  include/asm-generic/5level-fixup.h |   1 +
>  include/linux/vmalloc.h            |   1 +
>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>  4 files changed, 107 insertions(+), 28 deletions(-)
> 
> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
> index e3667c9a33a5..3cc65a4dd093 100644
> --- a/include/asm-generic/4level-fixup.h
> +++ b/include/asm-generic/4level-fixup.h
> @@ -20,6 +20,7 @@
>  #define pud_none(pud)			0
>  #define pud_bad(pud)			0
>  #define pud_present(pud)		1
> +#define pud_large(pud)			0
>  #define pud_ERROR(pud)			do { } while (0)
>  #define pud_clear(pud)			pgd_clear(pud)
>  #define pud_val(pud)			pgd_val(pud)
> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
> index bb6cb347018c..c4377db09a4f 100644
> --- a/include/asm-generic/5level-fixup.h
> +++ b/include/asm-generic/5level-fixup.h
> @@ -22,6 +22,7 @@
>  #define p4d_none(p4d)			0
>  #define p4d_bad(p4d)			0
>  #define p4d_present(p4d)		1
> +#define p4d_large(p4d)			0
>  #define p4d_ERROR(p4d)			do { } while (0)
>  #define p4d_clear(p4d)			pgd_clear(p4d)
>  #define p4d_val(p4d)			pgd_val(p4d)
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 812bea5866d6..4c92dc608928 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -42,6 +42,7 @@ struct vm_struct {
>  	unsigned long		size;
>  	unsigned long		flags;
>  	struct page		**pages;
> +	unsigned int		page_shift;
>  	unsigned int		nr_pages;
>  	phys_addr_t		phys_addr;
>  	const void		*caller;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd27cfb29b10..0cf8e861caeb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -36,6 +36,7 @@
>  #include <linux/rbtree_augmented.h>
>  
>  #include <linux/uaccess.h>
> +#include <asm/pgtable.h>
>  #include <asm/tlbflush.h>
>  #include <asm/shmparam.h>
>  
> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>  	return ret;
>  }
>  
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +				   pgprot_t prot, struct page **pages,
> +				   unsigned int page_shift)
> +{
> +	unsigned long addr = start;
> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
> +
> +	for (i = 0; i < nr; i++) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr,
> +					addr + (PAGE_SIZE << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					PAGE_SHIFT + page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += PAGE_SIZE << page_shift;
> +	}
> +	flush_cache_vmap(start, end);
> +
> +	return nr;
> +}
> +#else
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);
> +	return vmap_pages_range(start, end, prot, pages);
> +}
> +#endif
> +
> +
>  int is_vmalloc_or_module_addr(const void *x)
>  {
>  	/*
> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  {
>  	unsigned long addr = (unsigned long) vmalloc_addr;
>  	struct page *page = NULL;
> -	pgd_t *pgd = pgd_offset_k(addr);
> +	pgd_t *pgd;
>  	p4d_t *p4d;
>  	pud_t *pud;
>  	pmd_t *pmd;
> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	 */
>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>  
> +	pgd = pgd_offset_k(addr);
>  	if (pgd_none(*pgd))
>  		return NULL;
> +
>  	p4d = p4d_offset(pgd, addr);
>  	if (p4d_none(*p4d))
>  		return NULL;
> -	pud = pud_offset(p4d, addr);
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (p4d_large(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>  
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */
> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pud_large(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>  		return NULL;
> +
>  	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pmd_large(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>  		return NULL;
>  
>  	ptep = pte_offset_map(pmd, addr);
> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>  	if (pte_present(pte))
>  		page = pte_page(pte);
>  	pte_unmap(ptep);
> +
>  	return page;
>  }
>  EXPORT_SYMBOL(vmalloc_to_page);
> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>  		return NULL;
>  
>  	if (flags & VM_IOREMAP)
> -		align = 1ul << clamp_t(int, get_count_order_long(size),
> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
> +		align = max(align,
> +				1ul << clamp_t(int, get_count_order_long(size),
> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>  
>  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>  	if (unlikely(!area))
> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>  			struct page *page = area->pages[i];
>  
>  			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, area->page_shift);
>  		}
>  
>  		kvfree(area->pages);
> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  				 pgprot_t prot, int node)
>  {
>  	struct page **pages;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
> +	unsigned int page_shift = area->page_shift;
> +	unsigned int shift = page_shift + PAGE_SHIFT;
>  	unsigned int nr_pages, array_size, i;
>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> -					0 :
> -					__GFP_HIGHMEM;
> +					0 : __GFP_HIGHMEM;
>  
> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	nr_pages = size >> shift;
>  	array_size = (nr_pages * sizeof(struct page *));
>  
>  	area->nr_pages = nr_pages;
> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  	for (i = 0; i < area->nr_pages; i++) {
>  		struct page *page;
>  
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(alloc_mask|highmem_mask);
> -		else
> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
> +		page = alloc_pages_node(node,
> +				alloc_mask|highmem_mask, page_shift);
>  
>  		if (unlikely(!page)) {
>  			/* Successfully allocated i pages, free them in __vunmap() */
> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>  			cond_resched();
>  	}
>  
> -	if (map_vm_area(area, prot, pages))
> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>  		goto fail;
> +
>  	return area->addr;
>  
>  fail:
> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  			pgprot_t prot, unsigned long vm_flags, int node,
>  			const void *caller)
>  {
> -	struct vm_struct *area;
> +	struct vm_struct *area = NULL;
>  	void *addr;
>  	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>  
>  	size = PAGE_ALIGN(size);
>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>  		goto fail;
>  
> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
> +		unsigned long size_per_node;
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE)
> +			shift = PMD_SHIFT;
> +	}
> +again:
> +	align = max(real_align, 1UL << shift);
> +	size = ALIGN(real_size, align);
> +
>  	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>  				vm_flags, start, end, node, gfp_mask, caller);
>  	if (!area)
>  		goto fail;
>  
> +	area->page_shift = shift - PAGE_SHIFT;
> +
>  	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>  	if (!addr)
> -		return NULL;
> +		goto fail;
>  
>  	/*
>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>  	return addr;
>  
>  fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift == PMD_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		goto again;
> +	}
> +
> +	if (!area) {
> +		/* Warn for area allocation, page allocations already warn */
> +		warn_alloc(gfp_mask, NULL,
>  			  "vmalloc: allocation failure: %lu bytes", real_size);
> +	}
>  	return NULL;
>  }
>  
> -- 
> 2.20.1
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10 14:10   ` Mark Rutland
@ 2019-06-10 14:44     ` Nicholas Piggin
  2019-06-11  6:17       ` Anshuman Khandual
  0 siblings, 1 reply; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-10 14:44 UTC (permalink / raw)
  To: Mark Rutland; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel

Mark Rutland's on June 11, 2019 12:10 am:
> Hi,
> 
> On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>> allocate huge pages and map them
>> 
>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>> (performance is in the noise, under 1% difference, page tables are likely
>> to be well cached for this workload). Similar numbers are seen on POWER9.
> 
> Do you happen to know which vmalloc mappings these get used for in the
> above case? Where do we see vmalloc mappings that large?

Large module vmalloc could be subject to huge mappings.

> I'm worried as to how this would interact with the set_memory_*()
> functions, as on arm64 those can only operate on page-granular mappings.
> Those may need fixing up to handle huge mappings; certainly if the above
> is all for modules.

Good point, that looks like it would break on arm64 at least. I'll
work on it. We may have to make this opt in beyond HUGE_VMAP.
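
E.g. keyed off a new, stricter arch opt-in rather than HAVE_ARCH_HUGE_VMAP
directly (hypothetical config name), something like:

	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC)) {
		unsigned long size_per_node;

		size_per_node = size;
		if (node == NUMA_NO_NODE)
			size_per_node /= num_online_nodes();
		if (size_per_node >= PMD_SIZE)
			shift = PMD_SHIFT;
	}

so arm64 and friends only get huge vmalloc mappings once set_memory_*() can
cope with them.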

Thanks,
Nick

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  8:53   ` Anshuman Khandual
@ 2019-06-11  0:16     ` Nicholas Piggin
  2019-06-11  6:59       ` Anshuman Khandual
  0 siblings, 1 reply; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-11  0:16 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

Anshuman Khandual's on June 10, 2019 6:53 pm:
> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>> allocate huge pages and map them.
> 
> IIUC that extends HAVE_ARCH_HUGE_VMAP from ioremap to vmalloc.
> 
>> 
>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>> (performance is in the noise, under 1% difference, page tables are likely
>> to be well cached for this workload). Similar numbers are seen on POWER9.
> 
> Sure will try this on arm64.
> 
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>  include/asm-generic/4level-fixup.h |   1 +
>>  include/asm-generic/5level-fixup.h |   1 +
>>  include/linux/vmalloc.h            |   1 +
>>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>>  4 files changed, 107 insertions(+), 28 deletions(-)
>> 
>> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
>> index e3667c9a33a5..3cc65a4dd093 100644
>> --- a/include/asm-generic/4level-fixup.h
>> +++ b/include/asm-generic/4level-fixup.h
>> @@ -20,6 +20,7 @@
>>  #define pud_none(pud)			0
>>  #define pud_bad(pud)			0
>>  #define pud_present(pud)		1
>> +#define pud_large(pud)			0
>>  #define pud_ERROR(pud)			do { } while (0)
>>  #define pud_clear(pud)			pgd_clear(pud)
>>  #define pud_val(pud)			pgd_val(pud)
>> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
>> index bb6cb347018c..c4377db09a4f 100644
>> --- a/include/asm-generic/5level-fixup.h
>> +++ b/include/asm-generic/5level-fixup.h
>> @@ -22,6 +22,7 @@
>>  #define p4d_none(p4d)			0
>>  #define p4d_bad(p4d)			0
>>  #define p4d_present(p4d)		1
>> +#define p4d_large(p4d)			0
>>  #define p4d_ERROR(p4d)			do { } while (0)
>>  #define p4d_clear(p4d)			pgd_clear(p4d)
>>  #define p4d_val(p4d)			pgd_val(p4d)
> 
> Both of these are required from vmalloc_to_page() which as per a later comment
> should be part of a prerequisite patch before this series.

I'm not sure what you mean. This patch is where they get used.

Possibly I could split this and the vmalloc_to_page change out. I'll
consider it.

>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>> index 812bea5866d6..4c92dc608928 100644
>> --- a/include/linux/vmalloc.h
>> +++ b/include/linux/vmalloc.h
>> @@ -42,6 +42,7 @@ struct vm_struct {
>>  	unsigned long		size;
>>  	unsigned long		flags;
>>  	struct page		**pages;
>> +	unsigned int		page_shift;
> 
> So the entire vm_struct will be mapped with a single page_shift. It cannot have
> mix-and-match mappings with PAGE_SIZE, PMD_SIZE, PUD_SIZE etc. in case the
> allocation fails for larger ones, falls back, or for whatever other reasons.

For now, yes. I have a bit of follow up work to improve that and make
it able to fall back, but it's a bit more churn and not a significant
benefit just yet because there are not a lot of very large vmallocs
(except the early hashes which can be satisfied with large allocs).

> 
>>  	unsigned int		nr_pages;
>>  	phys_addr_t		phys_addr;
>>  	const void		*caller;
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index dd27cfb29b10..0cf8e861caeb 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -36,6 +36,7 @@
>>  #include <linux/rbtree_augmented.h>
>>  
>>  #include <linux/uaccess.h>
>> +#include <asm/pgtable.h>
>>  #include <asm/tlbflush.h>
>>  #include <asm/shmparam.h>
>>  
>> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>>  	return ret;
>>  }
>>  
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> 
> A small nit (if you agree) s/hpages/huge_pages/

Hmm. It's not actually a good function name because it can do small
pages as well. vmap_pages_size_range or something may be better.

> 
>> +				   pgprot_t prot, struct page **pages,
> 
> Re-order (prot <---> pages) just to follow the standard like before.

Will do.

>> +				   unsigned int page_shift)
>> +{
>> +	unsigned long addr = start;
>> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
> 
> s/nr/nr_huge_pages ?

Sure.

> Also, should we not check the alignment of the range [start...end] with
> respect to (1UL << [PAGE_SHIFT + page_shift])?

The caller should, if it specifies a large page. We could check and
return -EINVAL for incorrect alignment.
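
Something like this at the top of vmap_hpages_range(), perhaps (untested):

	if (WARN_ON_ONCE(!IS_ALIGNED(start | end, PAGE_SIZE << page_shift)))
		return -EINVAL;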

>> +
>> +	for (i = 0; i < nr; i++) {
>> +		int err;
>> +
>> +		err = vmap_range_noflush(addr,
>> +					addr + (PAGE_SIZE << page_shift),
>> +					__pa(page_address(pages[i])), prot,
>> +					PAGE_SHIFT + page_shift);
>> +		if (err)
>> +			return err;
>> +
>> +		addr += PAGE_SIZE << page_shift;
>> +	}
>> +	flush_cache_vmap(start, end);
>> +
>> +	return nr;
>> +}
>> +#else
>> +static int vmap_hpages_range(unsigned long start, unsigned long end,
>> +			   pgprot_t prot, struct page **pages,
>> +			   unsigned int page_shift)
>> +{
>> +	BUG_ON(page_shift != PAGE_SIZE);
>> +	return vmap_pages_range(start, end, prot, pages);
>> +}
>> +#endif
>> +
>> +
>>  int is_vmalloc_or_module_addr(const void *x)
>>  {
>>  	/*
>> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>  {
>>  	unsigned long addr = (unsigned long) vmalloc_addr;
>>  	struct page *page = NULL;
>> -	pgd_t *pgd = pgd_offset_k(addr);
>> +	pgd_t *pgd;
>>  	p4d_t *p4d;
>>  	pud_t *pud;
>>  	pmd_t *pmd;
>> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>  	 */
>>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>>  
>> +	pgd = pgd_offset_k(addr);
>>  	if (pgd_none(*pgd))
>>  		return NULL;
>> +
> 
> Small nit. Stray line here.
> 
> 'pgd' related changes here seem to be just cleanups and should not be part of this patch.

Yeah I figure it doesn't matter to make small changes close by, but
maybe that's more frowned upon now for git blame?

>>  	p4d = p4d_offset(pgd, addr);
>>  	if (p4d_none(*p4d))
>>  		return NULL;
>> -	pud = pud_offset(p4d, addr);
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>> +	if (p4d_large(*p4d))
>> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
>> +#endif
>> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
>> +		return NULL;
>>  
>> -	/*
>> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
>> -	 * identify huge mappings, which we may encounter on architectures
>> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
>> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
>> -	 * not [unambiguously] associated with a struct page, so there is
>> -	 * no correct value to return for them.
>> -	 */
> 
> What changed the situation so that we could return struct page for a huge
> mapping now ?

For the PUD case? Nothing changed, per se; I just calculate the
correct struct page now, so I may return it.

> AFAICT even after this patch, PUD/P4D level huge pages can only
> be created with ioremap_page_range() not with vmalloc() which creates PMD
> sized mappings only. Hence if it's okay to dereference struct page of a huge
> mapping (notwithstanding the comment here) it should be part of an earlier
> patch fixing it first for existing ioremap_page_range() huge mappings.

Possibly yes, we can consider 029c54b095995 to be a band-aid for huge
vmaps which is fixed properly by this change, in which case it could
make sense to break this into its own patch.

> 
>> -	WARN_ON_ONCE(pud_bad(*pud));
>> -	if (pud_none(*pud) || pud_bad(*pud))
>> +	pud = pud_offset(p4d, addr);
>> +	if (pud_none(*pud))
>> +		return NULL;
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>> +	if (pud_large(*pud))
>> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
>> +#endif
>> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>>  		return NULL;
>> +
>>  	pmd = pmd_offset(pud, addr);
>> -	WARN_ON_ONCE(pmd_bad(*pmd));
>> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
>> +	if (pmd_none(*pmd))
>> +		return NULL;
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>> +	if (pmd_large(*pmd))
>> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
>> +#endif
>> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>>  		return NULL;
> 
> At each page table level, we are checking in this order
> 
> pXX_none() --> pXX_large() --> pXX_bad()
> 
> Are not these alternative orders a bit better
> 
> pXX_bad() --> pXX_none() --> pXX_large()
> 
> Or
> 
> pXX_none() --> pXX_bad() --> pXX_large()
> 
> Checking for pXX_bad() at the end does not make much sense.

Yeah, the order tends to go none->bad. It's not 100% clear we can
test bad before none (at least it goes against convention). But good
point, it should be changed to your last sequence I think.

> 
>>  
>>  	ptep = pte_offset_map(pmd, addr);
>> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>  	if (pte_present(pte))
>>  		page = pte_page(pte);
>>  	pte_unmap(ptep);
>> +
> 
> Small nit. Stray line here.

I don't mind adding some lines here and there, like here. It is an
unrelated (alleged-)cleanup though.

> 
>>  	return page;
>>  }
>>  EXPORT_SYMBOL(vmalloc_to_page);
>> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>>  		return NULL;
>>  
>>  	if (flags & VM_IOREMAP)
>> -		align = 1ul << clamp_t(int, get_count_order_long(size),
>> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
>> +		align = max(align,
>> +				1ul << clamp_t(int, get_count_order_long(size),
>> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>>  
>>  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>>  	if (unlikely(!area))
>> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>>  			struct page *page = area->pages[i];
>>  
>>  			BUG_ON(!page);
>> -			__free_pages(page, 0);
>> +			__free_pages(page, area->page_shift);
> 
> 'area->page_shift' turns out to be the effective page order. I think the name here is a bit
> misleading. s/page_shift/page_order or nr_pages should be better IMHO. page_shift is
> not an actual shift (1UL << area->page_shift does not give the size) nor does it sound like a page 'order'.

Yeah good point.

>>  		}
>>  
>>  		kvfree(area->pages);
>> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>  				 pgprot_t prot, int node)
>>  {
>>  	struct page **pages;
>> +	unsigned long addr = (unsigned long)area->addr;
>> +	unsigned long size = get_vm_area_size(area);
>> +	unsigned int page_shift = area->page_shift;
>> +	unsigned int shift = page_shift + PAGE_SHIFT;
>>  	unsigned int nr_pages, array_size, i;
>>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>>  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>>  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
>> -					0 :
>> -					__GFP_HIGHMEM;
>> +					0 : __GFP_HIGHMEM;
>>  
>> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
>> +	nr_pages = size >> shift;
>>  	array_size = (nr_pages * sizeof(struct page *));
>>  
>>  	area->nr_pages = nr_pages;
>> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>  	for (i = 0; i < area->nr_pages; i++) {
>>  		struct page *page;
>>  
>> -		if (node == NUMA_NO_NODE)
>> -			page = alloc_page(alloc_mask|highmem_mask);
>> -		else
>> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
>> +		page = alloc_pages_node(node,
>> +				alloc_mask|highmem_mask, page_shift);
> 
> alloc_mask remains exactly the same as before even for these high-order pages.

Is there a problem there? I don't see one.

>>  
>>  		if (unlikely(!page)) {
>>  			/* Successfully allocated i pages, free them in __vunmap() */
>> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>  			cond_resched();
>>  	}
>>  
>> -	if (map_vm_area(area, prot, pages))
>> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>>  		goto fail;
>> +
>>  	return area->addr;
>>  
>>  fail:
>> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>>  			pgprot_t prot, unsigned long vm_flags, int node,
>>  			const void *caller)
>>  {
>> -	struct vm_struct *area;
>> +	struct vm_struct *area = NULL;
>>  	void *addr;
>>  	unsigned long real_size = size;
>> +	unsigned long real_align = align;
>> +	unsigned int shift = PAGE_SHIFT;
>>  
>>  	size = PAGE_ALIGN(size);
>>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>>  		goto fail;
>>  
>> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
>> +		unsigned long size_per_node;
>> +
>> +		size_per_node = size;
>> +		if (node == NUMA_NO_NODE)
>> +			size_per_node /= num_online_nodes();
>> +		if (size_per_node >= PMD_SIZE)
>> +			shift = PMD_SHIFT;
> 
> There are a few problems here.
> 
> 1. Should not size_per_node be aligned to PMD_SIZE, to avoid wasting memory later
>    when the size gets aligned upwards (making it worse for NUMA_NO_NODE)?

I'm not sure what you mean, it's just a heuristic to check for node
interleaving, and use small pages if large pages cannot interleave well.

> 2. What about PUD_SIZE, which is not considered here at all?

Yeah, not doing PUD pages at all. It would be pretty trivial to add 
after PMD is working, but would it actually get used anywhere?

> 3. We should have knobs similar to ioremap's for controlling the different huge mapping sizes:
> 
> static int __read_mostly ioremap_p4d_capable;
> static int __read_mostly ioremap_pud_capable;
> static int __read_mostly ioremap_pmd_capable;
> static int __read_mostly ioremap_huge_disabled;
> 
> while also giving the arch a chance to weigh in through overrides similar to these:
> 
> arch_ioremap_[pud|pmd]_supported() ---> probably unifying it for vmalloc() 

I'm not sure if that makes sense. I could see IO mappings maybe having
some quirks or bugs or support issues. Cacheable mappings should be the
"base" for supporting larger pages; if the platform has trouble with
those then it should just avoid the feature, I think.

Or is there a good reason to add the option? I don't mind, I just want
to avoid proliferation.

>> +	}
>> +again:
>> +	align = max(real_align, 1UL << shift);
>> +	size = ALIGN(real_size, align);
>> +
>>  	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>>  				vm_flags, start, end, node, gfp_mask, caller);
>>  	if (!area)
>>  		goto fail;
>>  
>> +	area->page_shift = shift - PAGE_SHIFT;
>> +
>>  	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>>  	if (!addr)
>> -		return NULL;
>> +		goto fail;
>>  
>>  	/*
>>  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
>> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>>  	return addr;
>>  
>>  fail:
>> -	warn_alloc(gfp_mask, NULL,
>> +	if (shift == PMD_SHIFT) {
>> +		shift = PAGE_SHIFT;
>> +		goto again;
>> +	}
> 
> PUD_SHIFT should be accommodated here as well when falling back to lower
> mapping sizes after a previous allocation attempt fails.
> 

This also would be part of PUD support.

Thanks,
Nick

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
  2019-06-10  4:38 [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Nicholas Piggin
                   ` (3 preceding siblings ...)
  2019-06-10  5:42 ` [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Anshuman Khandual
@ 2019-06-11  5:24 ` Christophe Leroy
  2019-06-19  3:43   ` Nicholas Piggin
  4 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2019-06-11  5:24 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel



On 10/06/2019 at 06:38, Nicholas Piggin wrote:
> ioremap_page_range is a generic function to create a kernel virtual
> mapping, move it to mm/vmalloc.c and rename it vmap_range.
> 
> For clarity with this move, also:
> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
> - Rename vmap_page_range (which takes a page array) to vmap_pages.

Maybe it would be easier to follow the change if the renaming was
done in a separate patch from the move.

> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> 
> Fixed up the arm64 compile errors, fixed a few bugs, and tidied
> things up a bit more.
> 
> Have tested powerpc and x86 but not arm64, would appreciate a review
> and test of the arm64 patch if possible.
> 
>   include/linux/vmalloc.h |   3 +
>   lib/ioremap.c           | 173 +++---------------------------
>   mm/vmalloc.c            | 228 ++++++++++++++++++++++++++++++++++++----
>   3 files changed, 229 insertions(+), 175 deletions(-)
> 
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 51e131245379..812bea5866d6 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -147,6 +147,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
>   extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
>   			struct page **pages);
>   #ifdef CONFIG_MMU
> +extern int vmap_range(unsigned long addr,
> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +		       unsigned int max_page_shift);

Drop extern keyword here.

As checkpatch tells you, 'CHECK:AVOID_EXTERNS: extern prototypes should 
be avoided in .h files'
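
i.e. just:

int vmap_range(unsigned long addr, unsigned long end,
	       phys_addr_t phys_addr, pgprot_t prot,
	       unsigned int max_page_shift);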

Christophe

>   extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
>   				    pgprot_t prot, struct page **pages);
>   extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
> diff --git a/lib/ioremap.c b/lib/ioremap.c
> index 063213685563..e13946da8ec3 100644
> --- a/lib/ioremap.c
> +++ b/lib/ioremap.c
> @@ -58,165 +58,24 @@ static inline int ioremap_pud_enabled(void) { return 0; }
>   static inline int ioremap_pmd_enabled(void) { return 0; }
>   #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
>   
> -static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	pte_t *pte;
> -	u64 pfn;
> -
> -	pfn = phys_addr >> PAGE_SHIFT;
> -	pte = pte_alloc_kernel(pmd, addr);
> -	if (!pte)
> -		return -ENOMEM;
> -	do {
> -		BUG_ON(!pte_none(*pte));
> -		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
> -		pfn++;
> -	} while (pte++, addr += PAGE_SIZE, addr != end);
> -	return 0;
> -}
> -
> -static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
> -				unsigned long end, phys_addr_t phys_addr,
> -				pgprot_t prot)
> -{
> -	if (!ioremap_pmd_enabled())
> -		return 0;
> -
> -	if ((end - addr) != PMD_SIZE)
> -		return 0;
> -
> -	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
> -		return 0;
> -
> -	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
> -		return 0;
> -
> -	return pmd_set_huge(pmd, phys_addr, prot);
> -}
> -
> -static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	pmd_t *pmd;
> -	unsigned long next;
> -
> -	pmd = pmd_alloc(&init_mm, pud, addr);
> -	if (!pmd)
> -		return -ENOMEM;
> -	do {
> -		next = pmd_addr_end(addr, end);
> -
> -		if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot))
> -			continue;
> -
> -		if (ioremap_pte_range(pmd, addr, next, phys_addr, prot))
> -			return -ENOMEM;
> -	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
> -	return 0;
> -}
> -
> -static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
> -				unsigned long end, phys_addr_t phys_addr,
> -				pgprot_t prot)
> -{
> -	if (!ioremap_pud_enabled())
> -		return 0;
> -
> -	if ((end - addr) != PUD_SIZE)
> -		return 0;
> -
> -	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
> -		return 0;
> -
> -	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
> -		return 0;
> -
> -	return pud_set_huge(pud, phys_addr, prot);
> -}
> -
> -static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	pud_t *pud;
> -	unsigned long next;
> -
> -	pud = pud_alloc(&init_mm, p4d, addr);
> -	if (!pud)
> -		return -ENOMEM;
> -	do {
> -		next = pud_addr_end(addr, end);
> -
> -		if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot))
> -			continue;
> -
> -		if (ioremap_pmd_range(pud, addr, next, phys_addr, prot))
> -			return -ENOMEM;
> -	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
> -	return 0;
> -}
> -
> -static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
> -				unsigned long end, phys_addr_t phys_addr,
> -				pgprot_t prot)
> -{
> -	if (!ioremap_p4d_enabled())
> -		return 0;
> -
> -	if ((end - addr) != P4D_SIZE)
> -		return 0;
> -
> -	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
> -		return 0;
> -
> -	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
> -		return 0;
> -
> -	return p4d_set_huge(p4d, phys_addr, prot);
> -}
> -
> -static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
> -		unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> -{
> -	p4d_t *p4d;
> -	unsigned long next;
> -
> -	p4d = p4d_alloc(&init_mm, pgd, addr);
> -	if (!p4d)
> -		return -ENOMEM;
> -	do {
> -		next = p4d_addr_end(addr, end);
> -
> -		if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot))
> -			continue;
> -
> -		if (ioremap_pud_range(p4d, addr, next, phys_addr, prot))
> -			return -ENOMEM;
> -	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
> -	return 0;
> -}
> -
>   int ioremap_page_range(unsigned long addr,
>   		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
>   {
> -	pgd_t *pgd;
> -	unsigned long start;
> -	unsigned long next;
> -	int err;
> -
> -	might_sleep();
> -	BUG_ON(addr >= end);
> -
> -	start = addr;
> -	pgd = pgd_offset_k(addr);
> -	do {
> -		next = pgd_addr_end(addr, end);
> -		err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot);
> -		if (err)
> -			break;
> -	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
> -
> -	flush_cache_vmap(start, end);
> +	unsigned int max_page_shift = PAGE_SHIFT;
> +
> +	/*
> +	 * Due to the max_page_shift parameter to vmap_range, platforms must
> +	 * enable all smaller sizes to take advantage of a given size,
> +	 * otherwise fall back to small pages.
> +	 */
> +	if (ioremap_pmd_enabled()) {
> +		max_page_shift = PMD_SHIFT;
> +		if (ioremap_pud_enabled()) {
> +			max_page_shift = PUD_SHIFT;
> +			if (ioremap_p4d_enabled())
> +				max_page_shift = P4D_SHIFT;
> +		}
> +	}
>   
> -	return err;
> +	return vmap_range(addr, end, phys_addr, prot, max_page_shift);
>   }
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 233af6936c93..dd27cfb29b10 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -119,7 +119,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
>   	} while (p4d++, addr = next, addr != end);
>   }
>   
> -static void vunmap_page_range(unsigned long addr, unsigned long end)
> +static void vunmap_range(unsigned long addr, unsigned long end)
>   {
>   	pgd_t *pgd;
>   	unsigned long next;
> @@ -135,6 +135,198 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
>   }
>   
>   static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
> +{
> +	pte_t *pte;
> +	u64 pfn;
> +
> +	pfn = phys_addr >> PAGE_SHIFT;
> +	pte = pte_alloc_kernel(pmd, addr);
> +	if (!pte)
> +		return -ENOMEM;
> +	do {
> +		BUG_ON(!pte_none(*pte));
> +		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
> +		pfn++;
> +	} while (pte++, addr += PAGE_SIZE, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> +		return 0;
> +
> +	if (max_page_shift < PMD_SHIFT)
> +		return 0;
> +
> +	if ((end - addr) != PMD_SIZE)
> +		return 0;
> +
> +	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
> +		return 0;
> +
> +	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
> +		return 0;
> +
> +	return pmd_set_huge(pmd, phys_addr, prot);
> +}
> +
> +static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	pmd_t *pmd;
> +	unsigned long next;
> +
> +	pmd = pmd_alloc(&init_mm, pud, addr);
> +	if (!pmd)
> +		return -ENOMEM;
> +	do {
> +		next = pmd_addr_end(addr, end);
> +
> +		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			continue;
> +
> +		if (vmap_pte_range(pmd, addr, next, phys_addr, prot))
> +			return -ENOMEM;
> +	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_try_huge_pud(pud_t *pud, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> +		return 0;
> +
> +	if (max_page_shift < PUD_SHIFT)
> +		return 0;
> +
> +	if ((end - addr) != PUD_SIZE)
> +		return 0;
> +
> +	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
> +		return 0;
> +
> +	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
> +		return 0;
> +
> +	return pud_set_huge(pud, phys_addr, prot);
> +}
> +
> +static inline int vmap_pud_range(p4d_t *p4d, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	pud_t *pud;
> +	unsigned long next;
> +
> +	pud = pud_alloc(&init_mm, p4d, addr);
> +	if (!pud)
> +		return -ENOMEM;
> +	do {
> +		next = pud_addr_end(addr, end);
> +
> +		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			continue;
> +
> +		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			return -ENOMEM;
> +	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
> +		return 0;
> +
> +	if (max_page_shift < P4D_SHIFT)
> +		return 0;
> +
> +	if ((end - addr) != P4D_SIZE)
> +		return 0;
> +
> +	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
> +		return 0;
> +
> +	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
> +		return 0;
> +
> +	return p4d_set_huge(p4d, phys_addr, prot);
> +}
> +
> +static inline int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	p4d_t *p4d;
> +	unsigned long next;
> +
> +	p4d = p4d_alloc(&init_mm, pgd, addr);
> +	if (!p4d)
> +		return -ENOMEM;
> +	do {
> +		next = p4d_addr_end(addr, end);
> +
> +		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			continue;
> +
> +		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
> +					max_page_shift))
> +			return -ENOMEM;
> +	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
> +	return 0;
> +}
> +
> +static int vmap_range_noflush(unsigned long addr,
> +			unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +			unsigned int max_page_shift)
> +{
> +	pgd_t *pgd;
> +	unsigned long start;
> +	unsigned long next;
> +	int err;
> +
> +	might_sleep();
> +	BUG_ON(addr >= end);
> +
> +	start = addr;
> +	pgd = pgd_offset_k(addr);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
> +					max_page_shift);
> +		if (err)
> +			break;
> +	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
> +
> +	return err;
> +}
> +
> +int vmap_range(unsigned long addr,
> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
> +		       unsigned int max_page_shift)
> +{
> +	int ret;
> +
> +	ret = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
> +	flush_cache_vmap(addr, end);
> +
> +	return ret;
> +}
> +
> +static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	pte_t *pte;
> @@ -160,7 +352,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
>   	return 0;
>   }
>   
> -static int vmap_pmd_range(pud_t *pud, unsigned long addr,
> +static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	pmd_t *pmd;
> @@ -171,13 +363,13 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr,
>   		return -ENOMEM;
>   	do {
>   		next = pmd_addr_end(addr, end);
> -		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
> +		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr))
>   			return -ENOMEM;
>   	} while (pmd++, addr = next, addr != end);
>   	return 0;
>   }
>   
> -static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
> +static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	pud_t *pud;
> @@ -188,13 +380,13 @@ static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
>   		return -ENOMEM;
>   	do {
>   		next = pud_addr_end(addr, end);
> -		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
> +		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr))
>   			return -ENOMEM;
>   	} while (pud++, addr = next, addr != end);
>   	return 0;
>   }
>   
> -static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
> +static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
>   		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
>   {
>   	p4d_t *p4d;
> @@ -205,7 +397,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
>   		return -ENOMEM;
>   	do {
>   		next = p4d_addr_end(addr, end);
> -		if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
> +		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr))
>   			return -ENOMEM;
>   	} while (p4d++, addr = next, addr != end);
>   	return 0;
> @@ -217,7 +409,7 @@ static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
>    *
>    * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
>    */
> -static int vmap_page_range_noflush(unsigned long start, unsigned long end,
> +static int vmap_pages_range_noflush(unsigned long start, unsigned long end,
>   				   pgprot_t prot, struct page **pages)
>   {
>   	pgd_t *pgd;
> @@ -230,7 +422,7 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
>   	pgd = pgd_offset_k(addr);
>   	do {
>   		next = pgd_addr_end(addr, end);
> -		err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
> +		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr);
>   		if (err)
>   			return err;
>   	} while (pgd++, addr = next, addr != end);
> @@ -238,12 +430,12 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
>   	return nr;
>   }
>   
> -static int vmap_page_range(unsigned long start, unsigned long end,
> +static int vmap_pages_range(unsigned long start, unsigned long end,
>   			   pgprot_t prot, struct page **pages)
>   {
>   	int ret;
>   
> -	ret = vmap_page_range_noflush(start, end, prot, pages);
> +	ret = vmap_pages_range_noflush(start, end, prot, pages);
>   	flush_cache_vmap(start, end);
>   	return ret;
>   }
> @@ -1148,7 +1340,7 @@ static void free_vmap_area(struct vmap_area *va)
>    */
>   static void unmap_vmap_area(struct vmap_area *va)
>   {
> -	vunmap_page_range(va->va_start, va->va_end);
> +	vunmap_range(va->va_start, va->va_end);
>   }
>   
>   /*
> @@ -1586,7 +1778,7 @@ static void vb_free(const void *addr, unsigned long size)
>   	rcu_read_unlock();
>   	BUG_ON(!vb);
>   
> -	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
> +	vunmap_range((unsigned long)addr, (unsigned long)addr + size);
>   
>   	if (debug_pagealloc_enabled())
>   		flush_tlb_kernel_range((unsigned long)addr,
> @@ -1736,7 +1928,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
>   		addr = va->va_start;
>   		mem = (void *)addr;
>   	}
> -	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
> +	if (vmap_pages_range(addr, addr + size, prot, pages) < 0) {
>   		vm_unmap_ram(mem, count);
>   		return NULL;
>   	}
> @@ -1903,7 +2095,7 @@ void __init vmalloc_init(void)
>   int map_kernel_range_noflush(unsigned long addr, unsigned long size,
>   			     pgprot_t prot, struct page **pages)
>   {
> -	return vmap_page_range_noflush(addr, addr + size, prot, pages);
> +	return vmap_pages_range_noflush(addr, addr + size, prot, pages);
>   }
>   
>   /**
> @@ -1922,7 +2114,7 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size,
>    */
>   void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
>   {
> -	vunmap_page_range(addr, addr + size);
> +	vunmap_range(addr, addr + size);
>   }
>   EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
>   
> @@ -1939,7 +2131,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
>   	unsigned long end = addr + size;
>   
>   	flush_cache_vunmap(addr, end);
> -	vunmap_page_range(addr, end);
> +	vunmap_range(addr, end);
>   	flush_tlb_kernel_range(addr, end);
>   }
>   EXPORT_SYMBOL_GPL(unmap_kernel_range);
> @@ -1950,7 +2142,7 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
>   	unsigned long end = addr + get_vm_area_size(area);
>   	int err;
>   
> -	err = vmap_page_range(addr, end, prot, pages);
> +	err = vmap_pages_range(addr, end, prot, pages);
>   
>   	return err > 0 ? 0 : err;
>   }
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
                     ` (2 preceding siblings ...)
  2019-06-10 14:10   ` Mark Rutland
@ 2019-06-11  5:39   ` Christophe Leroy
  2019-06-19  3:39     ` Nicholas Piggin
  3 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2019-06-11  5:39 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm, Russell Currey; +Cc: linuxppc-dev, linux-arm-kernel



On 10/06/2019 at 06:38, Nicholas Piggin wrote:
> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
> allocate huge pages and map them

Will this be compatible with Russell's series
https://patchwork.ozlabs.org/patch/1099857/ for the implementation of
STRICT_MODULE_RWX?
I see that apply_to_page_range() has things like BUG_ON(pud_huge(*pud));

Might also be an issue for arm64 as I think Russell's implementation 
comes from there.

> 
> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
> (performance is in the noise, under 1% difference, page tables are likely
> to be well cached for this workload). Similar numbers are seen on POWER9.
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   include/asm-generic/4level-fixup.h |   1 +
>   include/asm-generic/5level-fixup.h |   1 +
>   include/linux/vmalloc.h            |   1 +
>   mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>   4 files changed, 107 insertions(+), 28 deletions(-)
> 
> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
> index e3667c9a33a5..3cc65a4dd093 100644
> --- a/include/asm-generic/4level-fixup.h
> +++ b/include/asm-generic/4level-fixup.h
> @@ -20,6 +20,7 @@
>   #define pud_none(pud)			0
>   #define pud_bad(pud)			0
>   #define pud_present(pud)		1
> +#define pud_large(pud)			0
>   #define pud_ERROR(pud)			do { } while (0)
>   #define pud_clear(pud)			pgd_clear(pud)
>   #define pud_val(pud)			pgd_val(pud)
> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
> index bb6cb347018c..c4377db09a4f 100644
> --- a/include/asm-generic/5level-fixup.h
> +++ b/include/asm-generic/5level-fixup.h
> @@ -22,6 +22,7 @@
>   #define p4d_none(p4d)			0
>   #define p4d_bad(p4d)			0
>   #define p4d_present(p4d)		1
> +#define p4d_large(p4d)			0
>   #define p4d_ERROR(p4d)			do { } while (0)
>   #define p4d_clear(p4d)			pgd_clear(p4d)
>   #define p4d_val(p4d)			pgd_val(p4d)
> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> index 812bea5866d6..4c92dc608928 100644
> --- a/include/linux/vmalloc.h
> +++ b/include/linux/vmalloc.h
> @@ -42,6 +42,7 @@ struct vm_struct {
>   	unsigned long		size;
>   	unsigned long		flags;
>   	struct page		**pages;
> +	unsigned int		page_shift;
>   	unsigned int		nr_pages;
>   	phys_addr_t		phys_addr;
>   	const void		*caller;
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index dd27cfb29b10..0cf8e861caeb 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -36,6 +36,7 @@
>   #include <linux/rbtree_augmented.h>
>   
>   #include <linux/uaccess.h>
> +#include <asm/pgtable.h>
>   #include <asm/tlbflush.h>
>   #include <asm/shmparam.h>
>   
> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>   	return ret;
>   }
>   
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +				   pgprot_t prot, struct page **pages,
> +				   unsigned int page_shift)
> +{
> +	unsigned long addr = start;
> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
> +
> +	for (i = 0; i < nr; i++) {
> +		int err;
> +
> +		err = vmap_range_noflush(addr,
> +					addr + (PAGE_SIZE << page_shift),
> +					__pa(page_address(pages[i])), prot,
> +					PAGE_SHIFT + page_shift);
> +		if (err)
> +			return err;
> +
> +		addr += PAGE_SIZE << page_shift;
> +	}
> +	flush_cache_vmap(start, end);
> +
> +	return nr;
> +}
> +#else
> +static int vmap_hpages_range(unsigned long start, unsigned long end,
> +			   pgprot_t prot, struct page **pages,
> +			   unsigned int page_shift)
> +{
> +	BUG_ON(page_shift != PAGE_SIZE);

Do we really need a BUG_ON() there? What happens if this condition is
true?
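
Also, page_shift is compared against PAGE_SIZE here, while above it is used as
an extra shift on top of PAGE_SHIFT, so should this not be a check against 0?
Maybe something like this instead of the BUG_ON() (untested):

	if (WARN_ON_ONCE(page_shift))
		return -EINVAL;

	return vmap_pages_range(start, end, prot, pages);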

> +	return vmap_pages_range(start, end, prot, pages);
> +}
> +#endif
> +
> +
>   int is_vmalloc_or_module_addr(const void *x)
>   {
>   	/*
> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>   {
>   	unsigned long addr = (unsigned long) vmalloc_addr;
>   	struct page *page = NULL;
> -	pgd_t *pgd = pgd_offset_k(addr);
> +	pgd_t *pgd;
>   	p4d_t *p4d;
>   	pud_t *pud;
>   	pmd_t *pmd;
> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>   	 */
>   	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>   
> +	pgd = pgd_offset_k(addr);
>   	if (pgd_none(*pgd))
>   		return NULL;
> +
>   	p4d = p4d_offset(pgd, addr);
>   	if (p4d_none(*p4d))
>   		return NULL;
> -	pud = pud_offset(p4d, addr);
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

Do we really need that ifdef? Won't p4d_large() always return 0 when it is
not set?
Otherwise, could we use IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP) instead?

Same several places below.
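
i.e. something like:

	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP) && p4d_large(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);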

> +	if (p4d_large(*p4d))
> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
> +		return NULL;
>   
> -	/*
> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
> -	 * identify huge mappings, which we may encounter on architectures
> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
> -	 * not [unambiguously] associated with a struct page, so there is
> -	 * no correct value to return for them.
> -	 */
> -	WARN_ON_ONCE(pud_bad(*pud));
> -	if (pud_none(*pud) || pud_bad(*pud))
> +	pud = pud_offset(p4d, addr);
> +	if (pud_none(*pud))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pud_large(*pud))
> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>   		return NULL;
> +
>   	pmd = pmd_offset(pud, addr);
> -	WARN_ON_ONCE(pmd_bad(*pmd));
> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
> +	if (pmd_none(*pmd))
> +		return NULL;
> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> +	if (pmd_large(*pmd))
> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> +#endif
> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>   		return NULL;
>   
>   	ptep = pte_offset_map(pmd, addr);
> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>   	if (pte_present(pte))
>   		page = pte_page(pte);
>   	pte_unmap(ptep);
> +
>   	return page;
>   }
>   EXPORT_SYMBOL(vmalloc_to_page);
> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>   		return NULL;
>   
>   	if (flags & VM_IOREMAP)
> -		align = 1ul << clamp_t(int, get_count_order_long(size),
> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
> +		align = max(align,
> +				1ul << clamp_t(int, get_count_order_long(size),
> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>   
>   	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>   	if (unlikely(!area))
> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>   			struct page *page = area->pages[i];
>   
>   			BUG_ON(!page);
> -			__free_pages(page, 0);
> +			__free_pages(page, area->page_shift);
>   		}
>   
>   		kvfree(area->pages);
> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>   				 pgprot_t prot, int node)
>   {
>   	struct page **pages;
> +	unsigned long addr = (unsigned long)area->addr;
> +	unsigned long size = get_vm_area_size(area);
> +	unsigned int page_shift = area->page_shift;
> +	unsigned int shift = page_shift + PAGE_SHIFT;
>   	unsigned int nr_pages, array_size, i;
>   	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>   	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>   	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
> -					0 :
> -					__GFP_HIGHMEM;
> +					0 : __GFP_HIGHMEM;

This patch is already quite big; shouldn't this kind of unrelated
cleanup be in another patch?

>   
> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
> +	nr_pages = size >> shift;
>   	array_size = (nr_pages * sizeof(struct page *));
>   
>   	area->nr_pages = nr_pages;
> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>   	for (i = 0; i < area->nr_pages; i++) {
>   		struct page *page;
>   
> -		if (node == NUMA_NO_NODE)
> -			page = alloc_page(alloc_mask|highmem_mask);
> -		else
> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
> +		page = alloc_pages_node(node,
> +				alloc_mask|highmem_mask, page_shift);

This is also a nice cleanup, but does it really belong in this patch ?

>   
>   		if (unlikely(!page)) {
>   			/* Successfully allocated i pages, free them in __vunmap() */
> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>   			cond_resched();
>   	}
>   
> -	if (map_vm_area(area, prot, pages))
> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>   		goto fail;
> +

Cleanup ?

>   	return area->addr;
>   
>   fail:
> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>   			pgprot_t prot, unsigned long vm_flags, int node,
>   			const void *caller)
>   {
> -	struct vm_struct *area;
> +	struct vm_struct *area = NULL;
>   	void *addr;
>   	unsigned long real_size = size;
> +	unsigned long real_align = align;
> +	unsigned int shift = PAGE_SHIFT;
>   
>   	size = PAGE_ALIGN(size);
>   	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>   		goto fail;
>   
> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
> +		unsigned long size_per_node;
> +
> +		size_per_node = size;
> +		if (node == NUMA_NO_NODE)
> +			size_per_node /= num_online_nodes();
> +		if (size_per_node >= PMD_SIZE)
> +			shift = PMD_SHIFT;
> +	}
> +again:
> +	align = max(real_align, 1UL << shift);
> +	size = ALIGN(real_size, align);
> +
>   	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
>   				vm_flags, start, end, node, gfp_mask, caller);
>   	if (!area)
>   		goto fail;
>   
> +	area->page_shift = shift - PAGE_SHIFT;
> +
>   	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
>   	if (!addr)
> -		return NULL;
> +		goto fail;
>   
>   	/*
>   	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
> @@ -2648,8 +2716,16 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>   	return addr;
>   
>   fail:
> -	warn_alloc(gfp_mask, NULL,
> +	if (shift == PMD_SHIFT) {
> +		shift = PAGE_SHIFT;
> +		goto again;
> +	}
> +
> +	if (!area) {
> +		/* Warn for area allocation, page allocations already warn */
> +		warn_alloc(gfp_mask, NULL,
>   			  "vmalloc: allocation failure: %lu bytes", real_size);
> +	}
>   	return NULL;
>   }
>   
> 

Christophe


* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-10 14:44     ` Nicholas Piggin
@ 2019-06-11  6:17       ` Anshuman Khandual
  2019-06-19  3:33         ` Nicholas Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Anshuman Khandual @ 2019-06-11  6:17 UTC (permalink / raw)
  To: Nicholas Piggin, Mark Rutland; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel



On 06/10/2019 08:14 PM, Nicholas Piggin wrote:
> Mark Rutland's on June 11, 2019 12:10 am:
>> Hi,
>>
>> On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
>>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>>> allocate huge pages and map them
>>>
>>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>>> (performance is in the noise, under 1% difference, page tables are likely
>>> to be well cached for this workload). Similar numbers are seen on POWER9.
>>
>> Do you happen to know which vmalloc mappings these get used for in the
>> above case? Where do we see vmalloc mappings that large?
> 
> Large module vmalloc could be subject to huge mappings.
> 
>> I'm worried as to how this would interact with the set_memory_*()
>> functions, as on arm64 those can only operate on page-granular mappings.
>> Those may need fixing up to handle huge mappings; certainly if the above
>> is all for modules.
> 
> Good point, that looks like it would break on arm64 at least. I'll
> work on it. We may have to make this opt in beyond HUGE_VMAP.

This is another reason we might need to have an arch opt-in like the one
I mentioned before.


* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-11  0:16     ` Nicholas Piggin
@ 2019-06-11  6:59       ` Anshuman Khandual
  2019-06-19  3:29         ` Nicholas Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Anshuman Khandual @ 2019-06-11  6:59 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, Ard Biesheuvel, linux-arm-kernel

On 06/11/2019 05:46 AM, Nicholas Piggin wrote:
> Anshuman Khandual's on June 10, 2019 6:53 pm:
>> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>>> allocate huge pages and map them.
>>
>> IIUC that extends HAVE_ARCH_HUGE_VMAP from ioremap to vmalloc.
>>
>>>
>>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>>> (performance is in the noise, under 1% difference, page tables are likely
>>> to be well cached for this workload). Similar numbers are seen on POWER9.
>>
>> Sure will try this on arm64.
>>
>>>
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>> ---
>>>  include/asm-generic/4level-fixup.h |   1 +
>>>  include/asm-generic/5level-fixup.h |   1 +
>>>  include/linux/vmalloc.h            |   1 +
>>>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>>>  4 files changed, 107 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
>>> index e3667c9a33a5..3cc65a4dd093 100644
>>> --- a/include/asm-generic/4level-fixup.h
>>> +++ b/include/asm-generic/4level-fixup.h
>>> @@ -20,6 +20,7 @@
>>>  #define pud_none(pud)			0
>>>  #define pud_bad(pud)			0
>>>  #define pud_present(pud)		1
>>> +#define pud_large(pud)			0
>>>  #define pud_ERROR(pud)			do { } while (0)
>>>  #define pud_clear(pud)			pgd_clear(pud)
>>>  #define pud_val(pud)			pgd_val(pud)
>>> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
>>> index bb6cb347018c..c4377db09a4f 100644
>>> --- a/include/asm-generic/5level-fixup.h
>>> +++ b/include/asm-generic/5level-fixup.h
>>> @@ -22,6 +22,7 @@
>>>  #define p4d_none(p4d)			0
>>>  #define p4d_bad(p4d)			0
>>>  #define p4d_present(p4d)		1
>>> +#define p4d_large(p4d)			0
>>>  #define p4d_ERROR(p4d)			do { } while (0)
>>>  #define p4d_clear(p4d)			pgd_clear(p4d)
>>>  #define p4d_val(p4d)			pgd_val(p4d)
>>
>> Both of these are required from vmalloc_to_page() which as per a later comment
>> should be part of a prerequisite patch before this series.
> 
> I'm not sure what you mean. This patch is where they get used.

In case you move out vmalloc_to_page() changes to a separate patch.

> 
> Possibly I could split this and the vmalloc_to_page change out. I'll
> consider it.
> 
>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>> index 812bea5866d6..4c92dc608928 100644
>>> --- a/include/linux/vmalloc.h
>>> +++ b/include/linux/vmalloc.h
>>> @@ -42,6 +42,7 @@ struct vm_struct {
>>>  	unsigned long		size;
>>>  	unsigned long		flags;
>>>  	struct page		**pages;
>>> +	unsigned int		page_shift;
>>
>> So the entire vm_struct will be mapped with a single page_shift. It cannot have
>> mix and match mappings with PAGE_SIZE, PMD_SIZE, PUD_SIZE etc in case the
>> allocation fails for larger ones and has to fall back, or for other reasons.
> 
> For now, yes. I have a bit of follow up work to improve that and make
> it able to fall back, but it's a bit more churn and not a significant
> benefit just yet because there are not a lot of very large vmallocs
> (except the early hashes which can be satisfied with large allocs).

Right, but it will make this new feature complete like ioremap, which logically
supports mappings up to the P4D level (though AFAICT that is not used). If there
are no actual vmalloc requests that large it is fine. Allocation attempts will
start from the page table level appropriate to the requested size. It is better
to have PUD/P4D considerations now rather than trying to retrofit them later.

> 
>>
>>>  	unsigned int		nr_pages;
>>>  	phys_addr_t		phys_addr;
>>>  	const void		*caller;
>>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>>> index dd27cfb29b10..0cf8e861caeb 100644
>>> --- a/mm/vmalloc.c
>>> +++ b/mm/vmalloc.c
>>> @@ -36,6 +36,7 @@
>>>  #include <linux/rbtree_augmented.h>
>>>  
>>>  #include <linux/uaccess.h>
>>> +#include <asm/pgtable.h>
>>>  #include <asm/tlbflush.h>
>>>  #include <asm/shmparam.h>
>>>  
>>> @@ -440,6 +441,41 @@ static int vmap_pages_range(unsigned long start, unsigned long end,
>>>  	return ret;
>>>  }
>>>  
>>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>> +static int vmap_hpages_range(unsigned long start, unsigned long end,
>>
>> A small nit (if you agree) s/hpages/huge_pages/
> 
> Hmm. It's not actually a good function name because it can do small
> pages as well. vmap_pages_size_range or something may be better.

Right.

> 
>>
>>> +				   pgprot_t prot, struct page **pages,
>>
>> Re-order (prot <---> pages) just to follow the standard like before.
> 
> Will do.
> 
>>> +				   unsigned int page_shift)
>>> +{
>>> +	unsigned long addr = start;
>>> +	unsigned int i, nr = (end - start) >> (PAGE_SHIFT + page_shift);
>>
>> s/nr/nr_huge_pages ?
> 
> Sure.
> 
>> Also should not we check for the alignment of the range [start...end] with
>> respect to (1UL << [PAGE_SHIFT + page_shift]).
> 
> The caller should if it specifies large page. Could check and -EINVAL
> for incorrect alignment.

That might be a good check here.
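
Roughly, at the top of vmap_hpages_range() (a sketch, reusing the names
from the quoted hunk):

	if (!IS_ALIGNED(start, 1UL << (PAGE_SHIFT + page_shift)) ||
	    !IS_ALIGNED(end, 1UL << (PAGE_SHIFT + page_shift)))
		return -EINVAL;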

> 
>>> +
>>> +	for (i = 0; i < nr; i++) {
>>> +		int err;
>>> +
>>> +		err = vmap_range_noflush(addr,
>>> +					addr + (PAGE_SIZE << page_shift),
>>> +					__pa(page_address(pages[i])), prot,
>>> +					PAGE_SHIFT + page_shift);
>>> +		if (err)
>>> +			return err;
>>> +
>>> +		addr += PAGE_SIZE << page_shift;
>>> +	}
>>> +	flush_cache_vmap(start, end);
>>> +
>>> +	return nr;
>>> +}
>>> +#else
>>> +static int vmap_hpages_range(unsigned long start, unsigned long end,
>>> +			   pgprot_t prot, struct page **pages,
>>> +			   unsigned int page_shift)
>>> +{
>>> +	BUG_ON(page_shift != PAGE_SIZE);
>>> +	return vmap_pages_range(start, end, prot, pages);
>>> +}
>>> +#endif
>>> +
>>> +
>>>  int is_vmalloc_or_module_addr(const void *x)
>>>  {
>>>  	/*
>>> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>>  {
>>>  	unsigned long addr = (unsigned long) vmalloc_addr;
>>>  	struct page *page = NULL;
>>> -	pgd_t *pgd = pgd_offset_k(addr);
>>> +	pgd_t *pgd;
>>>  	p4d_t *p4d;
>>>  	pud_t *pud;
>>>  	pmd_t *pmd;
>>> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>>  	 */
>>>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>>>  
>>> +	pgd = pgd_offset_k(addr);
>>>  	if (pgd_none(*pgd))
>>>  		return NULL;
>>> +
>>
>> Small nit. Stray line here.
>>
>> 'pgd' related changes here seem to be just cleanups and should not part of this patch.
> 
> Yeah I figure it doesn't matter to make small changes close by, but
> maybe that's more frowned upon now for git blame?

Right. But I guess it should be okay if you can make the vmalloc_to_page()
changes a separate patch. This patch, which adds a new feature, should
not have any cleanups IMHO.

> 
>>>  	p4d = p4d_offset(pgd, addr);
>>>  	if (p4d_none(*p4d))
>>>  		return NULL;
>>> -	pud = pud_offset(p4d, addr);
>>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>> +	if (p4d_large(*p4d))
>>> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
>>> +#endif
>>> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
>>> +		return NULL;
>>>  
>>> -	/*
>>> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
>>> -	 * identify huge mappings, which we may encounter on architectures
>>> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
>>> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
>>> -	 * not [unambiguously] associated with a struct page, so there is
>>> -	 * no correct value to return for them.
>>> -	 */
>>
>> What changed the situation so that we could return struct page for a huge
>> mapping now ?
> 
> For the PUD case? Nothing changed, per se; I just calculate the
> correct struct page now, so I may return it.

I was just curious what prevented this earlier (before this series). The
comment here and the commit message which added it made me wonder what
the reason was for not doing this earlier.

> 
>> AFAICT even after this patch, PUD/P4D level huge pages can only
>> be created with ioremap_page_range() not with vmalloc() which creates PMD
>> sized mappings only. Hence if it's okay to dereference struct page of a huge
>> mapping (not withstanding the comment here) it should be part of an earlier
>> patch fixing it first for existing ioremap_page_range() huge mappings.
> 
> Possibly yes, we can consider 029c54b095995 to be a band-aid for huge
> vmaps which is fixed properly by this change, in which case it could
> make sense to break this into its own patch.

On arm64 [pud|pmd]_bad() calls out huge mappings at PUD or PMD. I still wonder what
Ard (copied him now) meant by "not [unambiguously] associated with a struct page".
He also mentioned compound pages in the commit message. Anyway, fetching the struct
page this way makes sense unless I am missing something, but it should be part
of a separate patch.

pXd_page(*pXd) + ((addr & ~PXD_MASK) >> PAGE_SHIFT)
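
To spell that out with made-up numbers (PMD level, 4K pages, purely an
illustration):

	/* addr falls inside a 2MB PMD mapping:
	 *   addr & ~PMD_MASK      -> byte offset into the 2MB region, e.g. 0x34000
	 *   (...) >> PAGE_SHIFT   -> small-page index within it, 0x34
	 *   pmd_page(*pmd) + 0x34 -> struct page of that 4K page
	 */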

> 
>>
>>> -	WARN_ON_ONCE(pud_bad(*pud));
>>> -	if (pud_none(*pud) || pud_bad(*pud))
>>> +	pud = pud_offset(p4d, addr);
>>> +	if (pud_none(*pud))
>>> +		return NULL;
>>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>> +	if (pud_large(*pud))
>>> +		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
>>> +#endif
>>> +	if (WARN_ON_ONCE(pud_bad(*pud)))
>>>  		return NULL;
>>> +
>>>  	pmd = pmd_offset(pud, addr);
>>> -	WARN_ON_ONCE(pmd_bad(*pmd));
>>> -	if (pmd_none(*pmd) || pmd_bad(*pmd))
>>> +	if (pmd_none(*pmd))
>>> +		return NULL;
>>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>> +	if (pmd_large(*pmd))
>>> +		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
>>> +#endif
>>> +	if (WARN_ON_ONCE(pmd_bad(*pmd)))
>>>  		return NULL;
>>
>> At each page table level, we are checking in this order
>>
>> pXX_none() --> pXX_large() --> pXX_bad()
>>
>> Are not these alternative orders bit better
>>
>> pXX_bad() --> pXX_none() --> pXX_large()
>>
>> Or
>>
>> pXX_none() --> pXX_bad() --> pXX_large()
>>
>> Checking for pXX_bad() at the end does not make much sense.
> 
> Yeah the order tends to go none->bad. It's not 100% clear we can
> test bad before none (at least it goes against convention). But good
> point, it should be changed to your last sequence I think.

Sure.
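
For the pud level that ordering would read something like (sketch only;
given the arm64 caveat above where pud_bad() also flags huge entries, the
bad check would need to tolerate those):

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;
	if (pud_large(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);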

> 
>>
>>>  
>>>  	ptep = pte_offset_map(pmd, addr);
>>> @@ -502,6 +549,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>>  	if (pte_present(pte))
>>>  		page = pte_page(pte);
>>>  	pte_unmap(ptep);
>>> +
>>
>> Small nit. Stray line here.
> 
> I don't mind adding some lines here and there, like here. It is an
> unrelated (alleged-)cleanup though.
> 
>>
>>>  	return page;
>>>  }
>>>  EXPORT_SYMBOL(vmalloc_to_page);
>>> @@ -2185,8 +2233,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>>>  		return NULL;
>>>  
>>>  	if (flags & VM_IOREMAP)
>>> -		align = 1ul << clamp_t(int, get_count_order_long(size),
>>> -				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
>>> +		align = max(align,
>>> +				1ul << clamp_t(int, get_count_order_long(size),
>>> +				       PAGE_SHIFT, IOREMAP_MAX_ORDER));
>>>  
>>>  	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
>>>  	if (unlikely(!area))
>>> @@ -2398,7 +2447,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
>>>  			struct page *page = area->pages[i];
>>>  
>>>  			BUG_ON(!page);
>>> -			__free_pages(page, 0);
>>> +			__free_pages(page, area->page_shift);
>>
>> 'area->page_shift' turns out to be an effective page order. I think the name here is a bit
>> misleading. s/page_shift/page_order/ or nr_pages would be better IMHO. page_shift is
>> not an actual shift (1UL << area->page_shift does not give a size) nor does it sound like a page 'order'.
> 
> Yeah good point.
> 
>>>  		}
>>>  
>>>  		kvfree(area->pages);
>>> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>>  				 pgprot_t prot, int node)
>>>  {
>>>  	struct page **pages;
>>> +	unsigned long addr = (unsigned long)area->addr;
>>> +	unsigned long size = get_vm_area_size(area);
>>> +	unsigned int page_shift = area->page_shift;
>>> +	unsigned int shift = page_shift + PAGE_SHIFT;
>>>  	unsigned int nr_pages, array_size, i;
>>>  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>>>  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>>>  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
>>> -					0 :
>>> -					__GFP_HIGHMEM;
>>> +					0 : __GFP_HIGHMEM;
>>>  
>>> -	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
>>> +	nr_pages = size >> shift;
>>>  	array_size = (nr_pages * sizeof(struct page *));
>>>  
>>>  	area->nr_pages = nr_pages;
>>> @@ -2569,10 +2621,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>>  	for (i = 0; i < area->nr_pages; i++) {
>>>  		struct page *page;
>>>  
>>> -		if (node == NUMA_NO_NODE)
>>> -			page = alloc_page(alloc_mask|highmem_mask);
>>> -		else
>>> -			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
>>> +		page = alloc_pages_node(node,
>>> +				alloc_mask|highmem_mask, page_shift);
>>
>> alloc_mask remains the exact same like before even for these high order pages.
> 
> Is there a problem there? I don't see.

There is no problem. I just noted it.

> 
>>>  
>>>  		if (unlikely(!page)) {
>>>  			/* Successfully allocated i pages, free them in __vunmap() */
>>> @@ -2584,8 +2634,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>>  			cond_resched();
>>>  	}
>>>  
>>> -	if (map_vm_area(area, prot, pages))
>>> +	if (vmap_hpages_range(addr, addr + size, prot, pages, page_shift) < 0)
>>>  		goto fail;
>>> +
>>>  	return area->addr;
>>>  
>>>  fail:
>>> @@ -2619,22 +2670,39 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
>>>  			pgprot_t prot, unsigned long vm_flags, int node,
>>>  			const void *caller)
>>>  {
>>> -	struct vm_struct *area;
>>> +	struct vm_struct *area = NULL;
>>>  	void *addr;
>>>  	unsigned long real_size = size;
>>> +	unsigned long real_align = align;
>>> +	unsigned int shift = PAGE_SHIFT;
>>>  
>>>  	size = PAGE_ALIGN(size);
>>>  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
>>>  		goto fail;
>>>  
>>> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
>>> +		unsigned long size_per_node;
>>> +
>>> +		size_per_node = size;
>>> +		if (node == NUMA_NO_NODE)
>>> +			size_per_node /= num_online_nodes();
>>> +		if (size_per_node >= PMD_SIZE)
>>> +			shift = PMD_SHIFT;
>>
>> There are two problems here.
>>
>> 1. Should not size_per_node be aligned with PMD_SIZE to avoid wasting memory later
>>    because of alignment upwards (making it worse for NUMA_NO_NODE)
> 
> I'm not sure what you mean, it's just a heuristic to check for node
> interleaving, and use small pages if large can not interleave well.
> 
>> 2. What about PUD_SIZE which is not considered here at all
> 
> Yeah, not doing PUD pages at all. It would be pretty trivial to add 
> after PMD is working, but would it actually get used anywhere?

But it should make this feature logically complete. Allocation attempts can start
at the right pgtable level depending on the requested size. I don't think it will
have any noticeable performance impact.

> 
>> 3. We should have similar knobs like ioremap controlling different size huge mappings
>>
>> static int __read_mostly ioremap_p4d_capable;
>> static int __read_mostly ioremap_pud_capable;
>> static int __read_mostly ioremap_pmd_capable;
>> static int __read_mostly ioremap_huge_disabled;
>>
>> while also giving arch a chance to weigh in through similar overrides like these.
>>
>> arch_ioremap_[pud|pmd]_supported() ---> probably unifying it for vmalloc() 
> 
> I'm not sure if that makes sense. IO mappings I could see maybe having
> some quirks or bugs or support issues. Cacheable mappings should be the
> "base" for supporting larger pages, if the platform has trouble with
> those then it should just avoid the feature I think.
> 
> Or is there a good reason to add the option? I don't mind, I just want
> to avoid proliferation.

We might need it at least for the potential problem on arm64 discussed on the
other thread.


* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-11  6:59       ` Anshuman Khandual
@ 2019-06-19  3:29         ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-19  3:29 UTC (permalink / raw)
  To: Anshuman Khandual, linux-mm
  Cc: linuxppc-dev, linux-arm-kernel, Ard Biesheuvel

Anshuman Khandual's on June 11, 2019 4:59 pm:
> On 06/11/2019 05:46 AM, Nicholas Piggin wrote:
>> Anshuman Khandual's on June 10, 2019 6:53 pm:
>>> On 06/10/2019 10:08 AM, Nicholas Piggin wrote:
>>>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>>>> allocate huge pages and map them.
>>>
>>> IIUC that extends HAVE_ARCH_HUGE_VMAP from ioremap to vmalloc.
>>>
>>>>
>>>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>>>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>>>> (performance is in the noise, under 1% difference, page tables are likely
>>>> to be well cached for this workload). Similar numbers are seen on POWER9.
>>>
>>> Sure will try this on arm64.
>>>
>>>>
>>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>>> ---
>>>>  include/asm-generic/4level-fixup.h |   1 +
>>>>  include/asm-generic/5level-fixup.h |   1 +
>>>>  include/linux/vmalloc.h            |   1 +
>>>>  mm/vmalloc.c                       | 132 +++++++++++++++++++++++------
>>>>  4 files changed, 107 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h
>>>> index e3667c9a33a5..3cc65a4dd093 100644
>>>> --- a/include/asm-generic/4level-fixup.h
>>>> +++ b/include/asm-generic/4level-fixup.h
>>>> @@ -20,6 +20,7 @@
>>>>  #define pud_none(pud)			0
>>>>  #define pud_bad(pud)			0
>>>>  #define pud_present(pud)		1
>>>> +#define pud_large(pud)			0
>>>>  #define pud_ERROR(pud)			do { } while (0)
>>>>  #define pud_clear(pud)			pgd_clear(pud)
>>>>  #define pud_val(pud)			pgd_val(pud)
>>>> diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h
>>>> index bb6cb347018c..c4377db09a4f 100644
>>>> --- a/include/asm-generic/5level-fixup.h
>>>> +++ b/include/asm-generic/5level-fixup.h
>>>> @@ -22,6 +22,7 @@
>>>>  #define p4d_none(p4d)			0
>>>>  #define p4d_bad(p4d)			0
>>>>  #define p4d_present(p4d)		1
>>>> +#define p4d_large(p4d)			0
>>>>  #define p4d_ERROR(p4d)			do { } while (0)
>>>>  #define p4d_clear(p4d)			pgd_clear(p4d)
>>>>  #define p4d_val(p4d)			pgd_val(p4d)
>>>
>>> Both of these are required from vmalloc_to_page() which as per a later comment
>>> should be part of a prerequisite patch before this series.
>> 
>> I'm not sure what you mean. This patch is where they get used.
> 
> In case you move out vmalloc_to_page() changes to a separate patch.

Sorry for the delay in replying.

I'll split this and see if we might be able to get it into the next
merge window. I can have another try at the huge vmalloc patch
after that.

> 
>> 
>> Possibly I could split this and the vmalloc_to_page change out. I'll
>> consider it.
>> 
>>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>>> index 812bea5866d6..4c92dc608928 100644
>>>> --- a/include/linux/vmalloc.h
>>>> +++ b/include/linux/vmalloc.h
>>>> @@ -42,6 +42,7 @@ struct vm_struct {
>>>>  	unsigned long		size;
>>>>  	unsigned long		flags;
>>>>  	struct page		**pages;
>>>> +	unsigned int		page_shift;
>>>
>>> So the entire vm_struct will be mapped with a single page_shift. It cannot have
>>> mix and match mappings with PAGE_SIZE, PMD_SIZE, PUD_SIZE etc in case the
>>> allocation fails for larger ones and has to fall back, or for other reasons.
>> 
>> For now, yes. I have a bit of follow up work to improve that and make
>> it able to fall back, but it's a bit more churn and not a significant
>> benefit just yet because there are not a lot of very large vmallocs
>> (except the early hashes which can be satisfied with large allocs).
> 
> Right, but it will make this new feature complete like ioremap, which logically
> supports mappings up to the P4D level (though AFAICT that is not used). If there
> are no actual vmalloc requests that large it is fine. Allocation attempts will
> start from the page table level appropriate to the requested size. It is better
> to have PUD/P4D considerations now rather than trying to retrofit them later.

I've considered them, which is why e.g., a shift gets passed around 
rather than a bool for small/large.

I won't over complicate this page array data structure for something
that may never be supported though. I think we may actually be better
moving away from it in the vmalloc code and just referencing pages
from the page tables, so it's just something we can cross when we get
to it.
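
As a rough sketch of what I mean by that (hypothetical, not what this
patch does; it still frees from area->pages):

	/* Free path that trusts the page tables rather than an
	 * area->pages array; vmalloc_to_page() can already resolve
	 * addresses inside huge mappings after the earlier change.
	 * 'start', 'end' and 'page_order' are assumed here. */
	for (addr = start; addr < end; addr += PAGE_SIZE << page_order)
		__free_pages(vmalloc_to_page((void *)addr), page_order);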

>>> Also should not we check for the alignment of the range [start...end] with
>>> respect to (1UL << [PAGE_SHIFT + page_shift]).
>> 
>> The caller should if it specifies large page. Could check and -EINVAL
>> for incorrect alignment.
> 
> That might be a good check here.

Will add.

>>>> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>>>  	 */
>>>>  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>>>>  
>>>> +	pgd = pgd_offset_k(addr);
>>>>  	if (pgd_none(*pgd))
>>>>  		return NULL;
>>>> +
>>>
>>> Small nit. Stray line here.
>>>
>>> 'pgd' related changes here seem to be just cleanups and should not part of this patch.
>> 
>> Yeah I figure it doesn't matter to make small changes close by, but
>> maybe that's more frowned upon now for git blame?
> 
> Right. But I guess it should be okay if you can make the vmalloc_to_page()
> changes a separate patch. This patch, which adds a new feature, should
> not have any cleanups IMHO.

Well... that alone would be a new feature too. Or it could be considered
a bug fix, which makes it even more important not to include
superfluous changes.

Is there a real prohibition on small, slightly peripheral tidying
like this? I don't think I'd bother sending a lone patch just to
change a couple of lines of spacing.

>>>>  	p4d = p4d_offset(pgd, addr);
>>>>  	if (p4d_none(*p4d))
>>>>  		return NULL;
>>>> -	pud = pud_offset(p4d, addr);
>>>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
>>>> +	if (p4d_large(*p4d))
>>>> +		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
>>>> +#endif
>>>> +	if (WARN_ON_ONCE(p4d_bad(*p4d)))
>>>> +		return NULL;
>>>>  
>>>> -	/*
>>>> -	 * Don't dereference bad PUD or PMD (below) entries. This will also
>>>> -	 * identify huge mappings, which we may encounter on architectures
>>>> -	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
>>>> -	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
>>>> -	 * not [unambiguously] associated with a struct page, so there is
>>>> -	 * no correct value to return for them.
>>>> -	 */
>>>
>>> What changed the situation so that we could return struct page for a huge
>>> mapping now ?
>> 
>> For the PUD case? Nothing changed, per se; I just calculate the
>> correct struct page now, so I may return it.
> 
> I was just curious what prevented this earlier (before this series). The
> comment here and the commit message which added it made me wonder what
> the reason was for not doing this earlier.

Just not implemented I guess.

>>> AFAICT even after this patch, PUD/P4D level huge pages can only
>>> be created with ioremap_page_range() not with vmalloc() which creates PMD
>>> sized mappings only. Hence if it's okay to dereference struct page of a huge
>>> mapping (not withstanding the comment here) it should be part of an earlier
>>> patch fixing it first for existing ioremap_page_range() huge mappings.
>> 
>> Possibly yes, we can consider 029c54b095995 to be a band-aid for huge
>> vmaps which is fixed properly by this change, in which case it could
>> make sense to break this into its own patch.
> 
> On arm64 [pud|pmd]_bad() calls out huge mappings at PUD or PMD. I still wonder what
> Ard (copied him now) meant by "not [unambiguously] associated with a struct page".
> He also mentioned compound pages in the commit message. Anyway, fetching the struct
> page this way makes sense unless I am missing something, but it should be part
> of a separate patch.

I do somewhat see the intention of the commit message, but if we
consider the vmap/iomap layer's choice of page size as transparent to
the caller, and the vmalloc_to_page API has always been specifically
interested in the PAGE_SIZE struct page, then my patch is fine and
introduces no problems. It restores the API functionality to be the
same regardless of whether small or large pages were used for the
actual mapping.
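
Put differently, from a caller's point of view (illustrative snippet, not
taken from the patch):

	void *buf = vmalloc(4UL << 20);	/* may be PMD-mapped with this series */
	struct page *pg = vmalloc_to_page((char *)buf + 3 * PAGE_SIZE);
	/* pg is the struct page backing the 4th small page of buf,
	 * whether the area was mapped with 4K PTEs or 2MB PMDs. */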

>>>> +	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
>>>> +		unsigned long size_per_node;
>>>> +
>>>> +		size_per_node = size;
>>>> +		if (node == NUMA_NO_NODE)
>>>> +			size_per_node /= num_online_nodes();
>>>> +		if (size_per_node >= PMD_SIZE)
>>>> +			shift = PMD_SHIFT;
>>>
>>> There are two problems here.
>>>
>>> 1. Should not size_per_node be aligned with PMD_SIZE to avoid wasting memory later
>>>    because of alignment upwards (making it worse for NUMA_NO_NODE)
>> 
>> I'm not sure what you mean, it's just a heuristic to check for node
>> interleaving, and use small pages if large can not interleave well.
>> 
>>> 2. What about PUD_SIZE which is not considered here at all
>> 
>> Yeah, not doing PUD pages at all. It would be pretty trivial to add 
>> after PMD is working, but would it actually get used anywhere?
> 
> But it should make this feature logically complete. Allocation attempts can start
> at the right pgtable level depending on the requested size. I don't think it will
> have any noticeable performance impact.

I disagree that's necessary or desirable for PMD support here. Sure
an arch might have PUD size within MAX_ORDER and implement that, but
it's just something that can be implemented when the time comes.

There's nothing about this patch that hinders it being extended to PUD
level; I just won't add code that's not used and that I can't test.

Thanks for the detailed review, I appreciate it.

Thanks,
Nick


* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-11  6:17       ` Anshuman Khandual
@ 2019-06-19  3:33         ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-19  3:33 UTC (permalink / raw)
  To: Anshuman Khandual, Mark Rutland; +Cc: linux-mm, linuxppc-dev, linux-arm-kernel

Anshuman Khandual's on June 11, 2019 4:17 pm:
> 
> 
> On 06/10/2019 08:14 PM, Nicholas Piggin wrote:
>> Mark Rutland's on June 11, 2019 12:10 am:
>>> Hi,
>>>
>>> On Mon, Jun 10, 2019 at 02:38:38PM +1000, Nicholas Piggin wrote:
>>>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>>>> allocate huge pages and map them
>>>>
>>>> This brings dTLB misses for linux kernel tree `git diff` from 45,000 to
>>>> 8,000 on a Kaby Lake KVM guest with 8MB dentry hash and mitigations=off
>>>> (performance is in the noise, under 1% difference, page tables are likely
>>>> to be well cached for this workload). Similar numbers are seen on POWER9.
>>>
>>> Do you happen to know which vmalloc mappings these get used for in the
>>> above case? Where do we see vmalloc mappings that large?
>> 
>> Large module vmalloc could be subject to huge mappings.
>> 
>>> I'm worried as to how this would interact with the set_memory_*()
>>> functions, as on arm64 those can only operate on page-granular mappings.
>>> Those may need fixing up to handle huge mappings; certainly if the above
>>> is all for modules.
>> 
>> Good point, that looks like it would break on arm64 at least. I'll
>> work on it. We may have to make this opt in beyond HUGE_VMAP.
> 
> This is another reason we might need to have an arch opt-ins like the one
> I mentioned before.
> 

Let's try to get the precursor stuff like page table functions and
vmalloc_to_page in this merge window, and then concentrate on the
huge vmalloc support issues after that.

Christophe points out that powerpc is likely to have a similar
problem, which I didn't realise, so I'll rethink it.

Thanks,
Nick


* Re: [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings
  2019-06-11  5:39   ` Christophe Leroy
@ 2019-06-19  3:39     ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-19  3:39 UTC (permalink / raw)
  To: Christophe Leroy, linux-mm, Russell Currey; +Cc: linuxppc-dev, linux-arm-kernel

Christophe Leroy's on June 11, 2019 3:39 pm:
> 
> 
> Le 10/06/2019 à 06:38, Nicholas Piggin a écrit :
>> For platforms that define HAVE_ARCH_HUGE_VMAP, have vmap allow vmalloc to
>> allocate huge pages and map them
> 
> Will this be compatible with Russell's series 
> https://patchwork.ozlabs.org/patch/1099857/ for the implementation of 
> STRICT_MODULE_RWX ?
> I see that apply_to_page_range() has things like BUG_ON(pud_huge(*pud));
> 
> Might also be an issue for arm64 as I think Russell's implementation 
> comes from there.

Yeah you're right (and correct about arm64 problem). I'll fix that up.

>> +static int vmap_hpages_range(unsigned long start, unsigned long end,
>> +			   pgprot_t prot, struct page **pages,
>> +			   unsigned int page_shift)
>> +{
>> +	BUG_ON(page_shift != PAGE_SIZE);
> 
> Do we really need a BUG_ON() there ? What happens if this condition is 
> true ?

If it's true then vmap_pages_range would die horribly reading off the
end of the pages array thinking they are struct page pointers.

I guess it could return failure.
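
Something along these lines for the !HUGE_VMAP stub, perhaps (a sketch;
note the posted check compares page_shift against PAGE_SIZE where it
presumably meant 0, since page_shift here is relative to PAGE_SHIFT):

	static int vmap_hpages_range(unsigned long start, unsigned long end,
				     pgprot_t prot, struct page **pages,
				     unsigned int page_shift)
	{
		if (WARN_ON_ONCE(page_shift))
			return -EINVAL;
		return vmap_pages_range(start, end, prot, pages);
	}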

>> +	return vmap_pages_range(start, end, prot, pages);
>> +}
>> +#endif
>> +
>> +
>>   int is_vmalloc_or_module_addr(const void *x)
>>   {
>>   	/*
>> @@ -462,7 +498,7 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>   {
>>   	unsigned long addr = (unsigned long) vmalloc_addr;
>>   	struct page *page = NULL;
>> -	pgd_t *pgd = pgd_offset_k(addr);
>> +	pgd_t *pgd;
>>   	p4d_t *p4d;
>>   	pud_t *pud;
>>   	pmd_t *pmd;
>> @@ -474,27 +510,38 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
>>   	 */
>>   	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
>>   
>> +	pgd = pgd_offset_k(addr);
>>   	if (pgd_none(*pgd))
>>   		return NULL;
>> +
>>   	p4d = p4d_offset(pgd, addr);
>>   	if (p4d_none(*p4d))
>>   		return NULL;
>> -	pud = pud_offset(p4d, addr);
>> +#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
> 
> Do we really need that ifdef ? Won't p4d_large() always return 0 when is 
> not set ?
> Otherwise, could we use IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP) instead ?
> 
> Same several places below.

Possibly some of them are not defined without HAVE_ARCH_HUGE_VMAP
I think. I'll try to apply this pattern as much as possible.

>> @@ -2541,14 +2590,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
>>   				 pgprot_t prot, int node)
>>   {
>>   	struct page **pages;
>> +	unsigned long addr = (unsigned long)area->addr;
>> +	unsigned long size = get_vm_area_size(area);
>> +	unsigned int page_shift = area->page_shift;
>> +	unsigned int shift = page_shift + PAGE_SHIFT;
>>   	unsigned int nr_pages, array_size, i;
>>   	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
>>   	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
>>   	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
>> -					0 :
>> -					__GFP_HIGHMEM;
>> +					0 : __GFP_HIGHMEM;
> 
> This patch is already quite big, shouldn't this kind of unrelated 
> cleanups be in another patch ?

Okay, 2 against 1. I'll minimise changes like this.

Thanks,
Nick



* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
  2019-06-11  5:24 ` Christophe Leroy
@ 2019-06-19  3:43   ` Nicholas Piggin
  2019-06-19 13:18     ` Christophe Leroy
  0 siblings, 1 reply; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-19  3:43 UTC (permalink / raw)
  To: Christophe Leroy, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

Christophe Leroy's on June 11, 2019 3:24 pm:
> 
> 
> Le 10/06/2019 à 06:38, Nicholas Piggin a écrit :
>> ioremap_page_range is a generic function to create a kernel virtual
>> mapping, move it to mm/vmalloc.c and rename it vmap_range.
>> 
>> For clarity with this move, also:
>> - Rename vunmap_page_range (vmap_range's inverse) to vunmap_range.
>> - Rename vmap_page_range (which takes a page array) to vmap_pages.
> 
> Maybe it would be easier to follow the change if the renaming was
> done in a separate patch from the move.

I could do that.

>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>> 
>> Fixed up the arm64 compile errors, fixed a few bugs, and tidied
>> things up a bit more.
>> 
>> Have tested powerpc and x86 but not arm64, would appreciate a review
>> and test of the arm64 patch if possible.
>> 
>>   include/linux/vmalloc.h |   3 +
>>   lib/ioremap.c           | 173 +++---------------------------
>>   mm/vmalloc.c            | 228 ++++++++++++++++++++++++++++++++++++----
>>   3 files changed, 229 insertions(+), 175 deletions(-)
>> 
>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>> index 51e131245379..812bea5866d6 100644
>> --- a/include/linux/vmalloc.h
>> +++ b/include/linux/vmalloc.h
>> @@ -147,6 +147,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
>>   extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
>>   			struct page **pages);
>>   #ifdef CONFIG_MMU
>> +extern int vmap_range(unsigned long addr,
>> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
>> +		       unsigned int max_page_shift);
> 
> Drop extern keyword here.

I don't know if I was going crazy but at one point I was getting
duplicate symbol errors that were fixed by adding extern somewhere.
Maybe sleep deprivation. However...

> As checkpatch tells you, 'CHECK:AVOID_EXTERNS: extern prototypes should 
> be avoided in .h files'

I prefer to follow existing style in surrounding code at the expense
of some checkpatch warnings. If somebody later wants to "fix" it
that's fine.

Thanks,
Nick



* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
  2019-06-19  3:43   ` Nicholas Piggin
@ 2019-06-19 13:18     ` Christophe Leroy
  2019-06-22  9:42       ` Nicholas Piggin
  0 siblings, 1 reply; 24+ messages in thread
From: Christophe Leroy @ 2019-06-19 13:18 UTC (permalink / raw)
  To: Nicholas Piggin, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel



Le 19/06/2019 à 05:43, Nicholas Piggin a écrit :
> Christophe Leroy's on June 11, 2019 3:24 pm:
>>
>>
>> Le 10/06/2019 à 06:38, Nicholas Piggin a écrit :

[snip]

>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>> index 51e131245379..812bea5866d6 100644
>>> --- a/include/linux/vmalloc.h
>>> +++ b/include/linux/vmalloc.h
>>> @@ -147,6 +147,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
>>>    extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
>>>    			struct page **pages);
>>>    #ifdef CONFIG_MMU
>>> +extern int vmap_range(unsigned long addr,
>>> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
>>> +		       unsigned int max_page_shift);
>>
>> Drop extern keyword here.
> 
> I don't know if I was going crazy but at one point I was getting
> duplicate symbol errors that were fixed by adding extern somewhere.

probably not on a function name ...

> Maybe sleep deprivation. However...
> 
>> As checkpatch tells you, 'CHECK:AVOID_EXTERNS: extern prototypes should
>> be avoided in .h files'
> 
> I prefer to follow existing style in surrounding code at the expense
> of some checkpatch warnings. If somebody later wants to "fix" it
> that's fine.

I don't think it's fine to 'fix' later things that could have been done right
from the beginning. 'Cosmetic only' fixes never happen because they are a
nightmare for backports, and a shame for 'git blame'.

In some patches, you add cleanups to make the code look nicer, and here
you have the opportunity to make the code nice from the beginning, yet you
prefer repeating the errors done in the past ? You're surprising me.

Christophe

> 
> Thanks,
> Nick
> 


* Re: [PATCH 1/4] mm: Move ioremap page table mapping function to mm/
  2019-06-19 13:18     ` Christophe Leroy
@ 2019-06-22  9:42       ` Nicholas Piggin
  0 siblings, 0 replies; 24+ messages in thread
From: Nicholas Piggin @ 2019-06-22  9:42 UTC (permalink / raw)
  To: Christophe Leroy, linux-mm; +Cc: linuxppc-dev, linux-arm-kernel

Christophe Leroy's on June 19, 2019 11:18 pm:
> 
> 
> Le 19/06/2019 à 05:43, Nicholas Piggin a écrit :
>> Christophe Leroy's on June 11, 2019 3:24 pm:
>>>
>>>
>>> Le 10/06/2019 à 06:38, Nicholas Piggin a écrit :
> 
> [snip]
> 
>>>> diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
>>>> index 51e131245379..812bea5866d6 100644
>>>> --- a/include/linux/vmalloc.h
>>>> +++ b/include/linux/vmalloc.h
>>>> @@ -147,6 +147,9 @@ extern struct vm_struct *find_vm_area(const void *addr);
>>>>    extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
>>>>    			struct page **pages);
>>>>    #ifdef CONFIG_MMU
>>>> +extern int vmap_range(unsigned long addr,
>>>> +		       unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
>>>> +		       unsigned int max_page_shift);
>>>
>>> Drop extern keyword here.
>> 
>> I don't know if I was going crazy but at one point I was getting
>> duplicate symbol errors that were fixed by adding extern somewhere.
> 
> probably not on a function name ...

I know it sounds crazy :P

>>> As checkpatch tells you, 'CHECK:AVOID_EXTERNS: extern prototypes should
>>> be avoided in .h files'
>> 
>> I prefer to follow existing style in surrounding code at the expense
>> of some checkpatch warnings. If somebody later wants to "fix" it
>> that's fine.
> 
> I don't think it's fine to 'fix' later things that could have been done right
> from the beginning. 'Cosmetic only' fixes never happen because they are a
> nightmare for backports, and a shame for 'git blame'.
> 
> In some patches, you add cleanups to make the code look nicer, and here
> you have the opportunity to make the code nice from the beginning, yet you
> prefer repeating the errors done in the past ? You're surprising me.

Well I never claimed to be consistent. I actually don't mind the
extern keyword so it's probably just my personal preference that
makes me notice something nearby. I have dropped those "cleanup"
changes though, so there.

Thanks,
Nick



Thread overview: 24+ messages
2019-06-10  4:38 [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Nicholas Piggin
2019-06-10  4:38 ` [PATCH 2/4] arm64: support huge vmap vmalloc Nicholas Piggin
2019-06-10  5:47   ` Anshuman Khandual
2019-06-10  6:14     ` Nicholas Piggin
2019-06-10  4:38 ` [PATCH 3/4] powerpc/64s/radix: " Nicholas Piggin
2019-06-10  4:38 ` [PATCH 4/4] mm/vmalloc: Hugepage vmalloc mappings Nicholas Piggin
2019-06-10  5:49   ` Nicholas Piggin
2019-06-10  8:08     ` Satheesh Rajendran
2019-06-10  8:53   ` Anshuman Khandual
2019-06-11  0:16     ` Nicholas Piggin
2019-06-11  6:59       ` Anshuman Khandual
2019-06-19  3:29         ` Nicholas Piggin
2019-06-10 14:10   ` Mark Rutland
2019-06-10 14:44     ` Nicholas Piggin
2019-06-11  6:17       ` Anshuman Khandual
2019-06-19  3:33         ` Nicholas Piggin
2019-06-11  5:39   ` Christophe Leroy
2019-06-19  3:39     ` Nicholas Piggin
2019-06-10  5:42 ` [PATCH 1/4] mm: Move ioremap page table mapping function to mm/ Anshuman Khandual
2019-06-10  6:21   ` Nicholas Piggin
2019-06-11  5:24 ` Christophe Leroy
2019-06-19  3:43   ` Nicholas Piggin
2019-06-19 13:18     ` Christophe Leroy
2019-06-22  9:42       ` Nicholas Piggin
