* [PATCH] arm64: mm: fix linear mapping mem access performance degradation
@ 2022-06-26 11:10 Guanghui Feng
  2022-06-27  6:34 ` Mike Rapoport
  2022-06-28  1:40 ` Leizhen (ThunderTown)
  0 siblings, 2 replies; 15+ messages in thread
From: Guanghui Feng @ 2022-06-26 11:10 UTC (permalink / raw)
  To: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, chenzhou10, anshuman.khandual,
	linux-arm-kernel, linux-kernel, rppt, geert+renesas, ardb,
	linux-mm

arm64 can build 2M/1G block/section mappings for the linear map. But when a
DMA/DMA32 zone is in use (crashkernel enabled, rodata full disabled, kfence
disabled), the linear map falls back to non-block/section (page) mappings,
because the crashkernel region may later have to be shrunk at page
granularity. This degrades performance for large contiguous memory accesses
in the kernel (memcpy/memmove, etc.).

There have been many related changes and discussions:
commit 031495635b46
commit 1a8e1cef7603
commit 8424ecdde7df
commit 0a30c53573b0
commit 2687275a5843

This patch lets the linear map keep block/section mappings when a
crashkernel is configured. First, build block/section mappings (normally 2M
or 1G) for all available memory and reserve the crashkernel memory as
before. Then walk the page table and split the block/section mappings into
page mappings (normally 4K) [[[only]]] for the crashkernel range. The linear
map therefore uses block/section mappings as much as possible, which reduces
dTLB misses noticeably and speeds up memory access by about 10-20%.
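
With the patch applied, the linear map ends up roughly like this
(illustrative layout only, assuming a single crashkernel reservation):

  /*
   * [ DRAM start       .. crashk_res.start )  2M/1G block/cont mappings
   * [ crashk_res.start .. crashk_res.end   ]  4K page mappings
   *                                           (NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS)
   * ( crashk_res.end   .. DRAM end         ]  2M/1G block/cont mappings
   */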

I have tested this with pft (Page Fault Test) and fio and observed a large
performance improvement.

For the fio test:
1. Prepare a ramdisk:
  modprobe -r brd
  modprobe brd rd_nr=1 rd_size=67108864
  dmsetup remove_all
  wipefs -a --force /dev/ram0
  mkfs -t ext4 -E lazy_itable_init=0,lazy_journal_init=0 -q -F /dev/ram0
  mkdir -p /fs/ram0
  mount -t ext4 /dev/ram0 /fs/ram0

2. Prepare the fio parameters in an x.fio file:
[global]
bs=4k
ioengine=psync
iodepth=128
size=32G
direct=1
invalidate=1
group_reporting
thread=1
rw=read
directory=/fs/ram0
numjobs=1

[task_0]
cpus_allowed=16
stonewall=1

3. Run the test case:
perf stat -e dTLB-load-misses fio x.fio

4. Comparison
------------------------------------------------------------
			without patch		with patch
fio READ		aggrb=1493.2MB/s	aggrb=1775.3MB/s
dTLB-load-misses	1,818,320,693		438,729,774
time elapsed(s)		70.500326434		62.877316408
user(s)			15.926332000		15.684721000
sys(s)			54.211939000		47.046165000

5. Conclusion
Using this patch reduces dTLB misses and improves performance greatly.

Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
---
 arch/arm64/include/asm/mmu.h |   1 +
 arch/arm64/mm/init.c         |   8 +-
 arch/arm64/mm/mmu.c          | 274 +++++++++++++++++++++++++++++++++++++++----
 3 files changed, 253 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 48f8466..df113cc 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
 extern void arm64_memblock_init(void);
 extern void paging_init(void);
 extern void bootmem_init(void);
+extern void mapping_crashkernel(void);
 extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
 extern void init_mem_pgprot(void);
 extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 339ee84..0e7540b 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -388,10 +388,6 @@ void __init arm64_memblock_init(void)
 	}
 
 	early_init_fdt_scan_reserved_mem();
-
-	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
-		reserve_crashkernel();
-
 	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
 }
 
@@ -438,8 +434,8 @@ void __init bootmem_init(void)
 	 * request_standard_resources() depends on crashkernel's memory being
 	 * reserved, so do it here.
 	 */
-	if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
-		reserve_crashkernel();
+	reserve_crashkernel();
+	mapping_crashkernel();
 
 	memblock_dump_all();
 }
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 626ec32..0672afd 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -498,6 +498,256 @@ static int __init enable_crash_mem_map(char *arg)
 }
 early_param("crashkernel", enable_crash_mem_map);
 
+#ifdef CONFIG_KEXEC_CORE
+static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
+{
+	phys_addr_t phys;
+	void *ptr;
+
+	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
+					 MEMBLOCK_ALLOC_NOLEAKTRACE);
+	if (!phys)
+		panic("Failed to allocate page table page\n");
+
+	ptr = (void *)__phys_to_virt(phys);
+	memset(ptr, 0, PAGE_SIZE);
+	return phys;
+}
+
+static void init_crashkernel_pte(pmd_t *pmdp, unsigned long addr,
+				 unsigned long end,
+				 phys_addr_t phys, pgprot_t prot)
+{
+	pte_t *ptep;
+	ptep = pte_offset_kernel(pmdp, addr);
+	do {
+		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
+		phys += PAGE_SIZE;
+	} while (ptep++, addr += PAGE_SIZE, addr != end);
+}
+
+static void alloc_crashkernel_cont_pte(pmd_t *pmdp, unsigned long addr,
+				       unsigned long end, phys_addr_t phys,
+				       pgprot_t prot,
+				       phys_addr_t (*pgtable_alloc)(int),
+				       int flags)
+{
+	unsigned long next;
+	pmd_t pmd = READ_ONCE(*pmdp);
+
+	BUG_ON(pmd_sect(pmd));
+	if (pmd_none(pmd)) {
+		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
+		phys_addr_t pte_phys;
+
+		if (flags & NO_EXEC_MAPPINGS)
+			pmdval |= PMD_TABLE_PXN;
+		BUG_ON(!pgtable_alloc);
+		pte_phys = pgtable_alloc(PAGE_SHIFT);
+		__pmd_populate(pmdp, pte_phys, pmdval);
+		pmd = READ_ONCE(*pmdp);
+	}
+	BUG_ON(pmd_bad(pmd));
+
+	do {
+		pgprot_t __prot = prot;
+		next = pte_cont_addr_end(addr, end);
+		init_crashkernel_pte(pmdp, addr, next, phys, __prot);
+		phys += next - addr;
+	} while (addr = next, addr != end);
+}
+
+static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
+				 unsigned long end, phys_addr_t phys,
+				 pgprot_t prot,
+				 phys_addr_t (*pgtable_alloc)(int), int flags)
+{
+	phys_addr_t map_offset;
+	unsigned long next;
+	pmd_t *pmdp;
+	pmdval_t pmdval;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
+			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
+			pmd_clear(pmdp);
+			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
+			if (flags & NO_EXEC_MAPPINGS)
+				pmdval |= PMD_TABLE_PXN;
+			__pmd_populate(pmdp, pte_phys, pmdval);
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+			map_offset = addr - (addr & PMD_MASK);
+			if (map_offset)
+			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
+						phys - map_offset, prot,
+						pgtable_alloc, flags);
+
+			if (next < (addr & PMD_MASK) + PMD_SIZE)
+			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
+						PUD_SIZE, next - addr + phys,
+						prot, pgtable_alloc, flags);
+		}
+		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
+					   pgtable_alloc, flags);
+		phys += next - addr;
+	} while (pmdp++, addr = next, addr != end);
+}
+
+static void alloc_crashkernel_cont_pmd(pud_t *pudp, unsigned long addr,
+				       unsigned long end, phys_addr_t phys,
+				       pgprot_t prot,
+				       phys_addr_t (*pgtable_alloc)(int),
+				       int flags)
+{
+	unsigned long next;
+	pud_t pud = READ_ONCE(*pudp);
+
+	/*
+	 * Check for initial section mappings in the pgd/pud.
+	 */
+	BUG_ON(pud_sect(pud));
+	if (pud_none(pud)) {
+		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
+		phys_addr_t pmd_phys;
+
+		if (flags & NO_EXEC_MAPPINGS)
+			pudval |= PUD_TABLE_PXN;
+		BUG_ON(!pgtable_alloc);
+		pmd_phys = pgtable_alloc(PMD_SHIFT);
+		__pud_populate(pudp, pmd_phys, pudval);
+		pud = READ_ONCE(*pudp);
+	}
+	BUG_ON(pud_bad(pud));
+
+	do {
+		pgprot_t __prot = prot;
+		next = pmd_cont_addr_end(addr, end);
+		init_crashkernel_pmd(pudp, addr, next, phys, __prot,
+				     pgtable_alloc, flags);
+		phys += next - addr;
+	} while (addr = next, addr != end);
+}
+
+static void alloc_crashkernel_pud(pgd_t *pgdp, unsigned long addr,
+				  unsigned long end, phys_addr_t phys,
+				  pgprot_t prot,
+				  phys_addr_t (*pgtable_alloc)(int),
+				  int flags)
+{
+	phys_addr_t map_offset;
+	unsigned long next;
+	pud_t *pudp;
+	p4d_t *p4dp = p4d_offset(pgdp, addr);
+	p4d_t p4d = READ_ONCE(*p4dp);
+	pudval_t pudval;
+
+	if (p4d_none(p4d)) {
+		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
+		phys_addr_t pud_phys;
+
+		if (flags & NO_EXEC_MAPPINGS)
+			p4dval |= P4D_TABLE_PXN;
+		BUG_ON(!pgtable_alloc);
+		pud_phys = pgtable_alloc(PUD_SHIFT);
+		__p4d_populate(p4dp, pud_phys, p4dval);
+		p4d = READ_ONCE(*p4dp);
+	}
+	BUG_ON(p4d_bad(p4d));
+
+	/*
+	 * No need for locking during early boot. And it doesn't work as
+	 * expected with KASLR enabled.
+	 */
+	if (system_state != SYSTEM_BOOTING)
+		mutex_lock(&fixmap_lock);
+	pudp = pud_offset(p4dp, addr);
+	do {
+		next = pud_addr_end(addr, end);
+		if (!pud_none(*pudp) && pud_sect(*pudp)) {
+			phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
+			pud_clear(pudp);
+
+			pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
+			if (flags & NO_EXEC_MAPPINGS)
+				pudval |= PUD_TABLE_PXN;
+			__pud_populate(pudp, pmd_phys, pudval);
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+			map_offset = addr - (addr & PUD_MASK);
+			if (map_offset)
+			    alloc_init_cont_pmd(pudp, addr & PUD_MASK, addr,
+						phys - map_offset, prot,
+						pgtable_alloc, flags);
+
+			if (next < (addr & PUD_MASK) + PUD_SIZE)
+			    alloc_init_cont_pmd(pudp, next, (addr & PUD_MASK) +
+						PUD_SIZE, next - addr + phys,
+						prot, pgtable_alloc, flags);
+		}
+		alloc_crashkernel_cont_pmd(pudp, addr, next, phys, prot,
+					   pgtable_alloc, flags);
+		phys += next - addr;
+	} while (pudp++, addr = next, addr != end);
+
+	if (system_state != SYSTEM_BOOTING)
+		mutex_unlock(&fixmap_lock);
+}
+
+static void __create_crashkernel_mapping(pgd_t *pgdir, phys_addr_t phys,
+					 unsigned long virt, phys_addr_t size,
+					 pgprot_t prot,
+					 phys_addr_t (*pgtable_alloc)(int),
+					 int flags)
+{
+	unsigned long addr, end, next;
+	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
+
+	/*
+	 * If the virtual and physical address don't have the same offset
+	 * within a page, we cannot map the region as the caller expects.
+	 */
+	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
+		return;
+
+	phys &= PAGE_MASK;
+	addr = virt & PAGE_MASK;
+	end = PAGE_ALIGN(virt + size);
+
+	do {
+		next = pgd_addr_end(addr, end);
+		alloc_crashkernel_pud(pgdp, addr, next, phys, prot,
+				      pgtable_alloc, flags);
+		phys += next - addr;
+	} while (pgdp++, addr = next, addr != end);
+}
+
+static void __init map_crashkernel(pgd_t *pgdp, phys_addr_t start,
+				   phys_addr_t end, pgprot_t prot, int flags)
+{
+	__create_crashkernel_mapping(pgdp, start, __phys_to_virt(start),
+				     end - start, prot,
+				     early_crashkernel_pgtable_alloc, flags);
+}
+
+void __init mapping_crashkernel(void)
+{
+	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
+	    return;
+
+	if (!crash_mem_map || !crashk_res.end)
+	    return;
+
+	map_crashkernel(swapper_pg_dir, crashk_res.start,
+			crashk_res.end + 1, PAGE_KERNEL,
+			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
+}
+#else
+void __init mapping_crashkernel(void) {}
+#endif
+
 static void __init map_mem(pgd_t *pgdp)
 {
 	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
@@ -527,17 +777,6 @@ static void __init map_mem(pgd_t *pgdp)
 	 */
 	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
 
-#ifdef CONFIG_KEXEC_CORE
-	if (crash_mem_map) {
-		if (IS_ENABLED(CONFIG_ZONE_DMA) ||
-		    IS_ENABLED(CONFIG_ZONE_DMA32))
-			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
-		else if (crashk_res.end)
-			memblock_mark_nomap(crashk_res.start,
-			    resource_size(&crashk_res));
-	}
-#endif
-
 	/* map all the memory banks */
 	for_each_mem_range(i, &start, &end) {
 		if (start >= end)
@@ -570,19 +809,6 @@ static void __init map_mem(pgd_t *pgdp)
 	 * in page granularity and put back unused memory to buddy system
 	 * through /sys/kernel/kexec_crash_size interface.
 	 */
-#ifdef CONFIG_KEXEC_CORE
-	if (crash_mem_map &&
-	    !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
-		if (crashk_res.end) {
-			__map_memblock(pgdp, crashk_res.start,
-				       crashk_res.end + 1,
-				       PAGE_KERNEL,
-				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
-			memblock_clear_nomap(crashk_res.start,
-					     resource_size(&crashk_res));
-		}
-	}
-#endif
 }
 
 void mark_rodata_ro(void)
-- 
1.8.3.1



* Re: [PATCH] arm64: mm: fix linear mapping mem access performance degradation
  2022-06-26 11:10 [PATCH] arm64: mm: fix linear mapping mem access performance degradation Guanghui Feng
@ 2022-06-27  6:34 ` Mike Rapoport
       [not found]   ` <4d18d303-aeed-0beb-a8a4-32893f2d438d@linux.alibaba.com>
  2022-06-28  1:40 ` Leizhen (ThunderTown)
  1 sibling, 1 reply; 15+ messages in thread
From: Mike Rapoport @ 2022-06-27  6:34 UTC (permalink / raw)
  To: Guanghui Feng
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, chenzhou10, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm

On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
> The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
> (enable crashkernel, disable rodata full, disable kfence), the mem_map will
> use non block/section mapping(for crashkernel requires to shrink the region
> in page granularity). But it will degrade performance when doing larging
> continuous mem access in kernel(memcpy/memmove, etc).
> 
> There are many changes and discussions:
> commit 031495635b46
> commit 1a8e1cef7603
> commit 8424ecdde7df
> commit 0a30c53573b0
> commit 2687275a5843

Please include a one-line summary of each commit. (See the section "Describe
your changes" in Documentation/process/submitting-patches.rst)

> This patch changes mem_map to use block/section mapping with crashkernel.
> Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
> mem_map, reserve crashkernel memory. And then walking pagetable to split
> block/section mapping to non block/section mapping(normally 4K) [[[only]]]
> for crashkernel mem.

This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
why it is OK to change the way the memory is mapped with
ZONE_DMA/ZONE_DMA32 enabled.

> So the linear mem mapping use block/section mapping
> as more as possible. We will reduce the cpu dTLB miss conspicuously, and
> accelerate mem access about 10-20% performance improvement.
> 
> I have tested it with pft(Page Fault Test) and fio, obtained great
> performace improvement.
> 
> For fio test:
> 1.prepare ramdisk
>   modprobe -r brd
>   modprobe brd rd_nr=1 rd_size=67108864
>   dmsetup remove_all
>   wipefs -a --force /dev/ram0
>   mkfs -t ext4 -E lazy_itable_init=0,lazy_journal_init=0 -q -F /dev/ram0
>   mkdir -p /fs/ram0
>   mount -t ext4 /dev/ram0 /fs/ram0
> 
> 2.prepare fio paremeter in x.fio file:
> [global]
> bs=4k
> ioengine=psync
> iodepth=128
> size=32G
> direct=1
> invalidate=1
> group_reporting
> thread=1
> rw=read
> directory=/fs/ram0
> numjobs=1
> 
> [task_0]
> cpus_allowed=16
> stonewall=1
> 
> 3.run testcase:
> perf stat -e dTLB-load-misses fio x.fio
> 
> 4.contrast
> ------------------------
> 			without patch		with patch
> fio READ		aggrb=1493.2MB/s	aggrb=1775.3MB/s
> dTLB-load-misses	1,818,320,693		438,729,774
> time elapsed(s)		70.500326434		62.877316408
> user(s)			15.926332000		15.684721000
> sys(s)			54.211939000		47.046165000
> 
> 5.conclusion
> Using this patch will reduce dTLB misses and improve performace greatly.
> 
> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> ---
>  arch/arm64/include/asm/mmu.h |   1 +
>  arch/arm64/mm/init.c         |   8 +-
>  arch/arm64/mm/mmu.c          | 274 +++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 253 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 48f8466..df113cc 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
>  extern void arm64_memblock_init(void);
>  extern void paging_init(void);
>  extern void bootmem_init(void);
> +extern void mapping_crashkernel(void);
>  extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
>  extern void init_mem_pgprot(void);
>  extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 339ee84..0e7540b 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -388,10 +388,6 @@ void __init arm64_memblock_init(void)
>  	}
>  
>  	early_init_fdt_scan_reserved_mem();
> -
> -	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
> -		reserve_crashkernel();
> -
>  	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
>  }
>  
> @@ -438,8 +434,8 @@ void __init bootmem_init(void)
>  	 * request_standard_resources() depends on crashkernel's memory being
>  	 * reserved, so do it here.
>  	 */
> -	if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
> -		reserve_crashkernel();
> +	reserve_crashkernel();
> +	mapping_crashkernel();
>  
>  	memblock_dump_all();
>  }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32..0672afd 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -498,6 +498,256 @@ static int __init enable_crash_mem_map(char *arg)
>  }
>  early_param("crashkernel", enable_crash_mem_map);
>  
> +#ifdef CONFIG_KEXEC_CORE
> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
> +{
> +	phys_addr_t phys;
> +	void *ptr;
> +
> +	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
> +					 MEMBLOCK_ALLOC_NOLEAKTRACE);
> +	if (!phys)
> +		panic("Failed to allocate page table page\n");
> +
> +	ptr = (void *)__phys_to_virt(phys);
> +	memset(ptr, 0, PAGE_SIZE);
> +	return phys;
> +}
> +
> +static void init_crashkernel_pte(pmd_t *pmdp, unsigned long addr,
> +				 unsigned long end,
> +				 phys_addr_t phys, pgprot_t prot)
> +{
> +	pte_t *ptep;
> +	ptep = pte_offset_kernel(pmdp, addr);
> +	do {
> +		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
> +		phys += PAGE_SIZE;
> +	} while (ptep++, addr += PAGE_SIZE, addr != end);
> +}
> +
> +static void alloc_crashkernel_cont_pte(pmd_t *pmdp, unsigned long addr,
> +				       unsigned long end, phys_addr_t phys,
> +				       pgprot_t prot,
> +				       phys_addr_t (*pgtable_alloc)(int),
> +				       int flags)
> +{
> +	unsigned long next;
> +	pmd_t pmd = READ_ONCE(*pmdp);
> +
> +	BUG_ON(pmd_sect(pmd));
> +	if (pmd_none(pmd)) {
> +		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> +		phys_addr_t pte_phys;
> +
> +		if (flags & NO_EXEC_MAPPINGS)
> +			pmdval |= PMD_TABLE_PXN;
> +		BUG_ON(!pgtable_alloc);
> +		pte_phys = pgtable_alloc(PAGE_SHIFT);
> +		__pmd_populate(pmdp, pte_phys, pmdval);
> +		pmd = READ_ONCE(*pmdp);
> +	}
> +	BUG_ON(pmd_bad(pmd));
> +
> +	do {
> +		pgprot_t __prot = prot;
> +		next = pte_cont_addr_end(addr, end);
> +		init_crashkernel_pte(pmdp, addr, next, phys, __prot);
> +		phys += next - addr;
> +	} while (addr = next, addr != end);
> +}
> +
> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
> +				 unsigned long end, phys_addr_t phys,
> +				 pgprot_t prot,
> +				 phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> +	phys_addr_t map_offset;
> +	unsigned long next;
> +	pmd_t *pmdp;
> +	pmdval_t pmdval;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> +			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> +			pmd_clear(pmdp);
> +			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> +			if (flags & NO_EXEC_MAPPINGS)
> +				pmdval |= PMD_TABLE_PXN;
> +			__pmd_populate(pmdp, pte_phys, pmdval);
> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> +			map_offset = addr - (addr & PMD_MASK);
> +			if (map_offset)
> +			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> +						phys - map_offset, prot,
> +						pgtable_alloc, flags);
> +
> +			if (next < (addr & PMD_MASK) + PMD_SIZE)
> +			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
> +						PUD_SIZE, next - addr + phys,
> +						prot, pgtable_alloc, flags);
> +		}
> +		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
> +					   pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pmdp++, addr = next, addr != end);
> +}
> +
> +static void alloc_crashkernel_cont_pmd(pud_t *pudp, unsigned long addr,
> +				       unsigned long end, phys_addr_t phys,
> +				       pgprot_t prot,
> +				       phys_addr_t (*pgtable_alloc)(int),
> +				       int flags)
> +{
> +	unsigned long next;
> +	pud_t pud = READ_ONCE(*pudp);
> +
> +	/*
> +	 * Check for initial section mappings in the pgd/pud.
> +	 */
> +	BUG_ON(pud_sect(pud));
> +	if (pud_none(pud)) {
> +		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
> +		phys_addr_t pmd_phys;
> +
> +		if (flags & NO_EXEC_MAPPINGS)
> +			pudval |= PUD_TABLE_PXN;
> +		BUG_ON(!pgtable_alloc);
> +		pmd_phys = pgtable_alloc(PMD_SHIFT);
> +		__pud_populate(pudp, pmd_phys, pudval);
> +		pud = READ_ONCE(*pudp);
> +	}
> +	BUG_ON(pud_bad(pud));
> +
> +	do {
> +		pgprot_t __prot = prot;
> +		next = pmd_cont_addr_end(addr, end);
> +		init_crashkernel_pmd(pudp, addr, next, phys, __prot,
> +				     pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (addr = next, addr != end);
> +}
> +
> +static void alloc_crashkernel_pud(pgd_t *pgdp, unsigned long addr,
> +				  unsigned long end, phys_addr_t phys,
> +				  pgprot_t prot,
> +				  phys_addr_t (*pgtable_alloc)(int),
> +				  int flags)
> +{
> +	phys_addr_t map_offset;
> +	unsigned long next;
> +	pud_t *pudp;
> +	p4d_t *p4dp = p4d_offset(pgdp, addr);
> +	p4d_t p4d = READ_ONCE(*p4dp);
> +	pudval_t pudval;
> +
> +	if (p4d_none(p4d)) {
> +		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
> +		phys_addr_t pud_phys;
> +
> +		if (flags & NO_EXEC_MAPPINGS)
> +			p4dval |= P4D_TABLE_PXN;
> +		BUG_ON(!pgtable_alloc);
> +		pud_phys = pgtable_alloc(PUD_SHIFT);
> +		__p4d_populate(p4dp, pud_phys, p4dval);
> +		p4d = READ_ONCE(*p4dp);
> +	}
> +	BUG_ON(p4d_bad(p4d));
> +
> +	/*
> +	 * No need for locking during early boot. And it doesn't work as
> +	 * expected with KASLR enabled.
> +	 */
> +	if (system_state != SYSTEM_BOOTING)
> +		mutex_lock(&fixmap_lock);
> +	pudp = pud_offset(p4dp, addr);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (!pud_none(*pudp) && pud_sect(*pudp)) {
> +			phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
> +			pud_clear(pudp);
> +
> +			pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
> +			if (flags & NO_EXEC_MAPPINGS)
> +				pudval |= PUD_TABLE_PXN;
> +			__pud_populate(pudp, pmd_phys, pudval);
> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> +			map_offset = addr - (addr & PUD_MASK);
> +			if (map_offset)
> +			    alloc_init_cont_pmd(pudp, addr & PUD_MASK, addr,
> +						phys - map_offset, prot,
> +						pgtable_alloc, flags);
> +
> +			if (next < (addr & PUD_MASK) + PUD_SIZE)
> +			    alloc_init_cont_pmd(pudp, next, (addr & PUD_MASK) +
> +						PUD_SIZE, next - addr + phys,
> +						prot, pgtable_alloc, flags);
> +		}
> +		alloc_crashkernel_cont_pmd(pudp, addr, next, phys, prot,
> +					   pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pudp++, addr = next, addr != end);
> +
> +	if (system_state != SYSTEM_BOOTING)
> +		mutex_unlock(&fixmap_lock);
> +}
> +
> +static void __create_crashkernel_mapping(pgd_t *pgdir, phys_addr_t phys,
> +					 unsigned long virt, phys_addr_t size,
> +					 pgprot_t prot,
> +					 phys_addr_t (*pgtable_alloc)(int),
> +					 int flags)
> +{
> +	unsigned long addr, end, next;
> +	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
> +
> +	/*
> +	 * If the virtual and physical address don't have the same offset
> +	 * within a page, we cannot map the region as the caller expects.
> +	 */
> +	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
> +		return;
> +
> +	phys &= PAGE_MASK;
> +	addr = virt & PAGE_MASK;
> +	end = PAGE_ALIGN(virt + size);
> +
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		alloc_crashkernel_pud(pgdp, addr, next, phys, prot,
> +				      pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pgdp++, addr = next, addr != end);
> +}
> +
> +static void __init map_crashkernel(pgd_t *pgdp, phys_addr_t start,
> +				   phys_addr_t end, pgprot_t prot, int flags)
> +{
> +	__create_crashkernel_mapping(pgdp, start, __phys_to_virt(start),
> +				     end - start, prot,
> +				     early_crashkernel_pgtable_alloc, flags);
> +}
> +
> +void __init mapping_crashkernel(void)
> +{
> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
> +	    return;
> +
> +	if (!crash_mem_map || !crashk_res.end)
> +	    return;
> +
> +	map_crashkernel(swapper_pg_dir, crashk_res.start,
> +			crashk_res.end + 1, PAGE_KERNEL,
> +			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> +}
> +#else
> +void __init mapping_crashkernel(void) {}
> +#endif
> +
>  static void __init map_mem(pgd_t *pgdp)
>  {
>  	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
> @@ -527,17 +777,6 @@ static void __init map_mem(pgd_t *pgdp)
>  	 */
>  	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>  
> -#ifdef CONFIG_KEXEC_CORE
> -	if (crash_mem_map) {
> -		if (IS_ENABLED(CONFIG_ZONE_DMA) ||
> -		    IS_ENABLED(CONFIG_ZONE_DMA32))
> -			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> -		else if (crashk_res.end)
> -			memblock_mark_nomap(crashk_res.start,
> -			    resource_size(&crashk_res));
> -	}
> -#endif
> -
>  	/* map all the memory banks */
>  	for_each_mem_range(i, &start, &end) {
>  		if (start >= end)
> @@ -570,19 +809,6 @@ static void __init map_mem(pgd_t *pgdp)
>  	 * in page granularity and put back unused memory to buddy system
>  	 * through /sys/kernel/kexec_crash_size interface.
>  	 */
> -#ifdef CONFIG_KEXEC_CORE
> -	if (crash_mem_map &&
> -	    !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
> -		if (crashk_res.end) {
> -			__map_memblock(pgdp, crashk_res.start,
> -				       crashk_res.end + 1,
> -				       PAGE_KERNEL,
> -				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> -			memblock_clear_nomap(crashk_res.start,
> -					     resource_size(&crashk_res));
> -		}
> -	}
> -#endif
>  }
>  
>  void mark_rodata_ro(void)
> -- 
> 1.8.3.1
> 

-- 
Sincerely yours,
Mike.


* Re: [PATCH] arm64: mm: fix linear mapping mem access performance degradation
       [not found]   ` <4d18d303-aeed-0beb-a8a4-32893f2d438d@linux.alibaba.com>
@ 2022-06-27  9:49     ` Mike Rapoport
  2022-06-27 10:46       ` guanghui.fgh
  0 siblings, 1 reply; 15+ messages in thread
From: Mike Rapoport @ 2022-06-27  9:49 UTC (permalink / raw)
  To: guanghui.fgh
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm

Please don't post HTML.

On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
> Thanks.
> 
> On 2022/6/27 14:34, Mike Rapoport wrote:
> 
>     On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
> 
>         The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>         (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>         use non block/section mapping(for crashkernel requires to shrink the region
>         in page granularity). But it will degrade performance when doing larging
>         continuous mem access in kernel(memcpy/memmove, etc).
> 
>         There are many changes and discussions:
>         commit 031495635b46
>         commit 1a8e1cef7603
>         commit 8424ecdde7df
>         commit 0a30c53573b0
>         commit 2687275a5843
> 
>     Please include oneline summary of the commit. (See section "Describe your
>     changes" in Documentation/process/submitting-patches.rst)
> 
> OK, I will add oneline summary in the git commit messages.
> 
>         This patch changes mem_map to use block/section mapping with crashkernel.
>         Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>         mem_map, reserve crashkernel memory. And then walking pagetable to split
>         block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>         for crashkernel mem.
> 
>     This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>     why is it Ok to change the way the memory is mapped with
>     ZONE_DMA/ZONE_DMA32 enabled.
> 
> In short:
>
> 1.building all avail mem with block/section mapping(normally 1G/2M) without
> inspecting crashkernel
> 2. Reserve crashkernel mem as same as previous doing
> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
> With this method, there are block/section mapping as more as possible.

This does not answer the question of why changing the way the memory is
mapped when ZONE_DMA/DMA32 and a crashkernel are present won't cause a
regression.

-- 
Sincerely yours,
Mike.


* Re: [PATCH] arm64: mm: fix linear mapping mem access performance degradation
  2022-06-27  9:49     ` Mike Rapoport
@ 2022-06-27 10:46       ` guanghui.fgh
  2022-06-27 12:06         ` Leizhen (ThunderTown)
  2022-06-27 16:49         ` Mike Rapoport
  0 siblings, 2 replies; 15+ messages in thread
From: guanghui.fgh @ 2022-06-27 10:46 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm



On 2022/6/27 17:49, Mike Rapoport wrote:
> Please don't post HTML.
> 
> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>> Thanks.
>>
>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>
>>      On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>
>>          The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>          (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>          use non block/section mapping(for crashkernel requires to shrink the region
>>          in page granularity). But it will degrade performance when doing larging
>>          continuous mem access in kernel(memcpy/memmove, etc).
>>
>>          There are many changes and discussions:
>>          commit 031495635b46
>>          commit 1a8e1cef7603
>>          commit 8424ecdde7df
>>          commit 0a30c53573b0
>>          commit 2687275a5843
>>
>>      Please include oneline summary of the commit. (See section "Describe your
>>      changes" in Documentation/process/submitting-patches.rst)
>>
>> OK, I will add oneline summary in the git commit messages.
>>
>>          This patch changes mem_map to use block/section mapping with crashkernel.
>>          Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>          mem_map, reserve crashkernel memory. And then walking pagetable to split
>>          block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>          for crashkernel mem.
>>
>>      This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>      why is it Ok to change the way the memory is mapped with
>>      ZONE_DMA/ZONE_DMA32 enabled.
>>
>> In short:
>>
>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>> inspecting crashkernel
>> 2. Reserve crashkernel mem as same as previous doing
>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>> With this method, there are block/section mapping as more as possible.
> 
> This does not answer the question why changing the way the memory is mapped
> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
> 
1.Quoted messages from arch/arm64/mm/init.c

"Memory reservation for crash kernel either done early or deferred
depending on DMA memory zones configs (ZONE_DMA) --

In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
here instead of max_zone_phys().  This lets early reservation of
crash kernel memory which has a dependency on arm64_dma_phys_limit.
Reserving memory early for crash kernel allows linear creation of block
mappings (greater than page-granularity) for all the memory bank rangs.
In this scheme a comparatively quicker boot is observed.

If ZONE_DMA configs are defined, crash kernel memory reservation
is delayed until DMA zone memory range size initialization performed in
zone_sizes_init().  The defer is necessary to steer clear of DMA zone
memory range to avoid overlap allocation.  So crash kernel memory 
boundaries are not known when mapping all bank memory ranges, which 
otherwise means not possible to exclude crash kernel range from creating 
block mappings so page-granularity mappings are created for the entire 
memory range."

Namely, the init order is: memblock init ---> linear mem mapping (4K
mappings for the crashkernel, since page-granularity changes are required
later) ---> zone DMA limit ---> reserve crashkernel.
So when ZONE_DMA is enabled and a crashkernel is used, the linear map ends
up with 4K mappings.

2. As mentioned above, when the linear map simply uses 4K mappings, there
are many dTLB misses (degrading performance). This patch uses block/section
mappings as far as possible, which improves performance.

3. This patch reserves the crashkernel the same way as before (keeping the
ZONE_DMA / crashkernel reservation order), and only changes the linear map
to use block/section mappings.
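
Put differently, the ordering with this patch becomes roughly (sketch only,
not verbatim code):

  /*
   * before (ZONE_DMA/DMA32 enabled):
   *   paging_init()  -> map_mem()              4K pages everywhere, because
   *                                            crashkernel bounds are unknown
   *   bootmem_init() -> zone_sizes_init()
   *                  -> reserve_crashkernel()
   *
   * with this patch:
   *   paging_init()  -> map_mem()              2M/1G block mappings everywhere
   *   bootmem_init() -> reserve_crashkernel()
   *                  -> mapping_crashkernel()  split only the crashkernel
   *                                            range down to 4K pages
   */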


* Re: [PATCH] arm64: mm: fix linear mapping mem access performance degradation
  2022-06-27 10:46       ` guanghui.fgh
@ 2022-06-27 12:06         ` Leizhen (ThunderTown)
  2022-06-27 12:25           ` guanghui.fgh
  2022-06-27 16:49         ` Mike Rapoport
  1 sibling, 1 reply; 15+ messages in thread
From: Leizhen (ThunderTown) @ 2022-06-27 12:06 UTC (permalink / raw)
  To: guanghui.fgh, Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, anshuman.khandual, linux-arm-kernel, linux-kernel,
	geert+renesas, ardb, linux-mm, bhe



On 2022/6/27 18:46, guanghui.fgh wrote:
> 
> 
>> On 2022/6/27 17:49, Mike Rapoport wrote:
>> Please don't post HTML.
>>
>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>> Thanks.
>>>
>>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>>
>>>      On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>
>>>          The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>          (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>          use non block/section mapping(for crashkernel requires to shrink the region
>>>          in page granularity). But it will degrade performance when doing larging
>>>          continuous mem access in kernel(memcpy/memmove, etc).
>>>
>>>          There are many changes and discussions:
>>>          commit 031495635b46
>>>          commit 1a8e1cef7603
>>>          commit 8424ecdde7df
>>>          commit 0a30c53573b0
>>>          commit 2687275a5843
>>>
>>>      Please include oneline summary of the commit. (See section "Describe your
>>>      changes" in Documentation/process/submitting-patches.rst)
>>>
>>> OK, I will add oneline summary in the git commit messages.
>>>
>>>          This patch changes mem_map to use block/section mapping with crashkernel.
>>>          Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>          mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>          block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>          for crashkernel mem.
>>>
>>>      This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>      why is it Ok to change the way the memory is mapped with
>>>      ZONE_DMA/ZONE_DMA32 enabled.
>>>
>>> In short:
>>>
>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>> inspecting crashkernel
>>> 2. Reserve crashkernel mem as same as previous doing
>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>> With this method, there are block/section mapping as more as possible.
>>
>> This does not answer the question why changing the way the memory is mapped
>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>
> 1.Quoted messages from arch/arm64/mm/init.c
> 
> "Memory reservation for crash kernel either done early or deferred
> depending on DMA memory zones configs (ZONE_DMA) --
> 
> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
> here instead of max_zone_phys().  This lets early reservation of
> crash kernel memory which has a dependency on arm64_dma_phys_limit.
> Reserving memory early for crash kernel allows linear creation of block
> mappings (greater than page-granularity) for all the memory bank rangs.
> In this scheme a comparatively quicker boot is observed.
> 
> If ZONE_DMA configs are defined, crash kernel memory reservation
> is delayed until DMA zone memory range size initialization performed in
> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
> memory range to avoid overlap allocation.  So crash kernel memory boundaries are not known when mapping all bank memory ranges, which otherwise means not possible to exclude crash kernel range from creating block mappings so page-granularity mappings are created for the entire memory range."
> 
> Namely, the init order: memblock init--->linear mem mapping(4k mapping for crashkernel, requirinig page-granularity changing))--->zone dma limit--->reserve crashkernel.
> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k mapping.
> 
> 2.As mentioned above, when linear mem use 4k mapping simply, there is high dtlb miss(degrade performance).
> This patch use block/section mapping as far as possible with performance improvement.
> 
> 3.This patch reserve crashkernel as same as the history(ZONE DMA & crashkernel reserving order), and only change the linear mem mapping to block/section mapping.
> .
> 

I think Mike Rapoport is probably asking you to confirm whether you've
taken things such as BBM (break-before-make) into account. For example, in
the following code we should prepare the next-level pgtable first, then
change the 2M block mapping to 4K page mappings, and flush the TLB at the
end (see the sketch after the quoted code below).

+static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
+				 unsigned long end, phys_addr_t phys,
+				 pgprot_t prot,
+				 phys_addr_t (*pgtable_alloc)(int), int flags)
+{
+	phys_addr_t map_offset;
+	unsigned long next;
+	pmd_t *pmdp;
+	pmdval_t pmdval;
+
+	pmdp = pmd_offset(pudp, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
+			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
+			pmd_clear(pmdp);
+			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
+			if (flags & NO_EXEC_MAPPINGS)
+				pmdval |= PMD_TABLE_PXN;
+			__pmd_populate(pmdp, pte_phys, pmdval);
+			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);

The pgtable is empty at this point. However, memory in this block other than the crashkernel may still be being accessed.

+
+			map_offset = addr - (addr & PMD_MASK);
+			if (map_offset)
+			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
+						phys - map_offset, prot,
+						pgtable_alloc, flags);
+
+			if (next < (addr & PMD_MASK) + PMD_SIZE)
+			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
+						PUD_SIZE, next - addr + phys,
+						prot, pgtable_alloc, flags);
+		}
+		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
+					   pgtable_alloc, flags);
+		phys += next - addr;
+	} while (pmdp++, addr = next, addr != end);
+}
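
That is, build the new table up front and then do the usual break-before-make
sequence, e.g. (only a sketch of the ordering, untested; variable names are
illustrative):

	/* 1. Build the complete next-level table for the whole block first. */
	pte_phys = pgtable_alloc(PAGE_SHIFT);
	ptep = (pte_t *)__phys_to_virt(pte_phys);
	for (i = 0; i < PTRS_PER_PTE; i++)
		set_pte(ptep + i,
			pfn_pte(__phys_to_pfn(blk_phys + i * PAGE_SIZE), prot));

	/* 2. Break-before-make: clear the block entry, invalidate, install. */
	pmd_clear(pmdp);
	flush_tlb_kernel_range(blk_addr, blk_addr + PMD_SIZE);
	__pmd_populate(pmdp, pte_phys, pmdval);

Only after that would the crashkernel sub-range be rewritten with its new
page-granular attributes.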

-- 
Regards,
  Zhen Lei


* Re: [PATCH] arm64: mm: fix linear mapping mem access performance degradation
  2022-06-27 12:06         ` Leizhen (ThunderTown)
@ 2022-06-27 12:25           ` guanghui.fgh
  2022-06-28  1:34             ` Leizhen (ThunderTown)
  0 siblings, 1 reply; 15+ messages in thread
From: guanghui.fgh @ 2022-06-27 12:25 UTC (permalink / raw)
  To: Leizhen (ThunderTown), Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, anshuman.khandual, linux-arm-kernel, linux-kernel,
	geert+renesas, ardb, linux-mm, bhe

Thanks.

On 2022/6/27 20:06, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/6/27 18:46, guanghui.fgh wrote:
>>
>>
>> On 2022/6/27 17:49, Mike Rapoport wrote:
>>> Please don't post HTML.
>>>
>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>> Thanks.
>>>>
>>>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>>>
>>>>       On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>
>>>>           The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>>           (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>>           use non block/section mapping(for crashkernel requires to shrink the region
>>>>           in page granularity). But it will degrade performance when doing larging
>>>>           continuous mem access in kernel(memcpy/memmove, etc).
>>>>
>>>>           There are many changes and discussions:
>>>>           commit 031495635b46
>>>>           commit 1a8e1cef7603
>>>>           commit 8424ecdde7df
>>>>           commit 0a30c53573b0
>>>>           commit 2687275a5843
>>>>
>>>>       Please include oneline summary of the commit. (See section "Describe your
>>>>       changes" in Documentation/process/submitting-patches.rst)
>>>>
>>>> OK, I will add oneline summary in the git commit messages.
>>>>
>>>>           This patch changes mem_map to use block/section mapping with crashkernel.
>>>>           Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>>           mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>>           block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>>           for crashkernel mem.
>>>>
>>>>       This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>>       why is it Ok to change the way the memory is mapped with
>>>>       ZONE_DMA/ZONE_DMA32 enabled.
>>>>
>>>> In short:
>>>>
>>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>>> inspecting crashkernel
>>>> 2. Reserve crashkernel mem as same as previous doing
>>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>>> With this method, there are block/section mapping as more as possible.
>>>
>>> This does not answer the question why changing the way the memory is mapped
>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>
>> 1.Quoted messages from arch/arm64/mm/init.c
>>
>> "Memory reservation for crash kernel either done early or deferred
>> depending on DMA memory zones configs (ZONE_DMA) --
>>
>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>> here instead of max_zone_phys().  This lets early reservation of
>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>> Reserving memory early for crash kernel allows linear creation of block
>> mappings (greater than page-granularity) for all the memory bank rangs.
>> In this scheme a comparatively quicker boot is observed.
>>
>> If ZONE_DMA configs are defined, crash kernel memory reservation
>> is delayed until DMA zone memory range size initialization performed in
>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>> memory range to avoid overlap allocation.  So crash kernel memory boundaries are not known when mapping all bank memory ranges, which otherwise means not possible to exclude crash kernel range from creating block mappings so page-granularity mappings are created for the entire memory range."
>>
>> Namely, the init order: memblock init--->linear mem mapping(4k mapping for crashkernel, requirinig page-granularity changing))--->zone dma limit--->reserve crashkernel.
>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k mapping.
>>
>> 2.As mentioned above, when linear mem use 4k mapping simply, there is high dtlb miss(degrade performance).
>> This patch use block/section mapping as far as possible with performance improvement.
>>
>> 3.This patch reserve crashkernel as same as the history(ZONE DMA & crashkernel reserving order), and only change the linear mem mapping to block/section mapping.
>> .
>>
> 
> I think Mike Rapoport's probably asking you to answer whether you've
> taken into account such as BBM. For example, the following code:
> we should prepare the next level pgtable first, then change 2M block
> mapping to 4K page mapping, and flush TLB at the end.
> 
> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
> +				 unsigned long end, phys_addr_t phys,
> +				 pgprot_t prot,
> +				 phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> +	phys_addr_t map_offset;
> +	unsigned long next;
> +	pmd_t *pmdp;
> +	pmdval_t pmdval;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> +			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> +			pmd_clear(pmdp);
> +			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> +			if (flags & NO_EXEC_MAPPINGS)
> +				pmdval |= PMD_TABLE_PXN;
> +			__pmd_populate(pmdp, pte_phys, pmdval);
> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> 
> The pgtable is empty now. However, memory other than crashkernel may be being accessed.
1. When reserving the crashkernel and remapping the linear map, only the
boot CPU is running; there is no other CPU/thread running at the same time.

2. When clearing the block/section mapping, I flush the TLB with
flush_tlb_kernel_range(). Afterwards the 4K mapping is rebuilt (I think no
further TLB flush is needed for that).

> 
> +
> +			map_offset = addr - (addr & PMD_MASK);
> +			if (map_offset)
> +			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> +						phys - map_offset, prot,
> +						pgtable_alloc, flags);
> +
> +			if (next < (addr & PMD_MASK) + PMD_SIZE)
> +			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
> +						PUD_SIZE, next - addr + phys,
> +						prot, pgtable_alloc, flags);
> +		}
> +		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
> +					   pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pmdp++, addr = next, addr != end);
> +}
> 


* Re: [PATCH] arm64: mm: fix linear mapping mem access performance degradation
  2022-06-27 10:46       ` guanghui.fgh
  2022-06-27 12:06         ` Leizhen (ThunderTown)
@ 2022-06-27 16:49         ` Mike Rapoport
  2022-06-28  1:50           ` Kefeng Wang
  2022-06-28  2:55           ` guanghui.fgh
  1 sibling, 2 replies; 15+ messages in thread
From: Mike Rapoport @ 2022-06-27 16:49 UTC (permalink / raw)
  To: guanghui.fgh
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm

On Mon, Jun 27, 2022 at 06:46:50PM +0800, guanghui.fgh wrote:
> On 2022/6/27 17:49, Mike Rapoport wrote:
> > On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
> > > On 2022/6/27 14:34, Mike Rapoport wrote:
> > > 
> > >      On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
> > > 
> > >          The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
> > >          (enable crashkernel, disable rodata full, disable kfence), the mem_map will
> > >          use non block/section mapping(for crashkernel requires to shrink the region
> > >          in page granularity). But it will degrade performance when doing larging
> > >          continuous mem access in kernel(memcpy/memmove, etc).
> > > 
> > >          There are many changes and discussions:
> > >          commit 031495635b46
> > >          commit 1a8e1cef7603
> > >          commit 8424ecdde7df
> > >          commit 0a30c53573b0
> > >          commit 2687275a5843
> > > 
> > >      Please include oneline summary of the commit. (See section "Describe your
> > >      changes" in Documentation/process/submitting-patches.rst)
> > > 
> > > OK, I will add oneline summary in the git commit messages.
> > > 
> > >          This patch changes mem_map to use block/section mapping with crashkernel.
> > >          Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
> > >          mem_map, reserve crashkernel memory. And then walking pagetable to split
> > >          block/section mapping to non block/section mapping(normally 4K) [[[only]]]
> > >          for crashkernel mem.
> > > 
> > >      This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
> > >      why is it Ok to change the way the memory is mapped with
> > >      ZONE_DMA/ZONE_DMA32 enabled.
> > > 
> > > In short:
> > > 
> > > 1.building all avail mem with block/section mapping(normally 1G/2M) without
> > > inspecting crashkernel
> > > 2. Reserve crashkernel mem as same as previous doing
> > > 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
> > > With this method, there are block/section mapping as more as possible.
> > 
> > This does not answer the question why changing the way the memory is mapped
> > when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
> > 
> 1.Quoted messages from arch/arm64/mm/init.c
> 
> "Memory reservation for crash kernel either done early or deferred
> depending on DMA memory zones configs (ZONE_DMA) --
> 
> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
> here instead of max_zone_phys().  This lets early reservation of
> crash kernel memory which has a dependency on arm64_dma_phys_limit.
> Reserving memory early for crash kernel allows linear creation of block
> mappings (greater than page-granularity) for all the memory bank rangs.
> In this scheme a comparatively quicker boot is observed.
> 
> If ZONE_DMA configs are defined, crash kernel memory reservation
> is delayed until DMA zone memory range size initialization performed in
> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
> memory range to avoid overlap allocation.  So crash kernel memory boundaries
> are not known when mapping all bank memory ranges, which otherwise means not
> possible to exclude crash kernel range from creating block mappings so
> page-granularity mappings are created for the entire memory range."
> 
> Namely, the init order: memblock init--->linear mem mapping(4k mapping for
> crashkernel, requirinig page-granularity changing))--->zone dma
> limit--->reserve crashkernel.
> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k
> mapping.
> 
> 2.As mentioned above, when linear mem use 4k mapping simply, there is high
> dtlb miss(degrade performance).
> This patch use block/section mapping as far as possible with performance
> improvement.
> 
> 3.This patch reserve crashkernel as same as the history(ZONE DMA &
> crashkernel reserving order), and only change the linear mem mapping to
> block/section mapping.

Thank you for the explanation. 

So if I understand correctly, you suggest reserving the crash kernel memory
late regardless of the ZONE_DMA/32 configuration, and then mapping that
memory with page-level mappings after it has been reserved.

This makes sense, but I think the patch has a few shortcomings that need to
be addressed.

> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
> ---
>  arch/arm64/include/asm/mmu.h |   1 +
>  arch/arm64/mm/init.c         |   8 +-
>  arch/arm64/mm/mmu.c          | 274 +++++++++++++++++++++++++++++++++++++++----
>  3 files changed, 253 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 48f8466..df113cc 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
>  extern void arm64_memblock_init(void);
>  extern void paging_init(void);
>  extern void bootmem_init(void);
> +extern void mapping_crashkernel(void);

I think map_crashkernel() would be a better name.

>  extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
>  extern void init_mem_pgprot(void);
>  extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 339ee84..0e7540b 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -388,10 +388,6 @@ void __init arm64_memblock_init(void)
>  	}
>  
>  	early_init_fdt_scan_reserved_mem();
> -
> -	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
> -		reserve_crashkernel();
> -
>  	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
>  }
>  
> @@ -438,8 +434,8 @@ void __init bootmem_init(void)
>  	 * request_standard_resources() depends on crashkernel's memory being
>  	 * reserved, so do it here.
>  	 */
> -	if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
> -		reserve_crashkernel();
> +	reserve_crashkernel();
> +	mapping_crashkernel();

This can be called from reserve_crashkernel() directly.
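E.g. something along these lines (untested sketch, using the
map_crashkernel() name suggested above; the assignments are from the
existing reserve_crashkernel()):

	static void __init reserve_crashkernel(void)
	{
		...
		crashk_res.start = crash_base;
		crashk_res.end = crash_base + crash_size - 1;

		/* remap the just-reserved range with page granularity */
		map_crashkernel();
	}

That way the remapping can never be forgotten after the reservation.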
 
>  	memblock_dump_all();
>  }
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 626ec32..0672afd 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -498,6 +498,256 @@ static int __init enable_crash_mem_map(char *arg)
>  }
>  early_param("crashkernel", enable_crash_mem_map);
>  
> +#ifdef CONFIG_KEXEC_CORE
> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
> +{
> +	phys_addr_t phys;
> +	void *ptr;
> +
> +	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
> +					 MEMBLOCK_ALLOC_NOLEAKTRACE);
> +	if (!phys)
> +		panic("Failed to allocate page table page\n");
> +
> +	ptr = (void *)__phys_to_virt(phys);
> +	memset(ptr, 0, PAGE_SIZE);
> +	return phys;
> +}
> +
> +static void init_crashkernel_pte(pmd_t *pmdp, unsigned long addr,
> +				 unsigned long end,
> +				 phys_addr_t phys, pgprot_t prot)
> +{
> +	pte_t *ptep;
> +	ptep = pte_offset_kernel(pmdp, addr);
> +	do {
> +		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
> +		phys += PAGE_SIZE;
> +	} while (ptep++, addr += PAGE_SIZE, addr != end);
> +}
> +
> +static void alloc_crashkernel_cont_pte(pmd_t *pmdp, unsigned long addr,
> +				       unsigned long end, phys_addr_t phys,
> +				       pgprot_t prot,
> +				       phys_addr_t (*pgtable_alloc)(int),
> +				       int flags)
> +{
> +	unsigned long next;
> +	pmd_t pmd = READ_ONCE(*pmdp);
> +
> +	BUG_ON(pmd_sect(pmd));
> +	if (pmd_none(pmd)) {
> +		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> +		phys_addr_t pte_phys;
> +
> +		if (flags & NO_EXEC_MAPPINGS)
> +			pmdval |= PMD_TABLE_PXN;
> +		BUG_ON(!pgtable_alloc);
> +		pte_phys = pgtable_alloc(PAGE_SHIFT);
> +		__pmd_populate(pmdp, pte_phys, pmdval);
> +		pmd = READ_ONCE(*pmdp);
> +	}
> +	BUG_ON(pmd_bad(pmd));
> +
> +	do {
> +		pgprot_t __prot = prot;
> +		next = pte_cont_addr_end(addr, end);
> +		init_crashkernel_pte(pmdp, addr, next, phys, __prot);
> +		phys += next - addr;
> +	} while (addr = next, addr != end);
> +}
> +
> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
> +				 unsigned long end, phys_addr_t phys,
> +				 pgprot_t prot,
> +				 phys_addr_t (*pgtable_alloc)(int), int flags)
> +{
> +	phys_addr_t map_offset;
> +	unsigned long next;
> +	pmd_t *pmdp;
> +	pmdval_t pmdval;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
> +			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
> +			pmd_clear(pmdp);
> +			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
> +			if (flags & NO_EXEC_MAPPINGS)
> +				pmdval |= PMD_TABLE_PXN;
> +			__pmd_populate(pmdp, pte_phys, pmdval);
> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> +			map_offset = addr - (addr & PMD_MASK);
> +			if (map_offset)
> +			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
> +						phys - map_offset, prot,
> +						pgtable_alloc, flags);
> +
> +			if (next < (addr & PMD_MASK) + PMD_SIZE)
> +			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
> +						PUD_SIZE, next - addr + phys,
> +						prot, pgtable_alloc, flags);
> +		}
> +		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
> +					   pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pmdp++, addr = next, addr != end);
> +}
> +
> +static void alloc_crashkernel_cont_pmd(pud_t *pudp, unsigned long addr,
> +				       unsigned long end, phys_addr_t phys,
> +				       pgprot_t prot,
> +				       phys_addr_t (*pgtable_alloc)(int),
> +				       int flags)
> +{
> +	unsigned long next;
> +	pud_t pud = READ_ONCE(*pudp);
> +
> +	/*
> +	 * Check for initial section mappings in the pgd/pud.
> +	 */
> +	BUG_ON(pud_sect(pud));
> +	if (pud_none(pud)) {
> +		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
> +		phys_addr_t pmd_phys;
> +
> +		if (flags & NO_EXEC_MAPPINGS)
> +			pudval |= PUD_TABLE_PXN;
> +		BUG_ON(!pgtable_alloc);
> +		pmd_phys = pgtable_alloc(PMD_SHIFT);
> +		__pud_populate(pudp, pmd_phys, pudval);
> +		pud = READ_ONCE(*pudp);
> +	}
> +	BUG_ON(pud_bad(pud));
> +
> +	do {
> +		pgprot_t __prot = prot;
> +		next = pmd_cont_addr_end(addr, end);
> +		init_crashkernel_pmd(pudp, addr, next, phys, __prot,
> +				     pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (addr = next, addr != end);
> +}
> +
> +static void alloc_crashkernel_pud(pgd_t *pgdp, unsigned long addr,
> +				  unsigned long end, phys_addr_t phys,
> +				  pgprot_t prot,
> +				  phys_addr_t (*pgtable_alloc)(int),
> +				  int flags)
> +{
> +	phys_addr_t map_offset;
> +	unsigned long next;
> +	pud_t *pudp;
> +	p4d_t *p4dp = p4d_offset(pgdp, addr);
> +	p4d_t p4d = READ_ONCE(*p4dp);
> +	pudval_t pudval;
> +
> +	if (p4d_none(p4d)) {
> +		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
> +		phys_addr_t pud_phys;
> +
> +		if (flags & NO_EXEC_MAPPINGS)
> +			p4dval |= P4D_TABLE_PXN;
> +		BUG_ON(!pgtable_alloc);
> +		pud_phys = pgtable_alloc(PUD_SHIFT);
> +		__p4d_populate(p4dp, pud_phys, p4dval);
> +		p4d = READ_ONCE(*p4dp);
> +	}
> +	BUG_ON(p4d_bad(p4d));
> +
> +	/*
> +	 * No need for locking during early boot. And it doesn't work as
> +	 * expected with KASLR enabled.
> +	 */
> +	if (system_state != SYSTEM_BOOTING)
> +		mutex_lock(&fixmap_lock);
> +	pudp = pud_offset(p4dp, addr);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (!pud_none(*pudp) && pud_sect(*pudp)) {
> +			phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
> +			pud_clear(pudp);
> +
> +			pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
> +			if (flags & NO_EXEC_MAPPINGS)
> +				pudval |= PUD_TABLE_PXN;
> +			__pud_populate(pudp, pmd_phys, pudval);
> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +
> +			map_offset = addr - (addr & PUD_MASK);
> +			if (map_offset)
> +			    alloc_init_cont_pmd(pudp, addr & PUD_MASK, addr,
> +						phys - map_offset, prot,
> +						pgtable_alloc, flags);
> +
> +			if (next < (addr & PUD_MASK) + PUD_SIZE)
> +			    alloc_init_cont_pmd(pudp, next, (addr & PUD_MASK) +
> +						PUD_SIZE, next - addr + phys,
> +						prot, pgtable_alloc, flags);
> +		}
> +		alloc_crashkernel_cont_pmd(pudp, addr, next, phys, prot,
> +					   pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pudp++, addr = next, addr != end);
> +
> +	if (system_state != SYSTEM_BOOTING)
> +		mutex_unlock(&fixmap_lock);
> +}
> +
> +static void __create_crashkernel_mapping(pgd_t *pgdir, phys_addr_t phys,
> +					 unsigned long virt, phys_addr_t size,
> +					 pgprot_t prot,
> +					 phys_addr_t (*pgtable_alloc)(int),
> +					 int flags)
> +{
> +	unsigned long addr, end, next;
> +	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
> +
> +	/*
> +	 * If the virtual and physical address don't have the same offset
> +	 * within a page, we cannot map the region as the caller expects.
> +	 */
> +	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
> +		return;
> +
> +	phys &= PAGE_MASK;
> +	addr = virt & PAGE_MASK;
> +	end = PAGE_ALIGN(virt + size);
> +
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		alloc_crashkernel_pud(pgdp, addr, next, phys, prot,
> +				      pgtable_alloc, flags);
> +		phys += next - addr;
> +	} while (pgdp++, addr = next, addr != end);
> +}

__create_crashkernel_mapping() and the helpers it uses very much resemble
__create_pgd_mapping() and its helpers. I think a cleaner way would be to
clear the section mappings and then call __create_pgd_mapping() for the
crash kernel memory with block/section mappings disabled.
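A rough sketch of that approach, reusing the existing helpers in arch/arm64/mm/mmu.c (unmap_hotplug_range() is shown here only to illustrate clearing the old entries and is currently built only under CONFIG_MEMORY_HOTPLUG; whether this sequencing is safe against concurrent accesses is the BBM question raised later in the thread):

static void __init map_crashkernel(void)
{
	phys_addr_t start = crashk_res.start;
	phys_addr_t size = resource_size(&crashk_res);

	/* Remove the block/section entries covering the reserved range. */
	unmap_hotplug_range(__phys_to_virt(start),
			    __phys_to_virt(start) + size, false, NULL);

	/* Recreate the range with page-granular mappings only. */
	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     size, PAGE_KERNEL, early_pgtable_alloc,
			     NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
}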

> +static void __init map_crashkernel(pgd_t *pgdp, phys_addr_t start,
> +				   phys_addr_t end, pgprot_t prot, int flags)
> +{
> +	__create_crashkernel_mapping(pgdp, start, __phys_to_virt(start),
> +				     end - start, prot,
> +				     early_crashkernel_pgtable_alloc, flags);
> +}

No need for this, you can call __create_crashkernel_mapping() directly from
mapping_crashkernel().

> +
> +void __init mapping_crashkernel(void)
> +{
> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
> +	    return;
> +
> +	if (!crash_mem_map || !crashk_res.end)
> +	    return;
> +
> +	map_crashkernel(swapper_pg_dir, crashk_res.start,
> +			crashk_res.end + 1, PAGE_KERNEL,
> +			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> +}
> +#else
> +void __init mapping_crashkernel(void) {}
> +#endif
> +
>  static void __init map_mem(pgd_t *pgdp)
>  {
>  	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
> @@ -527,17 +777,6 @@ static void __init map_mem(pgd_t *pgdp)
>  	 */
>  	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>  
> -#ifdef CONFIG_KEXEC_CORE
> -	if (crash_mem_map) {
> -		if (IS_ENABLED(CONFIG_ZONE_DMA) ||
> -		    IS_ENABLED(CONFIG_ZONE_DMA32))
> -			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
> -		else if (crashk_res.end)
> -			memblock_mark_nomap(crashk_res.start,
> -			    resource_size(&crashk_res));
> -	}
> -#endif
> -
>  	/* map all the memory banks */
>  	for_each_mem_range(i, &start, &end) {
>  		if (start >= end)
> @@ -570,19 +809,6 @@ static void __init map_mem(pgd_t *pgdp)
>  	 * in page granularity and put back unused memory to buddy system
>  	 * through /sys/kernel/kexec_crash_size interface.
>  	 */
> -#ifdef CONFIG_KEXEC_CORE
> -	if (crash_mem_map &&
> -	    !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
> -		if (crashk_res.end) {
> -			__map_memblock(pgdp, crashk_res.start,
> -				       crashk_res.end + 1,
> -				       PAGE_KERNEL,
> -				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> -			memblock_clear_nomap(crashk_res.start,
> -					     resource_size(&crashk_res));
> -		}
> -	}
> -#endif
>  }
>  
>  void mark_rodata_ro(void)
> -- 
> 1.8.3.1


-- 
Sincerely yours,
Mike.


* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-27 12:25           ` guanghui.fgh
@ 2022-06-28  1:34             ` Leizhen (ThunderTown)
  2022-06-28  3:06               ` guanghui.fgh
  0 siblings, 1 reply; 15+ messages in thread
From: Leizhen (ThunderTown) @ 2022-06-28  1:34 UTC (permalink / raw)
  To: guanghui.fgh, Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, anshuman.khandual, linux-arm-kernel, linux-kernel,
	geert+renesas, ardb, linux-mm, bhe



On 2022/6/27 20:25, guanghui.fgh wrote:
> Thanks.
> 
>> On 2022/6/27 20:06, Leizhen (ThunderTown) wrote:
>>
>>
>> On 2022/6/27 18:46, guanghui.fgh wrote:
>>>
>>>
>>> On 2022/6/27 17:49, Mike Rapoport wrote:
>>>> Please don't post HTML.
>>>>
>>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>>> Thanks.
>>>>>
>>>>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>>>>
>>>>>       On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>>
>>>>>           The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>>>           (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>>>           use non block/section mapping(for crashkernel requires to shrink the region
>>>>>           in page granularity). But it will degrade performance when doing larging
>>>>>           continuous mem access in kernel(memcpy/memmove, etc).
>>>>>
>>>>>           There are many changes and discussions:
>>>>>           commit 031495635b46
>>>>>           commit 1a8e1cef7603
>>>>>           commit 8424ecdde7df
>>>>>           commit 0a30c53573b0
>>>>>           commit 2687275a5843
>>>>>
>>>>>       Please include oneline summary of the commit. (See section "Describe your
>>>>>       changes" in Documentation/process/submitting-patches.rst)
>>>>>
>>>>> OK, I will add oneline summary in the git commit messages.
>>>>>
>>>>>           This patch changes mem_map to use block/section mapping with crashkernel.
>>>>>           Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>>>           mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>>>           block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>>>           for crashkernel mem.
>>>>>
>>>>>       This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>>>       why is it Ok to change the way the memory is mapped with
>>>>>       ZONE_DMA/ZONE_DMA32 enabled.
>>>>>
>>>>> In short:
>>>>>
>>>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>>>> inspecting crashkernel
>>>>> 2. Reserve crashkernel mem as same as previous doing
>>>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>>>> With this method, there are block/section mapping as more as possible.
>>>>
>>>> This does not answer the question why changing the way the memory is mapped
>>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>>
>>> 1.Quoted messages from arch/arm64/mm/init.c
>>>
>>> "Memory reservation for crash kernel either done early or deferred
>>> depending on DMA memory zones configs (ZONE_DMA) --
>>>
>>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>>> here instead of max_zone_phys().  This lets early reservation of
>>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>>> Reserving memory early for crash kernel allows linear creation of block
>>> mappings (greater than page-granularity) for all the memory bank rangs.
>>> In this scheme a comparatively quicker boot is observed.
>>>
>>> If ZONE_DMA configs are defined, crash kernel memory reservation
>>> is delayed until DMA zone memory range size initialization performed in
>>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>>> memory range to avoid overlap allocation.  So crash kernel memory boundaries are not known when mapping all bank memory ranges, which otherwise means not possible to exclude crash kernel range from creating block mappings so page-granularity mappings are created for the entire memory range."
>>>
>>> Namely, the init order: memblock init--->linear mem mapping(4k mapping for crashkernel, requirinig page-granularity changing))--->zone dma limit--->reserve crashkernel.
>>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k mapping.
>>>
>>> 2.As mentioned above, when linear mem use 4k mapping simply, there is high dtlb miss(degrade performance).
>>> This patch use block/section mapping as far as possible with performance improvement.
>>>
>>> 3.This patch reserve crashkernel as same as the history(ZONE DMA & crashkernel reserving order), and only change the linear mem mapping to block/section mapping.
>>> .
>>>
>>
>> I think Mike Rapoport's probably asking you to answer whether you've
>> taken into account such as BBM. For example, the following code:
>> we should prepare the next level pgtable first, then change 2M block
>> mapping to 4K page mapping, and flush TLB at the end.
>> > +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>> +                 unsigned long end, phys_addr_t phys,
>> +                 pgprot_t prot,
>> +                 phys_addr_t (*pgtable_alloc)(int), int flags)
>> +{
>> +    phys_addr_t map_offset;
>> +    unsigned long next;
>> +    pmd_t *pmdp;
>> +    pmdval_t pmdval;
>> +
>> +    pmdp = pmd_offset(pudp, addr);
>> +    do {
>> +        next = pmd_addr_end(addr, end);
>> +        if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>> +            phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +            pmd_clear(pmdp);
>> +            pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> +            if (flags & NO_EXEC_MAPPINGS)
>> +                pmdval |= PMD_TABLE_PXN;
>> +            __pmd_populate(pmdp, pte_phys, pmdval);
>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>
>> The pgtable is empty now. However, memory other than crashkernel may be being accessed.
> 1.When reserving crashkernel and remapping linear mem mapping, there is only one boot cpu running. There is no other cpu/thread running at the same time.

So, put this in the code comment?

Considering scalability and unpredictable future changes (for example, other
modules may also need this mapping function), it would be better to handle
BBM now and make this helper public.
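For reference, a minimal sketch of that ordering (illustrative only; split_pmd_block() is a made-up name, the helpers are the existing ones in mmu.c, and it follows the populate-first, switch, flush-last order described above rather than a strict break-before-make sequence):

static void __init split_pmd_block(pmd_t *pmdp, unsigned long addr,
				   pgprot_t prot)
{
	phys_addr_t pte_phys = early_pgtable_alloc(PAGE_SHIFT);
	pte_t *ptep = (pte_t *)__phys_to_virt(pte_phys);
	phys_addr_t phys = __pfn_to_phys(pmd_pfn(READ_ONCE(*pmdp)));
	int i;

	/* 1. Fully populate the replacement PTE table first. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		set_pte(&ptep[i],
			pfn_pte(__phys_to_pfn(phys + i * PAGE_SIZE), prot));

	/* 2. Switch the PMD entry from the 2M block to the new table. */
	__pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);

	/* 3. Flush stale translations for the whole 2M range at the end. */
	flush_tlb_kernel_range(addr & PMD_MASK, (addr & PMD_MASK) + PMD_SIZE);
}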


> 
> 2.When clearing block/section mapping, I have flush tlb by flush_tlb_kernel_range. Afterwards rebuilt 4k mapping(I think it's no need flush tlb).


> 
>>
>> +
>> +            map_offset = addr - (addr & PMD_MASK);
>> +            if (map_offset)
>> +                alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>> +                        phys - map_offset, prot,
>> +                        pgtable_alloc, flags);
>> +
>> +            if (next < (addr & PMD_MASK) + PMD_SIZE)
>> +                alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>> +                        PUD_SIZE, next - addr + phys,
>> +                        prot, pgtable_alloc, flags);

Here and in alloc_crashkernel_pud(), the raw flags should be used; they may not contain (NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS).
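For example (hypothetical adjustment of the call shown above):

	/*
	 * The margins outside the crash kernel range need not inherit the
	 * page-granularity restriction from the caller.
	 */
	alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
			    phys - map_offset, prot, pgtable_alloc,
			    flags & ~(NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS));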

>> +        }
>> +        alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>> +                       pgtable_alloc, flags);
>> +        phys += next - addr;
>> +    } while (pmdp++, addr = next, addr != end);
>> +}
>>
> .
> 

-- 
Regards,
  Zhen Lei


* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-26 11:10 [PATCH] arm64: mm: fix linear mapping mem access performace degradation Guanghui Feng
  2022-06-27  6:34 ` Mike Rapoport
@ 2022-06-28  1:40 ` Leizhen (ThunderTown)
  1 sibling, 0 replies; 15+ messages in thread
From: Leizhen (ThunderTown) @ 2022-06-28  1:40 UTC (permalink / raw)
  To: Guanghui Feng, baolin.wang, catalin.marinas, will, akpm, david,
	jianyong.wu, james.morse, quic_qiancai, christophe.leroy,
	jonathan, mark.rutland, chenzhou10, anshuman.khandual,
	linux-arm-kernel, linux-kernel, rppt, geert+renesas, ardb,
	linux-mm



On 2022/6/26 19:10, Guanghui Feng wrote:
> +void __init mapping_crashkernel(void)
> +{
> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
> +	    return;
> +
> +	if (!crash_mem_map || !crashk_res.end)
> +	    return;

All the code related to crash_mem_map can be removed now.

> +
> +	map_crashkernel(swapper_pg_dir, crashk_res.start,
> +			crashk_res.end + 1, PAGE_KERNEL,
> +			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
> +}
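Following that, the entry checks in mapping_crashkernel() could then reduce to something like (sketch only):

	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
		return;

	if (!crashk_res.end)
		return;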

-- 
Regards,
  Zhen Lei


* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-27 16:49         ` Mike Rapoport
@ 2022-06-28  1:50           ` Kefeng Wang
  2022-06-28  3:22             ` guanghui.fgh
  2022-06-28  2:55           ` guanghui.fgh
  1 sibling, 1 reply; 15+ messages in thread
From: Kefeng Wang @ 2022-06-28  1:50 UTC (permalink / raw)
  To: Mike Rapoport, guanghui.fgh
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm


On 2022/6/28 0:49, Mike Rapoport wrote:
> On Mon, Jun 27, 2022 at 06:46:50PM +0800, guanghui.fgh wrote:
>> On 2022/6/27 17:49, Mike Rapoport wrote:
>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>>>
>>>>       On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>
>>>>           The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>>           (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>>           use non block/section mapping(for crashkernel requires to shrink the region
>>>>           in page granularity). But it will degrade performance when doing larging
>>>>           continuous mem access in kernel(memcpy/memmove, etc).
>>>>
>>>>           There are many changes and discussions:
>>>>           commit 031495635b46
>>>>           commit 1a8e1cef7603
>>>>           commit 8424ecdde7df
>>>>           commit 0a30c53573b0
>>>>           commit 2687275a5843
>>>>
>>>>       Please include oneline summary of the commit. (See section "Describe your
>>>>       changes" in Documentation/process/submitting-patches.rst)
>>>>
>>>> OK, I will add oneline summary in the git commit messages.
>>>>
>>>>           This patch changes mem_map to use block/section mapping with crashkernel.
>>>>           Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>>           mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>>           block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>>           for crashkernel mem.
>>>>
>>>>       This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>>       why is it Ok to change the way the memory is mapped with
>>>>       ZONE_DMA/ZONE_DMA32 enabled.
>>>>
>>>> In short:
>>>>
>>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>>> inspecting crashkernel
>>>> 2. Reserve crashkernel mem as same as previous doing
>>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>>> With this method, there are block/section mapping as more as possible.
>>> This does not answer the question why changing the way the memory is mapped
>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>
>> 1.Quoted messages from arch/arm64/mm/init.c
>>
>> "Memory reservation for crash kernel either done early or deferred
>> depending on DMA memory zones configs (ZONE_DMA) --
>>
>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>> here instead of max_zone_phys().  This lets early reservation of
>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>> Reserving memory early for crash kernel allows linear creation of block
>> mappings (greater than page-granularity) for all the memory bank rangs.
>> In this scheme a comparatively quicker boot is observed.
>>
>> If ZONE_DMA configs are defined, crash kernel memory reservation
>> is delayed until DMA zone memory range size initialization performed in
>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>> memory range to avoid overlap allocation.  So crash kernel memory boundaries
>> are not known when mapping all bank memory ranges, which otherwise means not
>> possible to exclude crash kernel range from creating block mappings so
>> page-granularity mappings are created for the entire memory range."
>>
>> Namely, the init order: memblock init--->linear mem mapping(4k mapping for
>> crashkernel, requirinig page-granularity changing))--->zone dma
>> limit--->reserve crashkernel.
>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k
>> mapping.
>>
>> 2.As mentioned above, when linear mem use 4k mapping simply, there is high
>> dtlb miss(degrade performance).
>> This patch use block/section mapping as far as possible with performance
>> improvement.
>>
>> 3.This patch reserve crashkernel as same as the history(ZONE DMA &
>> crashkernel reserving order), and only change the linear mem mapping to
>> block/section mapping.
> Thank you for the explanation.
>
> So if I understand correctly, you suggest to reserve the crash kernel
> memory late regardless of ZONE_DMA/32 configuration and then map that memory
> using page level mappings after the memory is reserved.
>
> This makes sense, but I think the patch has a few shortcomings than need to
> be addressed.
>
>> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
>> ---
>>   arch/arm64/include/asm/mmu.h |   1 +
>>   arch/arm64/mm/init.c         |   8 +-
>>   arch/arm64/mm/mmu.c          | 274 +++++++++++++++++++++++++++++++++++++++----
>>   3 files changed, 253 insertions(+), 30 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>> index 48f8466..df113cc 100644
>> --- a/arch/arm64/include/asm/mmu.h
>> +++ b/arch/arm64/include/asm/mmu.h
>> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
>>   extern void arm64_memblock_init(void);
>>   extern void paging_init(void);
>>   extern void bootmem_init(void);
>> +extern void mapping_crashkernel(void);
> I think map_crashkernel() would be a better name.

Or use a more generic name (for the other helpers too); it is only used for
the crashkernel for now, but there may be more use cases later.


>
>>   extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
>>   extern void init_mem_pgprot(void);
>>   extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 339ee84..0e7540b 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -388,10 +388,6 @@ void __init arm64_memblock_init(void)
>>   	}
>>   
>>   	early_init_fdt_scan_reserved_mem();
>> -
>> -	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
>> -		reserve_crashkernel();
>> -
>>   	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
>>   }
>>   
>> @@ -438,8 +434,8 @@ void __init bootmem_init(void)
>>   	 * request_standard_resources() depends on crashkernel's memory being
>>   	 * reserved, so do it here.
>>   	 */
>> -	if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
>> -		reserve_crashkernel();
>> +	reserve_crashkernel();
>> +	mapping_crashkernel();
> This can be called from reserve_crashkernel() directly.
>   
>>   	memblock_dump_all();
>>   }
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 626ec32..0672afd 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -498,6 +498,256 @@ static int __init enable_crash_mem_map(char *arg)
>>   }
>>   early_param("crashkernel", enable_crash_mem_map);
>>   
>> +#ifdef CONFIG_KEXEC_CORE
>> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
>> +{
>> +	phys_addr_t phys;
>> +	void *ptr;
>> +
>> +	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
>> +					 MEMBLOCK_ALLOC_NOLEAKTRACE);
>> +	if (!phys)
>> +		panic("Failed to allocate page table page\n");
>> +
>> +	ptr = (void *)__phys_to_virt(phys);
>> +	memset(ptr, 0, PAGE_SIZE);
Add a dsb() here to ensure the zeroed page is visible to the page table walker?
>> +	return phys;
>> +}
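A sketch of that suggestion, using the dsb() macro from asm/barrier.h (whether the barrier is actually needed here is the open question being asked):

	ptr = (void *)__phys_to_virt(phys);
	memset(ptr, 0, PAGE_SIZE);
	/*
	 * Make the zeroed table visible to the hardware walker before it
	 * is linked into the page tables.
	 */
	dsb(ishst);
	return phys;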
>> +
>> +static void init_crashkernel_pte(pmd_t *pmdp, unsigned long addr,
>> +				 unsigned long end,
>> +				 phys_addr_t phys, pgprot_t prot)
>> +{
>> +	pte_t *ptep;
>> +	ptep = pte_offset_kernel(pmdp, addr);
>> +	do {
>> +		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
>> +		phys += PAGE_SIZE;
>> +	} while (ptep++, addr += PAGE_SIZE, addr != end);
>> +}
>> +
>> +static void alloc_crashkernel_cont_pte(pmd_t *pmdp, unsigned long addr,
>> +				       unsigned long end, phys_addr_t phys,
>> +				       pgprot_t prot,
>> +				       phys_addr_t (*pgtable_alloc)(int),
>> +				       int flags)
>> +{
>> +	unsigned long next;
>> +	pmd_t pmd = READ_ONCE(*pmdp);
>> +
>> +	BUG_ON(pmd_sect(pmd));
>> +	if (pmd_none(pmd)) {
>> +		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> +		phys_addr_t pte_phys;
>> +
>> +		if (flags & NO_EXEC_MAPPINGS)
>> +			pmdval |= PMD_TABLE_PXN;
>> +		BUG_ON(!pgtable_alloc);
>> +		pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +		__pmd_populate(pmdp, pte_phys, pmdval);
>> +		pmd = READ_ONCE(*pmdp);
>> +	}
>> +	BUG_ON(pmd_bad(pmd));
>> +
>> +	do {
>> +		pgprot_t __prot = prot;
>> +		next = pte_cont_addr_end(addr, end);
>> +		init_crashkernel_pte(pmdp, addr, next, phys, __prot);
>> +		phys += next - addr;
>> +	} while (addr = next, addr != end);
>> +}
>> +
>> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>> +				 unsigned long end, phys_addr_t phys,
>> +				 pgprot_t prot,
>> +				 phys_addr_t (*pgtable_alloc)(int), int flags)
>> +{
>> +	phys_addr_t map_offset;
>> +	unsigned long next;
>> +	pmd_t *pmdp;
>> +	pmdval_t pmdval;
>> +
>> +	pmdp = pmd_offset(pudp, addr);
>> +	do {
>> +		next = pmd_addr_end(addr, end);
>> +		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>> +			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +			pmd_clear(pmdp);
>> +			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> +			if (flags & NO_EXEC_MAPPINGS)
>> +				pmdval |= PMD_TABLE_PXN;
>> +			__pmd_populate(pmdp, pte_phys, pmdval);
>> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> +
>> +			map_offset = addr - (addr & PMD_MASK);
>> +			if (map_offset)
>> +			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>> +						phys - map_offset, prot,
>> +						pgtable_alloc, flags);
>> +
>> +			if (next < (addr & PMD_MASK) + PMD_SIZE)
>> +			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>> +						PUD_SIZE, next - addr + phys,
>> +						prot, pgtable_alloc, flags);
>> +		}
>> +		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>> +					   pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (pmdp++, addr = next, addr != end);
>> +}
>> +
>> +static void alloc_crashkernel_cont_pmd(pud_t *pudp, unsigned long addr,
>> +				       unsigned long end, phys_addr_t phys,
>> +				       pgprot_t prot,
>> +				       phys_addr_t (*pgtable_alloc)(int),
>> +				       int flags)
>> +{
>> +	unsigned long next;
>> +	pud_t pud = READ_ONCE(*pudp);
>> +
>> +	/*
>> +	 * Check for initial section mappings in the pgd/pud.
>> +	 */
>> +	BUG_ON(pud_sect(pud));
>> +	if (pud_none(pud)) {
>> +		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
>> +		phys_addr_t pmd_phys;
>> +
>> +		if (flags & NO_EXEC_MAPPINGS)
>> +			pudval |= PUD_TABLE_PXN;
>> +		BUG_ON(!pgtable_alloc);
>> +		pmd_phys = pgtable_alloc(PMD_SHIFT);
>> +		__pud_populate(pudp, pmd_phys, pudval);
>> +		pud = READ_ONCE(*pudp);
>> +	}
>> +	BUG_ON(pud_bad(pud));
>> +
>> +	do {
>> +		pgprot_t __prot = prot;
>> +		next = pmd_cont_addr_end(addr, end);
>> +		init_crashkernel_pmd(pudp, addr, next, phys, __prot,
>> +				     pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (addr = next, addr != end);
>> +}
>> +
>> +static void alloc_crashkernel_pud(pgd_t *pgdp, unsigned long addr,
>> +				  unsigned long end, phys_addr_t phys,
>> +				  pgprot_t prot,
>> +				  phys_addr_t (*pgtable_alloc)(int),
>> +				  int flags)
>> +{
>> +	phys_addr_t map_offset;
>> +	unsigned long next;
>> +	pud_t *pudp;
>> +	p4d_t *p4dp = p4d_offset(pgdp, addr);
>> +	p4d_t p4d = READ_ONCE(*p4dp);
>> +	pudval_t pudval;
>> +
>> +	if (p4d_none(p4d)) {
>> +		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
>> +		phys_addr_t pud_phys;
>> +
>> +		if (flags & NO_EXEC_MAPPINGS)
>> +			p4dval |= P4D_TABLE_PXN;
>> +		BUG_ON(!pgtable_alloc);
>> +		pud_phys = pgtable_alloc(PUD_SHIFT);
>> +		__p4d_populate(p4dp, pud_phys, p4dval);
>> +		p4d = READ_ONCE(*p4dp);
>> +	}
>> +	BUG_ON(p4d_bad(p4d));
>> +
>> +	/*
>> +	 * No need for locking during early boot. And it doesn't work as
>> +	 * expected with KASLR enabled.
>> +	 */
>> +	if (system_state != SYSTEM_BOOTING)
>> +		mutex_lock(&fixmap_lock);
It seems there is no need to use fixmap_lock here.
>> +	pudp = pud_offset(p4dp, addr);
>> +	do {
>> +		next = pud_addr_end(addr, end);
>> +		if (!pud_none(*pudp) && pud_sect(*pudp)) {
>> +			phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
>> +			pud_clear(pudp);
>> +
>> +			pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
>> +			if (flags & NO_EXEC_MAPPINGS)
>> +				pudval |= PUD_TABLE_PXN;
>> +			__pud_populate(pudp, pmd_phys, pudval);
>> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> +
>> +			map_offset = addr - (addr & PUD_MASK);
>> +			if (map_offset)
>> +			    alloc_init_cont_pmd(pudp, addr & PUD_MASK, addr,
>> +						phys - map_offset, prot,
>> +						pgtable_alloc, flags);
>> +
>> +			if (next < (addr & PUD_MASK) + PUD_SIZE)
>> +			    alloc_init_cont_pmd(pudp, next, (addr & PUD_MASK) +
>> +						PUD_SIZE, next - addr + phys,
>> +						prot, pgtable_alloc, flags);
>> +		}
>> +		alloc_crashkernel_cont_pmd(pudp, addr, next, phys, prot,
>> +					   pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (pudp++, addr = next, addr != end);
>> +
>> +	if (system_state != SYSTEM_BOOTING)
>> +		mutex_unlock(&fixmap_lock);
>> +}
>> +
>> +static void __create_crashkernel_mapping(pgd_t *pgdir, phys_addr_t phys,
>> +					 unsigned long virt, phys_addr_t size,
>> +					 pgprot_t prot,
>> +					 phys_addr_t (*pgtable_alloc)(int),
>> +					 int flags)
>> +{
>> +	unsigned long addr, end, next;
>> +	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
>> +
>> +	/*
>> +	 * If the virtual and physical address don't have the same offset
>> +	 * within a page, we cannot map the region as the caller expects.
>> +	 */
>> +	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
>> +		return;
>> +
>> +	phys &= PAGE_MASK;
>> +	addr = virt & PAGE_MASK;
>> +	end = PAGE_ALIGN(virt + size);
>> +
>> +	do {
>> +		next = pgd_addr_end(addr, end);
>> +		alloc_crashkernel_pud(pgdp, addr, next, phys, prot,
>> +				      pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (pgdp++, addr = next, addr != end);
>> +}
> __create_crashkernel_mapping() and the helpers it uses resemble very much
> __create_pgd_mapping() and it's helpers. I think a cleaner way would be to
> clear section mappings and than call __create_pgd_mapping() for crash
> kernel memory with block/section mappings disabled.
Yes, there is too much duplicated code. If the handling of the crash reserve
memory is acceptable, we could find a better way (as Mike said) to deal with
the repetition.
Actually, we tried something like this before, but I am not sure it was the
right way, and we saw TLB conflicts. I will test this patch.
>> +static void __init map_crashkernel(pgd_t *pgdp, phys_addr_t start,
>> +				   phys_addr_t end, pgprot_t prot, int flags)
>> +{
>> +	__create_crashkernel_mapping(pgdp, start, __phys_to_virt(start),
>> +				     end - start, prot,
>> +				     early_crashkernel_pgtable_alloc, flags);
>> +}
> No need for this, you can call __create_crashkernel_mapping() directly from
> mapping_crashkernel().
>
>> +
>> +void __init mapping_crashkernel(void)
>> +{
>> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
>> +	    return;
>> +
>> +	if (!crash_mem_map || !crashk_res.end)
>> +	    return;
>> +
>> +	map_crashkernel(swapper_pg_dir, crashk_res.start,
>> +			crashk_res.end + 1, PAGE_KERNEL,
>> +			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>> +}
>> +#else
>> +void __init mapping_crashkernel(void) {}
>> +#endif
>> +
>>   static void __init map_mem(pgd_t *pgdp)
>>   {
>>   	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
>> @@ -527,17 +777,6 @@ static void __init map_mem(pgd_t *pgdp)
>>   	 */
>>   	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>>   
>> -#ifdef CONFIG_KEXEC_CORE
>> -	if (crash_mem_map) {
>> -		if (IS_ENABLED(CONFIG_ZONE_DMA) ||
>> -		    IS_ENABLED(CONFIG_ZONE_DMA32))
>> -			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>> -		else if (crashk_res.end)
>> -			memblock_mark_nomap(crashk_res.start,
>> -			    resource_size(&crashk_res));
>> -	}
>> -#endif
>> -
>>   	/* map all the memory banks */
>>   	for_each_mem_range(i, &start, &end) {
>>   		if (start >= end)
>> @@ -570,19 +809,6 @@ static void __init map_mem(pgd_t *pgdp)
>>   	 * in page granularity and put back unused memory to buddy system
>>   	 * through /sys/kernel/kexec_crash_size interface.
>>   	 */
>> -#ifdef CONFIG_KEXEC_CORE
>> -	if (crash_mem_map &&
>> -	    !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
>> -		if (crashk_res.end) {
>> -			__map_memblock(pgdp, crashk_res.start,
>> -				       crashk_res.end + 1,
>> -				       PAGE_KERNEL,
>> -				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>> -			memblock_clear_nomap(crashk_res.start,
>> -					     resource_size(&crashk_res));
>> -		}
>> -	}
>> -#endif
>>   }
>>   
>>   void mark_rodata_ro(void)
>> -- 
>> 1.8.3.1
>


* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-27 16:49         ` Mike Rapoport
  2022-06-28  1:50           ` Kefeng Wang
@ 2022-06-28  2:55           ` guanghui.fgh
  1 sibling, 0 replies; 15+ messages in thread
From: guanghui.fgh @ 2022-06-28  2:55 UTC (permalink / raw)
  To: Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm,
	Yao Hongbo

Thanks.

On 2022/6/28 0:49, Mike Rapoport wrote:
> On Mon, Jun 27, 2022 at 06:46:50PM +0800, guanghui.fgh wrote:
>> On 2022/6/27 17:49, Mike Rapoport wrote:
>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>>>
>>>>       On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>
>>>>           The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>>           (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>>           use non block/section mapping(for crashkernel requires to shrink the region
>>>>           in page granularity). But it will degrade performance when doing larging
>>>>           continuous mem access in kernel(memcpy/memmove, etc).
>>>>
>>>>           There are many changes and discussions:
>>>>           commit 031495635b46
>>>>           commit 1a8e1cef7603
>>>>           commit 8424ecdde7df
>>>>           commit 0a30c53573b0
>>>>           commit 2687275a5843
>>>>
>>>>       Please include oneline summary of the commit. (See section "Describe your
>>>>       changes" in Documentation/process/submitting-patches.rst)
>>>>
>>>> OK, I will add oneline summary in the git commit messages.
>>>>
>>>>           This patch changes mem_map to use block/section mapping with crashkernel.
>>>>           Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>>           mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>>           block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>>           for crashkernel mem.
>>>>
>>>>       This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>>       why is it Ok to change the way the memory is mapped with
>>>>       ZONE_DMA/ZONE_DMA32 enabled.
>>>>
>>>> In short:
>>>>
>>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>>> inspecting crashkernel
>>>> 2. Reserve crashkernel mem as same as previous doing
>>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>>> With this method, there are block/section mapping as more as possible.
>>>
>>> This does not answer the question why changing the way the memory is mapped
>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>
>> 1.Quoted messages from arch/arm64/mm/init.c
>>
>> "Memory reservation for crash kernel either done early or deferred
>> depending on DMA memory zones configs (ZONE_DMA) --
>>
>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>> here instead of max_zone_phys().  This lets early reservation of
>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>> Reserving memory early for crash kernel allows linear creation of block
>> mappings (greater than page-granularity) for all the memory bank rangs.
>> In this scheme a comparatively quicker boot is observed.
>>
>> If ZONE_DMA configs are defined, crash kernel memory reservation
>> is delayed until DMA zone memory range size initialization performed in
>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>> memory range to avoid overlap allocation.  So crash kernel memory boundaries
>> are not known when mapping all bank memory ranges, which otherwise means not
>> possible to exclude crash kernel range from creating block mappings so
>> page-granularity mappings are created for the entire memory range."
>>
>> Namely, the init order: memblock init--->linear mem mapping(4k mapping for
>> crashkernel, requirinig page-granularity changing))--->zone dma
>> limit--->reserve crashkernel.
>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k
>> mapping.
>>
>> 2.As mentioned above, when linear mem use 4k mapping simply, there is high
>> dtlb miss(degrade performance).
>> This patch use block/section mapping as far as possible with performance
>> improvement.
>>
>> 3.This patch reserve crashkernel as same as the history(ZONE DMA &
>> crashkernel reserving order), and only change the linear mem mapping to
>> block/section mapping.
> 
> Thank you for the explanation.
> 
> So if I understand correctly, you suggest to reserve the crash kernel
> memory late regardless of ZONE_DMA/32 configuration and then map that memory
> using page level mappings after the memory is reserved.
> 
> This makes sense, but I think the patch has a few shortcomings than need to
> be addressed.
>
Yes, that's right.
>> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
>> ---
>>   arch/arm64/include/asm/mmu.h |   1 +
>>   arch/arm64/mm/init.c         |   8 +-
>>   arch/arm64/mm/mmu.c          | 274 +++++++++++++++++++++++++++++++++++++++----
>>   3 files changed, 253 insertions(+), 30 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>> index 48f8466..df113cc 100644
>> --- a/arch/arm64/include/asm/mmu.h
>> +++ b/arch/arm64/include/asm/mmu.h
>> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
>>   extern void arm64_memblock_init(void);
>>   extern void paging_init(void);
>>   extern void bootmem_init(void);
>> +extern void mapping_crashkernel(void);
> 
> I think map_crashkernel() would be a better name.
OK.
> 
>>   extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
>>   extern void init_mem_pgprot(void);
>>   extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>> index 339ee84..0e7540b 100644
>> --- a/arch/arm64/mm/init.c
>> +++ b/arch/arm64/mm/init.c
>> @@ -388,10 +388,6 @@ void __init arm64_memblock_init(void)
>>   	}
>>   
>>   	early_init_fdt_scan_reserved_mem();
>> -
>> -	if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
>> -		reserve_crashkernel();
>> -
>>   	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
>>   }
>>   
>> @@ -438,8 +434,8 @@ void __init bootmem_init(void)
>>   	 * request_standard_resources() depends on crashkernel's memory being
>>   	 * reserved, so do it here.
>>   	 */
>> -	if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
>> -		reserve_crashkernel();
>> +	reserve_crashkernel();
>> +	mapping_crashkernel();
> 
> This can be called from reserve_crashkernel() directly.
OK
>   
>>   	memblock_dump_all();
>>   }
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 626ec32..0672afd 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -498,6 +498,256 @@ static int __init enable_crash_mem_map(char *arg)
>>   }
>>   early_param("crashkernel", enable_crash_mem_map);
>>   
>> +#ifdef CONFIG_KEXEC_CORE
>> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
>> +{
>> +	phys_addr_t phys;
>> +	void *ptr;
>> +
>> +	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
>> +					 MEMBLOCK_ALLOC_NOLEAKTRACE);
>> +	if (!phys)
>> +		panic("Failed to allocate page table page\n");
>> +
>> +	ptr = (void *)__phys_to_virt(phys);
>> +	memset(ptr, 0, PAGE_SIZE);
>> +	return phys;
>> +}
>> +
>> +static void init_crashkernel_pte(pmd_t *pmdp, unsigned long addr,
>> +				 unsigned long end,
>> +				 phys_addr_t phys, pgprot_t prot)
>> +{
>> +	pte_t *ptep;
>> +	ptep = pte_offset_kernel(pmdp, addr);
>> +	do {
>> +		set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
>> +		phys += PAGE_SIZE;
>> +	} while (ptep++, addr += PAGE_SIZE, addr != end);
>> +}
>> +
>> +static void alloc_crashkernel_cont_pte(pmd_t *pmdp, unsigned long addr,
>> +				       unsigned long end, phys_addr_t phys,
>> +				       pgprot_t prot,
>> +				       phys_addr_t (*pgtable_alloc)(int),
>> +				       int flags)
>> +{
>> +	unsigned long next;
>> +	pmd_t pmd = READ_ONCE(*pmdp);
>> +
>> +	BUG_ON(pmd_sect(pmd));
>> +	if (pmd_none(pmd)) {
>> +		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> +		phys_addr_t pte_phys;
>> +
>> +		if (flags & NO_EXEC_MAPPINGS)
>> +			pmdval |= PMD_TABLE_PXN;
>> +		BUG_ON(!pgtable_alloc);
>> +		pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +		__pmd_populate(pmdp, pte_phys, pmdval);
>> +		pmd = READ_ONCE(*pmdp);
>> +	}
>> +	BUG_ON(pmd_bad(pmd));
>> +
>> +	do {
>> +		pgprot_t __prot = prot;
>> +		next = pte_cont_addr_end(addr, end);
>> +		init_crashkernel_pte(pmdp, addr, next, phys, __prot);
>> +		phys += next - addr;
>> +	} while (addr = next, addr != end);
>> +}
>> +
>> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>> +				 unsigned long end, phys_addr_t phys,
>> +				 pgprot_t prot,
>> +				 phys_addr_t (*pgtable_alloc)(int), int flags)
>> +{
>> +	phys_addr_t map_offset;
>> +	unsigned long next;
>> +	pmd_t *pmdp;
>> +	pmdval_t pmdval;
>> +
>> +	pmdp = pmd_offset(pudp, addr);
>> +	do {
>> +		next = pmd_addr_end(addr, end);
>> +		if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>> +			phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>> +			pmd_clear(pmdp);
>> +			pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>> +			if (flags & NO_EXEC_MAPPINGS)
>> +				pmdval |= PMD_TABLE_PXN;
>> +			__pmd_populate(pmdp, pte_phys, pmdval);
>> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> +
>> +			map_offset = addr - (addr & PMD_MASK);
>> +			if (map_offset)
>> +			    alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>> +						phys - map_offset, prot,
>> +						pgtable_alloc, flags);
>> +
>> +			if (next < (addr & PMD_MASK) + PMD_SIZE)
>> +			    alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>> +						PUD_SIZE, next - addr + phys,
>> +						prot, pgtable_alloc, flags);
>> +		}
>> +		alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>> +					   pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (pmdp++, addr = next, addr != end);
>> +}
>> +
>> +static void alloc_crashkernel_cont_pmd(pud_t *pudp, unsigned long addr,
>> +				       unsigned long end, phys_addr_t phys,
>> +				       pgprot_t prot,
>> +				       phys_addr_t (*pgtable_alloc)(int),
>> +				       int flags)
>> +{
>> +	unsigned long next;
>> +	pud_t pud = READ_ONCE(*pudp);
>> +
>> +	/*
>> +	 * Check for initial section mappings in the pgd/pud.
>> +	 */
>> +	BUG_ON(pud_sect(pud));
>> +	if (pud_none(pud)) {
>> +		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
>> +		phys_addr_t pmd_phys;
>> +
>> +		if (flags & NO_EXEC_MAPPINGS)
>> +			pudval |= PUD_TABLE_PXN;
>> +		BUG_ON(!pgtable_alloc);
>> +		pmd_phys = pgtable_alloc(PMD_SHIFT);
>> +		__pud_populate(pudp, pmd_phys, pudval);
>> +		pud = READ_ONCE(*pudp);
>> +	}
>> +	BUG_ON(pud_bad(pud));
>> +
>> +	do {
>> +		pgprot_t __prot = prot;
>> +		next = pmd_cont_addr_end(addr, end);
>> +		init_crashkernel_pmd(pudp, addr, next, phys, __prot,
>> +				     pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (addr = next, addr != end);
>> +}
>> +
>> +static void alloc_crashkernel_pud(pgd_t *pgdp, unsigned long addr,
>> +				  unsigned long end, phys_addr_t phys,
>> +				  pgprot_t prot,
>> +				  phys_addr_t (*pgtable_alloc)(int),
>> +				  int flags)
>> +{
>> +	phys_addr_t map_offset;
>> +	unsigned long next;
>> +	pud_t *pudp;
>> +	p4d_t *p4dp = p4d_offset(pgdp, addr);
>> +	p4d_t p4d = READ_ONCE(*p4dp);
>> +	pudval_t pudval;
>> +
>> +	if (p4d_none(p4d)) {
>> +		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
>> +		phys_addr_t pud_phys;
>> +
>> +		if (flags & NO_EXEC_MAPPINGS)
>> +			p4dval |= P4D_TABLE_PXN;
>> +		BUG_ON(!pgtable_alloc);
>> +		pud_phys = pgtable_alloc(PUD_SHIFT);
>> +		__p4d_populate(p4dp, pud_phys, p4dval);
>> +		p4d = READ_ONCE(*p4dp);
>> +	}
>> +	BUG_ON(p4d_bad(p4d));
>> +
>> +	/*
>> +	 * No need for locking during early boot. And it doesn't work as
>> +	 * expected with KASLR enabled.
>> +	 */
>> +	if (system_state != SYSTEM_BOOTING)
>> +		mutex_lock(&fixmap_lock);
>> +	pudp = pud_offset(p4dp, addr);
>> +	do {
>> +		next = pud_addr_end(addr, end);
>> +		if (!pud_none(*pudp) && pud_sect(*pudp)) {
>> +			phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
>> +			pud_clear(pudp);
>> +
>> +			pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
>> +			if (flags & NO_EXEC_MAPPINGS)
>> +				pudval |= PUD_TABLE_PXN;
>> +			__pud_populate(pudp, pmd_phys, pudval);
>> +			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>> +
>> +			map_offset = addr - (addr & PUD_MASK);
>> +			if (map_offset)
>> +			    alloc_init_cont_pmd(pudp, addr & PUD_MASK, addr,
>> +						phys - map_offset, prot,
>> +						pgtable_alloc, flags);
>> +
>> +			if (next < (addr & PUD_MASK) + PUD_SIZE)
>> +			    alloc_init_cont_pmd(pudp, next, (addr & PUD_MASK) +
>> +						PUD_SIZE, next - addr + phys,
>> +						prot, pgtable_alloc, flags);
>> +		}
>> +		alloc_crashkernel_cont_pmd(pudp, addr, next, phys, prot,
>> +					   pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (pudp++, addr = next, addr != end);
>> +
>> +	if (system_state != SYSTEM_BOOTING)
>> +		mutex_unlock(&fixmap_lock);
>> +}
>> +
>> +static void __create_crashkernel_mapping(pgd_t *pgdir, phys_addr_t phys,
>> +					 unsigned long virt, phys_addr_t size,
>> +					 pgprot_t prot,
>> +					 phys_addr_t (*pgtable_alloc)(int),
>> +					 int flags)
>> +{
>> +	unsigned long addr, end, next;
>> +	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
>> +
>> +	/*
>> +	 * If the virtual and physical address don't have the same offset
>> +	 * within a page, we cannot map the region as the caller expects.
>> +	 */
>> +	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
>> +		return;
>> +
>> +	phys &= PAGE_MASK;
>> +	addr = virt & PAGE_MASK;
>> +	end = PAGE_ALIGN(virt + size);
>> +
>> +	do {
>> +		next = pgd_addr_end(addr, end);
>> +		alloc_crashkernel_pud(pgdp, addr, next, phys, prot,
>> +				      pgtable_alloc, flags);
>> +		phys += next - addr;
>> +	} while (pgdp++, addr = next, addr != end);
>> +}
> 
> __create_crashkernel_mapping() and the helpers it uses resemble very much
> __create_pgd_mapping() and it's helpers. I think a cleaner way would be to
> clear section mappings and than call __create_pgd_mapping() for crash
> kernel memory with block/section mappings disabled.
> 
>> +static void __init map_crashkernel(pgd_t *pgdp, phys_addr_t start,
>> +				   phys_addr_t end, pgprot_t prot, int flags)
>> +{
>> +	__create_crashkernel_mapping(pgdp, start, __phys_to_virt(start),
>> +				     end - start, prot,
>> +				     early_crashkernel_pgtable_alloc, flags);
>> +}
> 
> No need for this, you can call __create_crashkernel_mapping() directly from
> mapping_crashkernel().
> 
>> +
>> +void __init mapping_crashkernel(void)
>> +{
>> +	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
>> +	    return;
>> +
>> +	if (!crash_mem_map || !crashk_res.end)
>> +	    return;
>> +
>> +	map_crashkernel(swapper_pg_dir, crashk_res.start,
>> +			crashk_res.end + 1, PAGE_KERNEL,
>> +			NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>> +}
>> +#else
>> +void __init mapping_crashkernel(void) {}
>> +#endif
>> +
>>   static void __init map_mem(pgd_t *pgdp)
>>   {
>>   	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
>> @@ -527,17 +777,6 @@ static void __init map_mem(pgd_t *pgdp)
>>   	 */
>>   	memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>>   
>> -#ifdef CONFIG_KEXEC_CORE
>> -	if (crash_mem_map) {
>> -		if (IS_ENABLED(CONFIG_ZONE_DMA) ||
>> -		    IS_ENABLED(CONFIG_ZONE_DMA32))
>> -			flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>> -		else if (crashk_res.end)
>> -			memblock_mark_nomap(crashk_res.start,
>> -			    resource_size(&crashk_res));
>> -	}
>> -#endif
>> -
>>   	/* map all the memory banks */
>>   	for_each_mem_range(i, &start, &end) {
>>   		if (start >= end)
>> @@ -570,19 +809,6 @@ static void __init map_mem(pgd_t *pgdp)
>>   	 * in page granularity and put back unused memory to buddy system
>>   	 * through /sys/kernel/kexec_crash_size interface.
>>   	 */
>> -#ifdef CONFIG_KEXEC_CORE
>> -	if (crash_mem_map &&
>> -	    !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
>> -		if (crashk_res.end) {
>> -			__map_memblock(pgdp, crashk_res.start,
>> -				       crashk_res.end + 1,
>> -				       PAGE_KERNEL,
>> -				       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>> -			memblock_clear_nomap(crashk_res.start,
>> -					     resource_size(&crashk_res));
>> -		}
>> -	}
>> -#endif
>>   }
>>   
>>   void mark_rodata_ro(void)
>> -- 
>> 1.8.3.1
> 
> 


* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-28  1:34             ` Leizhen (ThunderTown)
@ 2022-06-28  3:06               ` guanghui.fgh
  2022-06-28  6:21                 ` Leizhen (ThunderTown)
  2022-06-28  7:52                 ` guanghui.fgh
  0 siblings, 2 replies; 15+ messages in thread
From: guanghui.fgh @ 2022-06-28  3:06 UTC (permalink / raw)
  To: Leizhen (ThunderTown), Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, anshuman.khandual, linux-arm-kernel, linux-kernel,
	geert+renesas, ardb, linux-mm, bhe, Yao Hongbo

Thanks.

On 2022/6/28 9:34, Leizhen (ThunderTown) wrote:
> 
> 
> On 2022/6/27 20:25, guanghui.fgh wrote:
>> Thanks.
>>
>> On 2022/6/27 20:06, Leizhen (ThunderTown) wrote:
>>>
>>>
>>> On 2022/6/27 18:46, guanghui.fgh wrote:
>>>>
>>>>
>>>> On 2022/6/27 17:49, Mike Rapoport wrote:
>>>>> Please don't post HTML.
>>>>>
>>>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>>>> Thanks.
>>>>>>
>>>>>> On 2022/6/27 14:34, Mike Rapoport wrote:
>>>>>>
>>>>>>        On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>>>
>>>>>>            The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>>>>            (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>>>>            use non block/section mapping(for crashkernel requires to shrink the region
>>>>>>            in page granularity). But it will degrade performance when doing larging
>>>>>>            continuous mem access in kernel(memcpy/memmove, etc).
>>>>>>
>>>>>>            There are many changes and discussions:
>>>>>>            commit 031495635b46
>>>>>>            commit 1a8e1cef7603
>>>>>>            commit 8424ecdde7df
>>>>>>            commit 0a30c53573b0
>>>>>>            commit 2687275a5843
>>>>>>
>>>>>>        Please include oneline summary of the commit. (See section "Describe your
>>>>>>        changes" in Documentation/process/submitting-patches.rst)
>>>>>>
>>>>>> OK, I will add oneline summary in the git commit messages.
>>>>>>
>>>>>>            This patch changes mem_map to use block/section mapping with crashkernel.
>>>>>>            Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>>>>            mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>>>>            block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>>>>            for crashkernel mem.
>>>>>>
>>>>>>        This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>>>>        why is it Ok to change the way the memory is mapped with
>>>>>>        ZONE_DMA/ZONE_DMA32 enabled.
>>>>>>
>>>>>> In short:
>>>>>>
>>>>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>>>>> inspecting crashkernel
>>>>>> 2. Reserve crashkernel mem as same as previous doing
>>>>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>>>>> With this method, there are block/section mapping as more as possible.
>>>>>
>>>>> This does not answer the question why changing the way the memory is mapped
>>>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>>>
>>>> 1.Quoted messages from arch/arm64/mm/init.c
>>>>
>>>> "Memory reservation for crash kernel either done early or deferred
>>>> depending on DMA memory zones configs (ZONE_DMA) --
>>>>
>>>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>>>> here instead of max_zone_phys().  This lets early reservation of
>>>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>>>> Reserving memory early for crash kernel allows linear creation of block
>>>> mappings (greater than page-granularity) for all the memory bank rangs.
>>>> In this scheme a comparatively quicker boot is observed.
>>>>
>>>> If ZONE_DMA configs are defined, crash kernel memory reservation
>>>> is delayed until DMA zone memory range size initialization performed in
>>>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>>>> memory range to avoid overlap allocation.  So crash kernel memory boundaries are not known when mapping all bank memory ranges, which otherwise means not possible to exclude crash kernel range from creating block mappings so page-granularity mappings are created for the entire memory range."
>>>>
>>>> Namely, the init order: memblock init--->linear mem mapping(4k mapping for crashkernel, requirinig page-granularity changing))--->zone dma limit--->reserve crashkernel.
>>>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k mapping.
>>>>
>>>> 2.As mentioned above, when linear mem use 4k mapping simply, there is high dtlb miss(degrade performance).
>>>> This patch use block/section mapping as far as possible with performance improvement.
>>>>
>>>> 3.This patch reserve crashkernel as same as the history(ZONE DMA & crashkernel reserving order), and only change the linear mem mapping to block/section mapping.
>>>> .
>>>>
>>>
>>> I think Mike Rapoport's probably asking you to answer whether you've
>>> taken into account such as BBM. For example, the following code:
>>> we should prepare the next level pgtable first, then change 2M block
>>> mapping to 4K page mapping, and flush TLB at the end.
>>>> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>>> +                 unsigned long end, phys_addr_t phys,
>>> +                 pgprot_t prot,
>>> +                 phys_addr_t (*pgtable_alloc)(int), int flags)
>>> +{
>>> +    phys_addr_t map_offset;
>>> +    unsigned long next;
>>> +    pmd_t *pmdp;
>>> +    pmdval_t pmdval;
>>> +
>>> +    pmdp = pmd_offset(pudp, addr);
>>> +    do {
>>> +        next = pmd_addr_end(addr, end);
>>> +        if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>>> +            phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>>> +            pmd_clear(pmdp);
>>> +            pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>>> +            if (flags & NO_EXEC_MAPPINGS)
>>> +                pmdval |= PMD_TABLE_PXN;
>>> +            __pmd_populate(pmdp, pte_phys, pmdval);
>>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>
>>> The pgtable is empty now. However, memory other than crashkernel may be being accessed.
>> 1.When reserving crashkernel and remapping linear mem mapping, there is only one boot cpu running. There is no other cpu/thread running at the same time.
> 
> So, put this in the code comment?
OK.
> 
> If scalability matters and unpredictable changes occur in the future (for example,
> other modules also needing this mapping function), it would be better to handle
> BBM now and make this public.
OK, could you give me some advice?
> 
> 
>>
>> 2. When clearing a block/section mapping, I flush the TLB with flush_tlb_kernel_range. Afterwards the 4k mapping is rebuilt (I think no TLB flush is needed for that).
> 
> 
>>
>>>
>>> +
>>> +            map_offset = addr - (addr & PMD_MASK);
>>> +            if (map_offset)
>>> +                alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>>> +                        phys - map_offset, prot,
>>> +                        pgtable_alloc, flags);
>>> +
>>> +            if (next < (addr & PMD_MASK) + PMD_SIZE)
>>> +                alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>>> +                        PUD_SIZE, next - addr + phys,
>>> +                        prot, pgtable_alloc, flags);
> 
> Here and in alloc_crashkernel_pud(), the raw flags should be used; they may not contain (NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS).
Yes. The memory outside the crashkernel should use block/section mappings
as far as possible, including the left and right margins.
But when I tested that on HiSilicon Kunpeng 920-6426 (mapping the
left/right margins without the NO_BLOCK_MAPPINGS/NO_CONT_MAPPINGS flags),
I got a performance degradation.
It's strange; could you give some advice? Maybe it works well on other
arm platforms, just not on HiSilicon Kunpeng 920-6426.
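
For illustration, a minimal sketch of Leizhen's suggestion (this is not the
posted patch, and the margin_flags name is made up here): keep
NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS only for the crashkernel range itself
and remap the margins inside the split block with the caller's raw flags,
e.g. in init_crashkernel_pmd():

	/*
	 * Hypothetical variant of the margin handling: the left/right
	 * margins reuse the caller's original flags, so contiguous PTE
	 * mappings stay allowed there; the crashkernel range itself is
	 * still mapped with NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS.
	 */
	int margin_flags = flags & ~(NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);

	map_offset = addr - (addr & PMD_MASK);
	if (map_offset)
		alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
				    phys - map_offset, prot,
				    pgtable_alloc, margin_flags);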
> 
>>> +        }
>>> +        alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>>> +                       pgtable_alloc, flags);
>>> +        phys += next - addr;
>>> +    } while (pmdp++, addr = next, addr != end);
>>> +}
>>>
>> .
>>
> 

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-28  1:50           ` Kefeng Wang
@ 2022-06-28  3:22             ` guanghui.fgh
  0 siblings, 0 replies; 15+ messages in thread
From: guanghui.fgh @ 2022-06-28  3:22 UTC (permalink / raw)
  To: Kefeng Wang, Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, thunder.leizhen, anshuman.khandual,
	linux-arm-kernel, linux-kernel, geert+renesas, ardb, linux-mm,
	Yao Hongbo

Thanks.

在 2022/6/28 9:50, Kefeng Wang 写道:
> 
> On 2022/6/28 0:49, Mike Rapoport wrote:
>> On Mon, Jun 27, 2022 at 06:46:50PM +0800, guanghui.fgh wrote:
>>> 在 2022/6/27 17:49, Mike Rapoport 写道:
>>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>>> 在 2022/6/27 14:34, Mike Rapoport 写道:
>>>>>
>>>>>       On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>>
>>>>>           The arm64 can build 2M/1G block/sectiion mapping. When 
>>>>> using DMA/DMA32 zone
>>>>>           (enable crashkernel, disable rodata full, disable 
>>>>> kfence), the mem_map will
>>>>>           use non block/section mapping(for crashkernel requires to 
>>>>> shrink the region
>>>>>           in page granularity). But it will degrade performance 
>>>>> when doing larging
>>>>>           continuous mem access in kernel(memcpy/memmove, etc).
>>>>>
>>>>>           There are many changes and discussions:
>>>>>           commit 031495635b46
>>>>>           commit 1a8e1cef7603
>>>>>           commit 8424ecdde7df
>>>>>           commit 0a30c53573b0
>>>>>           commit 2687275a5843
>>>>>
>>>>>       Please include oneline summary of the commit. (See section 
>>>>> "Describe your
>>>>>       changes" in Documentation/process/submitting-patches.rst)
>>>>>
>>>>> OK, I will add oneline summary in the git commit messages.
>>>>>
>>>>>           This patch changes mem_map to use block/section mapping 
>>>>> with crashkernel.
>>>>>           Firstly, do block/section mapping(normally 2M or 1G) for 
>>>>> all avail mem at
>>>>>           mem_map, reserve crashkernel memory. And then walking 
>>>>> pagetable to split
>>>>>           block/section mapping to non block/section 
>>>>> mapping(normally 4K) [[[only]]]
>>>>>           for crashkernel mem.
>>>>>
>>>>>       This already happens when ZONE_DMA/ZONE_DMA32 are disabled. 
>>>>> Please explain
>>>>>       why is it Ok to change the way the memory is mapped with
>>>>>       ZONE_DMA/ZONE_DMA32 enabled.
>>>>>
>>>>> In short:
>>>>>
>>>>> 1.building all avail mem with block/section mapping(normally 
>>>>> 1G/2M) without
>>>>> inspecting crashkernel
>>>>> 2. Reserve crashkernel mem as same as previous doing
>>>>> 3. only change the crashkernle mem mapping to normal 
>>>>> mapping(normally 4k).
>>>>> With this method, there are block/section mapping as more as possible.
>>>> This does not answer the question why changing the way the memory is 
>>>> mapped
>>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>>
>>> 1.Quoted messages from arch/arm64/mm/init.c
>>>
>>> "Memory reservation for crash kernel either done early or deferred
>>> depending on DMA memory zones configs (ZONE_DMA) --
>>>
>>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>>> here instead of max_zone_phys().  This lets early reservation of
>>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>>> Reserving memory early for crash kernel allows linear creation of block
>>> mappings (greater than page-granularity) for all the memory bank rangs.
>>> In this scheme a comparatively quicker boot is observed.
>>>
>>> If ZONE_DMA configs are defined, crash kernel memory reservation
>>> is delayed until DMA zone memory range size initialization performed in
>>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>>> memory range to avoid overlap allocation.  So crash kernel memory 
>>> boundaries
>>> are not known when mapping all bank memory ranges, which otherwise 
>>> means not
>>> possible to exclude crash kernel range from creating block mappings so
>>> page-granularity mappings are created for the entire memory range."
>>>
>>> Namely, the init order: memblock init--->linear mem mapping(4k 
>>> mapping for
>>> crashkernel, requirinig page-granularity changing))--->zone dma
>>> limit--->reserve crashkernel.
>>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k
>>> mapping.
>>>
>>> 2.As mentioned above, when linear mem use 4k mapping simply, there is 
>>> high
>>> dtlb miss(degrade performance).
>>> This patch use block/section mapping as far as possible with performance
>>> improvement.
>>>
>>> 3.This patch reserve crashkernel as same as the history(ZONE DMA &
>>> crashkernel reserving order), and only change the linear mem mapping to
>>> block/section mapping.
>> Thank you for the explanation.
>>
>> So if I understand correctly, you suggest reserving the crash kernel
>> memory late regardless of the ZONE_DMA/32 configuration and then mapping
>> that memory with page-level mappings after the memory is reserved.
>>
>> This makes sense, but I think the patch has a few shortcomings that need
>> to be addressed.
>>
>>> Signed-off-by: Guanghui Feng <guanghuifeng@linux.alibaba.com>
>>> ---
>>>   arch/arm64/include/asm/mmu.h |   1 +
>>>   arch/arm64/mm/init.c         |   8 +-
>>>   arch/arm64/mm/mmu.c          | 274 
>>> +++++++++++++++++++++++++++++++++++++++----
>>>   3 files changed, 253 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>>> index 48f8466..df113cc 100644
>>> --- a/arch/arm64/include/asm/mmu.h
>>> +++ b/arch/arm64/include/asm/mmu.h
>>> @@ -63,6 +63,7 @@ static inline bool arm64_kernel_unmapped_at_el0(void)
>>>   extern void arm64_memblock_init(void);
>>>   extern void paging_init(void);
>>>   extern void bootmem_init(void);
>>> +extern void mapping_crashkernel(void);
>> I think map_crashkernel() would be a better name.
> 
> Or a more generic name (and likewise for the other helpers); it's used
> for crashkernel now, but there may be more use cases later.
> 
> 
>>
>>>   extern void __iomem *early_io_map(phys_addr_t phys, unsigned long 
>>> virt);
>>>   extern void init_mem_pgprot(void);
>>>   extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>>> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
>>> index 339ee84..0e7540b 100644
>>> --- a/arch/arm64/mm/init.c
>>> +++ b/arch/arm64/mm/init.c
>>> @@ -388,10 +388,6 @@ void __init arm64_memblock_init(void)
>>>       }
>>>       early_init_fdt_scan_reserved_mem();
>>> -
>>> -    if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
>>> -        reserve_crashkernel();
>>> -
>>>       high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
>>>   }
>>> @@ -438,8 +434,8 @@ void __init bootmem_init(void)
>>>        * request_standard_resources() depends on crashkernel's memory 
>>> being
>>>        * reserved, so do it here.
>>>        */
>>> -    if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
>>> -        reserve_crashkernel();
>>> +    reserve_crashkernel();
>>> +    mapping_crashkernel();
>> This can be called from reserve_crashkernel() directly.
>>>       memblock_dump_all();
>>>   }
>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>> index 626ec32..0672afd 100644
>>> --- a/arch/arm64/mm/mmu.c
>>> +++ b/arch/arm64/mm/mmu.c
>>> @@ -498,6 +498,256 @@ static int __init enable_crash_mem_map(char *arg)
>>>   }
>>>   early_param("crashkernel", enable_crash_mem_map);
>>> +#ifdef CONFIG_KEXEC_CORE
>>> +static phys_addr_t __init early_crashkernel_pgtable_alloc(int shift)
>>> +{
>>> +    phys_addr_t phys;
>>> +    void *ptr;
>>> +
>>> +    phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
>>> +                     MEMBLOCK_ALLOC_NOLEAKTRACE);
>>> +    if (!phys)
>>> +        panic("Failed to allocate page table page\n");
>>> +
>>> +    ptr = (void *)__phys_to_virt(phys);
>>> +    memset(ptr, 0, PAGE_SIZE);
> Add a dsb() to ensure the page is visible to the page table walker?
I followed what alloc_init_cont_pte does; there may actually be no need for a dsb here.
>>> +    return phys;
>>> +}
>>> +
>>> +static void init_crashkernel_pte(pmd_t *pmdp, unsigned long addr,
>>> +                 unsigned long end,
>>> +                 phys_addr_t phys, pgprot_t prot)
>>> +{
>>> +    pte_t *ptep;
>>> +    ptep = pte_offset_kernel(pmdp, addr);
>>> +    do {
>>> +        set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));
>>> +        phys += PAGE_SIZE;
>>> +    } while (ptep++, addr += PAGE_SIZE, addr != end);
>>> +}
>>> +
>>> +static void alloc_crashkernel_cont_pte(pmd_t *pmdp, unsigned long addr,
>>> +                       unsigned long end, phys_addr_t phys,
>>> +                       pgprot_t prot,
>>> +                       phys_addr_t (*pgtable_alloc)(int),
>>> +                       int flags)
>>> +{
>>> +    unsigned long next;
>>> +    pmd_t pmd = READ_ONCE(*pmdp);
>>> +
>>> +    BUG_ON(pmd_sect(pmd));
>>> +    if (pmd_none(pmd)) {
>>> +        pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>>> +        phys_addr_t pte_phys;
>>> +
>>> +        if (flags & NO_EXEC_MAPPINGS)
>>> +            pmdval |= PMD_TABLE_PXN;
>>> +        BUG_ON(!pgtable_alloc);
>>> +        pte_phys = pgtable_alloc(PAGE_SHIFT);
>>> +        __pmd_populate(pmdp, pte_phys, pmdval);
>>> +        pmd = READ_ONCE(*pmdp);
>>> +    }
>>> +    BUG_ON(pmd_bad(pmd));
>>> +
>>> +    do {
>>> +        pgprot_t __prot = prot;
>>> +        next = pte_cont_addr_end(addr, end);
>>> +        init_crashkernel_pte(pmdp, addr, next, phys, __prot);
>>> +        phys += next - addr;
>>> +    } while (addr = next, addr != end);
>>> +}
>>> +
>>> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>>> +                 unsigned long end, phys_addr_t phys,
>>> +                 pgprot_t prot,
>>> +                 phys_addr_t (*pgtable_alloc)(int), int flags)
>>> +{
>>> +    phys_addr_t map_offset;
>>> +    unsigned long next;
>>> +    pmd_t *pmdp;
>>> +    pmdval_t pmdval;
>>> +
>>> +    pmdp = pmd_offset(pudp, addr);
>>> +    do {
>>> +        next = pmd_addr_end(addr, end);
>>> +        if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>>> +            phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>>> +            pmd_clear(pmdp);
>>> +            pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>>> +            if (flags & NO_EXEC_MAPPINGS)
>>> +                pmdval |= PMD_TABLE_PXN;
>>> +            __pmd_populate(pmdp, pte_phys, pmdval);
>>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>> +
>>> +            map_offset = addr - (addr & PMD_MASK);
>>> +            if (map_offset)
>>> +                alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>>> +                        phys - map_offset, prot,
>>> +                        pgtable_alloc, flags);
>>> +
>>> +            if (next < (addr & PMD_MASK) + PMD_SIZE)
>>> +                alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>>> +                        PUD_SIZE, next - addr + phys,
>>> +                        prot, pgtable_alloc, flags);
>>> +        }
>>> +        alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>>> +                       pgtable_alloc, flags);
>>> +        phys += next - addr;
>>> +    } while (pmdp++, addr = next, addr != end);
>>> +}
>>> +
>>> +static void alloc_crashkernel_cont_pmd(pud_t *pudp, unsigned long addr,
>>> +                       unsigned long end, phys_addr_t phys,
>>> +                       pgprot_t prot,
>>> +                       phys_addr_t (*pgtable_alloc)(int),
>>> +                       int flags)
>>> +{
>>> +    unsigned long next;
>>> +    pud_t pud = READ_ONCE(*pudp);
>>> +
>>> +    /*
>>> +     * Check for initial section mappings in the pgd/pud.
>>> +     */
>>> +    BUG_ON(pud_sect(pud));
>>> +    if (pud_none(pud)) {
>>> +        pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
>>> +        phys_addr_t pmd_phys;
>>> +
>>> +        if (flags & NO_EXEC_MAPPINGS)
>>> +            pudval |= PUD_TABLE_PXN;
>>> +        BUG_ON(!pgtable_alloc);
>>> +        pmd_phys = pgtable_alloc(PMD_SHIFT);
>>> +        __pud_populate(pudp, pmd_phys, pudval);
>>> +        pud = READ_ONCE(*pudp);
>>> +    }
>>> +    BUG_ON(pud_bad(pud));
>>> +
>>> +    do {
>>> +        pgprot_t __prot = prot;
>>> +        next = pmd_cont_addr_end(addr, end);
>>> +        init_crashkernel_pmd(pudp, addr, next, phys, __prot,
>>> +                     pgtable_alloc, flags);
>>> +        phys += next - addr;
>>> +    } while (addr = next, addr != end);
>>> +}
>>> +
>>> +static void alloc_crashkernel_pud(pgd_t *pgdp, unsigned long addr,
>>> +                  unsigned long end, phys_addr_t phys,
>>> +                  pgprot_t prot,
>>> +                  phys_addr_t (*pgtable_alloc)(int),
>>> +                  int flags)
>>> +{
>>> +    phys_addr_t map_offset;
>>> +    unsigned long next;
>>> +    pud_t *pudp;
>>> +    p4d_t *p4dp = p4d_offset(pgdp, addr);
>>> +    p4d_t p4d = READ_ONCE(*p4dp);
>>> +    pudval_t pudval;
>>> +
>>> +    if (p4d_none(p4d)) {
>>> +        p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN;
>>> +        phys_addr_t pud_phys;
>>> +
>>> +        if (flags & NO_EXEC_MAPPINGS)
>>> +            p4dval |= P4D_TABLE_PXN;
>>> +        BUG_ON(!pgtable_alloc);
>>> +        pud_phys = pgtable_alloc(PUD_SHIFT);
>>> +        __p4d_populate(p4dp, pud_phys, p4dval);
>>> +        p4d = READ_ONCE(*p4dp);
>>> +    }
>>> +    BUG_ON(p4d_bad(p4d));
>>> +
>>> +    /*
>>> +     * No need for locking during early boot. And it doesn't work as
>>> +     * expected with KASLR enabled.
>>> +     */
>>> +    if (system_state != SYSTEM_BOOTING)
>>> +        mutex_lock(&fixmap_lock);
> It seems there is no need to use fixmap_lock here.
Yes, that's right.
>>> +    pudp = pud_offset(p4dp, addr);
>>> +    do {
>>> +        next = pud_addr_end(addr, end);
>>> +        if (!pud_none(*pudp) && pud_sect(*pudp)) {
>>> +            phys_addr_t pmd_phys = pgtable_alloc(PMD_SHIFT);
>>> +            pud_clear(pudp);
>>> +
>>> +            pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN;
>>> +            if (flags & NO_EXEC_MAPPINGS)
>>> +                pudval |= PUD_TABLE_PXN;
>>> +            __pud_populate(pudp, pmd_phys, pudval);
>>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>> +
>>> +            map_offset = addr - (addr & PUD_MASK);
>>> +            if (map_offset)
>>> +                alloc_init_cont_pmd(pudp, addr & PUD_MASK, addr,
>>> +                        phys - map_offset, prot,
>>> +                        pgtable_alloc, flags);
>>> +
>>> +            if (next < (addr & PUD_MASK) + PUD_SIZE)
>>> +                alloc_init_cont_pmd(pudp, next, (addr & PUD_MASK) +
>>> +                        PUD_SIZE, next - addr + phys,
>>> +                        prot, pgtable_alloc, flags);
>>> +        }
>>> +        alloc_crashkernel_cont_pmd(pudp, addr, next, phys, prot,
>>> +                       pgtable_alloc, flags);
>>> +        phys += next - addr;
>>> +    } while (pudp++, addr = next, addr != end);
>>> +
>>> +    if (system_state != SYSTEM_BOOTING)
>>> +        mutex_unlock(&fixmap_lock);
>>> +}
>>> +
>>> +static void __create_crashkernel_mapping(pgd_t *pgdir, phys_addr_t 
>>> phys,
>>> +                     unsigned long virt, phys_addr_t size,
>>> +                     pgprot_t prot,
>>> +                     phys_addr_t (*pgtable_alloc)(int),
>>> +                     int flags)
>>> +{
>>> +    unsigned long addr, end, next;
>>> +    pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
>>> +
>>> +    /*
>>> +     * If the virtual and physical address don't have the same offset
>>> +     * within a page, we cannot map the region as the caller expects.
>>> +     */
>>> +    if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
>>> +        return;
>>> +
>>> +    phys &= PAGE_MASK;
>>> +    addr = virt & PAGE_MASK;
>>> +    end = PAGE_ALIGN(virt + size);
>>> +
>>> +    do {
>>> +        next = pgd_addr_end(addr, end);
>>> +        alloc_crashkernel_pud(pgdp, addr, next, phys, prot,
>>> +                      pgtable_alloc, flags);
>>> +        phys += next - addr;
>>> +    } while (pgdp++, addr = next, addr != end);
>>> +}
>> __create_crashkernel_mapping() and the helpers it uses closely resemble
>> __create_pgd_mapping() and its helpers. I think a cleaner way would be to
>> clear the section mappings and then call __create_pgd_mapping() for the
>> crash kernel memory with block/section mappings disabled.
> Yes, there is too much duplicated code. If the handling of the crash
> reserve memory is acceptable, we could find a better way (as Mike said)
> to deal with the repetition.
> Actually, we tried something like this before, but I am not sure it is
> the right way, and we saw a TLB conflict; I will test this patch.
OK, I have tested this patch and it works well.
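
To make that direction concrete, here is a rough sketch (not the code I
posted; clear_linear_block_mappings() is a hypothetical helper that would
still need a careful break-before-make implementation, which is exactly
where the TLB conflict mentioned above can bite):

static void __init remap_crashkernel_pages(void)
{
	phys_addr_t start, end;

	if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
		return;
	if (!crashk_res.end)
		return;

	start = crashk_res.start & PAGE_MASK;
	end   = PAGE_ALIGN(crashk_res.end + 1);

	/*
	 * 1) Tear down the block/section entries covering [start, end)
	 *    (rounded out to block boundaries) and flush the TLB; any
	 *    margins cleared here would have to be remapped as well.
	 * 2) Rebuild the range at page granularity through the existing
	 *    helper instead of a duplicated set of alloc_crashkernel_*().
	 */
	clear_linear_block_mappings(start, end);	/* hypothetical */
	__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
			     end - start, PAGE_KERNEL, early_pgtable_alloc,
			     NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
}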

>>> +static void __init map_crashkernel(pgd_t *pgdp, phys_addr_t start,
>>> +                   phys_addr_t end, pgprot_t prot, int flags)
>>> +{
>>> +    __create_crashkernel_mapping(pgdp, start, __phys_to_virt(start),
>>> +                     end - start, prot,
>>> +                     early_crashkernel_pgtable_alloc, flags);
>>> +}
>> No need for this, you can call __create_crashkernel_mapping() directly 
>> from
>> mapping_crashkernel().
>>
>>> +
>>> +void __init mapping_crashkernel(void)
>>> +{
>>> +    if (can_set_direct_map() || IS_ENABLED(CONFIG_KFENCE))
>>> +        return;
>>> +
>>> +    if (!crash_mem_map || !crashk_res.end)
>>> +        return;
>>> +
>>> +    map_crashkernel(swapper_pg_dir, crashk_res.start,
>>> +            crashk_res.end + 1, PAGE_KERNEL,
>>> +            NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>>> +}
>>> +#else
>>> +void __init mapping_crashkernel(void) {}
>>> +#endif
>>> +
>>>   static void __init map_mem(pgd_t *pgdp)
>>>   {
>>>       static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
>>> @@ -527,17 +777,6 @@ static void __init map_mem(pgd_t *pgdp)
>>>        */
>>>       memblock_mark_nomap(kernel_start, kernel_end - kernel_start);
>>> -#ifdef CONFIG_KEXEC_CORE
>>> -    if (crash_mem_map) {
>>> -        if (IS_ENABLED(CONFIG_ZONE_DMA) ||
>>> -            IS_ENABLED(CONFIG_ZONE_DMA32))
>>> -            flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>> -        else if (crashk_res.end)
>>> -            memblock_mark_nomap(crashk_res.start,
>>> -                resource_size(&crashk_res));
>>> -    }
>>> -#endif
>>> -
>>>       /* map all the memory banks */
>>>       for_each_mem_range(i, &start, &end) {
>>>           if (start >= end)
>>> @@ -570,19 +809,6 @@ static void __init map_mem(pgd_t *pgdp)
>>>        * in page granularity and put back unused memory to buddy system
>>>        * through /sys/kernel/kexec_crash_size interface.
>>>        */
>>> -#ifdef CONFIG_KEXEC_CORE
>>> -    if (crash_mem_map &&
>>> -        !IS_ENABLED(CONFIG_ZONE_DMA) && 
>>> !IS_ENABLED(CONFIG_ZONE_DMA32)) {
>>> -        if (crashk_res.end) {
>>> -            __map_memblock(pgdp, crashk_res.start,
>>> -                       crashk_res.end + 1,
>>> -                       PAGE_KERNEL,
>>> -                       NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
>>> -            memblock_clear_nomap(crashk_res.start,
>>> -                         resource_size(&crashk_res));
>>> -        }
>>> -    }
>>> -#endif
>>>   }
>>>   void mark_rodata_ro(void)
>>> -- 
>>> 1.8.3.1
>>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-28  3:06               ` guanghui.fgh
@ 2022-06-28  6:21                 ` Leizhen (ThunderTown)
  2022-06-28  7:52                 ` guanghui.fgh
  1 sibling, 0 replies; 15+ messages in thread
From: Leizhen (ThunderTown) @ 2022-06-28  6:21 UTC (permalink / raw)
  To: guanghui.fgh, Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, anshuman.khandual, linux-arm-kernel, linux-kernel,
	geert+renesas, ardb, linux-mm, bhe, Yao Hongbo



On 2022/6/28 11:06, guanghui.fgh wrote:
> Thanks.
> 
> 在 2022/6/28 9:34, Leizhen (ThunderTown) 写道:
>>
>>
>> On 2022/6/27 20:25, guanghui.fgh wrote:
>>> Thanks.
>>>
>>> 在 2022/6/27 20:06, Leizhen (ThunderTown) 写道:
>>>>
>>>>
>>>> On 2022/6/27 18:46, guanghui.fgh wrote:
>>>>>
>>>>>
>>>>> 在 2022/6/27 17:49, Mike Rapoport 写道:
>>>>>> Please don't post HTML.
>>>>>>
>>>>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>>>>> Thanks.
>>>>>>>
>>>>>>> 在 2022/6/27 14:34, Mike Rapoport 写道:
>>>>>>>
>>>>>>>        On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng wrote:
>>>>>>>
>>>>>>>            The arm64 can build 2M/1G block/sectiion mapping. When using DMA/DMA32 zone
>>>>>>>            (enable crashkernel, disable rodata full, disable kfence), the mem_map will
>>>>>>>            use non block/section mapping(for crashkernel requires to shrink the region
>>>>>>>            in page granularity). But it will degrade performance when doing larging
>>>>>>>            continuous mem access in kernel(memcpy/memmove, etc).
>>>>>>>
>>>>>>>            There are many changes and discussions:
>>>>>>>            commit 031495635b46
>>>>>>>            commit 1a8e1cef7603
>>>>>>>            commit 8424ecdde7df
>>>>>>>            commit 0a30c53573b0
>>>>>>>            commit 2687275a5843
>>>>>>>
>>>>>>>        Please include oneline summary of the commit. (See section "Describe your
>>>>>>>        changes" in Documentation/process/submitting-patches.rst)
>>>>>>>
>>>>>>> OK, I will add oneline summary in the git commit messages.
>>>>>>>
>>>>>>>            This patch changes mem_map to use block/section mapping with crashkernel.
>>>>>>>            Firstly, do block/section mapping(normally 2M or 1G) for all avail mem at
>>>>>>>            mem_map, reserve crashkernel memory. And then walking pagetable to split
>>>>>>>            block/section mapping to non block/section mapping(normally 4K) [[[only]]]
>>>>>>>            for crashkernel mem.
>>>>>>>
>>>>>>>        This already happens when ZONE_DMA/ZONE_DMA32 are disabled. Please explain
>>>>>>>        why is it Ok to change the way the memory is mapped with
>>>>>>>        ZONE_DMA/ZONE_DMA32 enabled.
>>>>>>>
>>>>>>> In short:
>>>>>>>
>>>>>>> 1.building all avail mem with block/section mapping(normally 1G/2M) without
>>>>>>> inspecting crashkernel
>>>>>>> 2. Reserve crashkernel mem as same as previous doing
>>>>>>> 3. only change the crashkernle mem mapping to normal mapping(normally 4k).
>>>>>>> With this method, there are block/section mapping as more as possible.
>>>>>>
>>>>>> This does not answer the question why changing the way the memory is mapped
>>>>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a regression.
>>>>>>
>>>>> 1.Quoted messages from arch/arm64/mm/init.c
>>>>>
>>>>> "Memory reservation for crash kernel either done early or deferred
>>>>> depending on DMA memory zones configs (ZONE_DMA) --
>>>>>
>>>>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>>>>> here instead of max_zone_phys().  This lets early reservation of
>>>>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>>>>> Reserving memory early for crash kernel allows linear creation of block
>>>>> mappings (greater than page-granularity) for all the memory bank rangs.
>>>>> In this scheme a comparatively quicker boot is observed.
>>>>>
>>>>> If ZONE_DMA configs are defined, crash kernel memory reservation
>>>>> is delayed until DMA zone memory range size initialization performed in
>>>>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>>>>> memory range to avoid overlap allocation.  So crash kernel memory boundaries are not known when mapping all bank memory ranges, which otherwise means not possible to exclude crash kernel range from creating block mappings so page-granularity mappings are created for the entire memory range."
>>>>>
>>>>> Namely, the init order: memblock init--->linear mem mapping(4k mapping for crashkernel, requirinig page-granularity changing))--->zone dma limit--->reserve crashkernel.
>>>>> So when enable ZONE DMA and using crashkernel, the mem mapping using 4k mapping.
>>>>>
>>>>> 2.As mentioned above, when linear mem use 4k mapping simply, there is high dtlb miss(degrade performance).
>>>>> This patch use block/section mapping as far as possible with performance improvement.
>>>>>
>>>>> 3.This patch reserve crashkernel as same as the history(ZONE DMA & crashkernel reserving order), and only change the linear mem mapping to block/section mapping.
>>>>> .
>>>>>
>>>>
>>>> I think Mike Rapoport is probably asking you to answer whether you've
>>>> taken things such as BBM (break-before-make) into account. For example,
>>>> in the following code we should prepare the next-level pgtable first,
>>>> then change the 2M block mapping to 4K page mappings, and flush the TLB
>>>> at the end.
>>>>> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>>>> +                 unsigned long end, phys_addr_t phys,
>>>> +                 pgprot_t prot,
>>>> +                 phys_addr_t (*pgtable_alloc)(int), int flags)
>>>> +{
>>>> +    phys_addr_t map_offset;
>>>> +    unsigned long next;
>>>> +    pmd_t *pmdp;
>>>> +    pmdval_t pmdval;
>>>> +
>>>> +    pmdp = pmd_offset(pudp, addr);
>>>> +    do {
>>>> +        next = pmd_addr_end(addr, end);
>>>> +        if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>>>> +            phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>>>> +            pmd_clear(pmdp);
>>>> +            pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>>>> +            if (flags & NO_EXEC_MAPPINGS)
>>>> +                pmdval |= PMD_TABLE_PXN;
>>>> +            __pmd_populate(pmdp, pte_phys, pmdval);
>>>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>>
>>>> The pgtable is empty now. However, memory other than crashkernel may be being accessed.
>>> 1.When reserving crashkernel and remapping linear mem mapping, there is only one boot cpu running. There is no other cpu/thread running at the same time.
>>
>> So, put this in the code comment?
> OK.
>>
>> If scalability matters and unpredictable changes occur in the future (for example,
>> other modules also needing this mapping function), it would be better to handle
>> BBM now and make this public.
> OK, could you give me some advice?

I said that yesterday:
we should prepare the next-level pgtable first, then change the 2M block
mapping to 4K page mappings, and flush the TLB at the end.

But I'm just suggesting; it's up to the maintainers to decide.
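
As a concrete, purely illustrative sketch of that ordering for a single PMD
entry (the helper names are borrowed from the existing arm64 mmu.c code,
the function itself is hypothetical, and it ignores the margin handling
discussed elsewhere in the thread):

static void __init split_pmd_block(pmd_t *pmdp, unsigned long addr,
				   pgprot_t prot,
				   phys_addr_t (*pgtable_alloc)(int))
{
	unsigned long start = addr & PMD_MASK;
	phys_addr_t block_phys = __pfn_to_phys(pmd_pfn(READ_ONCE(*pmdp)));
	phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
	pte_t *ptep = (pte_t *)__phys_to_virt(pte_phys);
	int i;

	/* 1. Fully populate the next-level table for the whole 2M range
	 *    before the live block entry is touched. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		set_pte(ptep + i,
			pfn_pte(__phys_to_pfn(block_phys) + i, prot));

	/* 2. Switch the block entry over to the new table.  Strict
	 *    break-before-make would also invalidate the TLB between the
	 *    clear and the populate. */
	pmd_clear(pmdp);
	__pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE);

	/* 3. Flush the stale 2M TLB entry once, at the end. */
	flush_tlb_kernel_range(start, start + PMD_SIZE);
}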

>>
>>
>>>
>>> 2. When clearing a block/section mapping, I flush the TLB with flush_tlb_kernel_range. Afterwards the 4k mapping is rebuilt (I think no TLB flush is needed for that).
>>
>>
>>>
>>>>
>>>> +
>>>> +            map_offset = addr - (addr & PMD_MASK);
>>>> +            if (map_offset)
>>>> +                alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>>>> +                        phys - map_offset, prot,
>>>> +                        pgtable_alloc, flags);
>>>> +
>>>> +            if (next < (addr & PMD_MASK) + PMD_SIZE)
>>>> +                alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>>>> +                        PUD_SIZE, next - addr + phys,
>>>> +                        prot, pgtable_alloc, flags);
>>
>> Here and in alloc_crashkernel_pud(), the raw flags should be used; they may not contain (NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS).
> Yes. The memory outside the crashkernel should use block/section mappings as far as possible, including the left and right margins.
> But when I tested that on HiSilicon Kunpeng 920-6426 (mapping the left/right margins without the NO_BLOCK_MAPPINGS/NO_CONT_MAPPINGS flags), I got a performance degradation.
> It's strange; could you give some advice? Maybe it works well on other arm platforms, just not on HiSilicon Kunpeng 920-6426.

I think you can first use show_pte() to check whether the mapping results are as expected.
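
For example, something like the following in a temporary debug hook
(assuming the show_pte() helper in arch/arm64/mm/fault.c is reachable from
there; it may need to be made non-static just for such an experiment):

	/* Dump the table walk just below and just above the reserved range
	 * to see whether the margins ended up as block or page mappings. */
	show_pte((unsigned long)__phys_to_virt(crashk_res.start) - PAGE_SIZE);
	show_pte((unsigned long)__phys_to_virt(crashk_res.end) + PAGE_SIZE);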

>>
>>>> +        }
>>>> +        alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>>>> +                       pgtable_alloc, flags);
>>>> +        phys += next - addr;
>>>> +    } while (pmdp++, addr = next, addr != end);
>>>> +}
>>>>
>>> .
>>>
>>
> .
> 

-- 
Regards,
  Zhen Lei

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] arm64: mm: fix linear mapping mem access performace degradation
  2022-06-28  3:06               ` guanghui.fgh
  2022-06-28  6:21                 ` Leizhen (ThunderTown)
@ 2022-06-28  7:52                 ` guanghui.fgh
  1 sibling, 0 replies; 15+ messages in thread
From: guanghui.fgh @ 2022-06-28  7:52 UTC (permalink / raw)
  To: Leizhen (ThunderTown), Mike Rapoport
  Cc: baolin.wang, catalin.marinas, will, akpm, david, jianyong.wu,
	james.morse, quic_qiancai, christophe.leroy, jonathan,
	mark.rutland, anshuman.khandual, linux-arm-kernel, linux-kernel,
	geert+renesas, ardb, linux-mm, bhe, Yao Hongbo



在 2022/6/28 11:06, guanghui.fgh 写道:
> Thanks.
> 
> 在 2022/6/28 9:34, Leizhen (ThunderTown) 写道:
>>
>>
>> On 2022/6/27 20:25, guanghui.fgh wrote:
>>> Thanks.
>>>
>>> 在 2022/6/27 20:06, Leizhen (ThunderTown) 写道:
>>>>
>>>>
>>>> On 2022/6/27 18:46, guanghui.fgh wrote:
>>>>>
>>>>>
>>>>> 在 2022/6/27 17:49, Mike Rapoport 写道:
>>>>>> Please don't post HTML.
>>>>>>
>>>>>> On Mon, Jun 27, 2022 at 05:24:10PM +0800, guanghui.fgh wrote:
>>>>>>> Thanks.
>>>>>>>
>>>>>>> 在 2022/6/27 14:34, Mike Rapoport 写道:
>>>>>>>
>>>>>>>        On Sun, Jun 26, 2022 at 07:10:15PM +0800, Guanghui Feng 
>>>>>>> wrote:
>>>>>>>
>>>>>>>            The arm64 can build 2M/1G block/sectiion mapping. When 
>>>>>>> using DMA/DMA32 zone
>>>>>>>            (enable crashkernel, disable rodata full, disable 
>>>>>>> kfence), the mem_map will
>>>>>>>            use non block/section mapping(for crashkernel requires 
>>>>>>> to shrink the region
>>>>>>>            in page granularity). But it will degrade performance 
>>>>>>> when doing larging
>>>>>>>            continuous mem access in kernel(memcpy/memmove, etc).
>>>>>>>
>>>>>>>            There are many changes and discussions:
>>>>>>>            commit 031495635b46
>>>>>>>            commit 1a8e1cef7603
>>>>>>>            commit 8424ecdde7df
>>>>>>>            commit 0a30c53573b0
>>>>>>>            commit 2687275a5843
>>>>>>>
>>>>>>>        Please include oneline summary of the commit. (See section 
>>>>>>> "Describe your
>>>>>>>        changes" in Documentation/process/submitting-patches.rst)
>>>>>>>
>>>>>>> OK, I will add oneline summary in the git commit messages.
>>>>>>>
>>>>>>>            This patch changes mem_map to use block/section 
>>>>>>> mapping with crashkernel.
>>>>>>>            Firstly, do block/section mapping(normally 2M or 1G) 
>>>>>>> for all avail mem at
>>>>>>>            mem_map, reserve crashkernel memory. And then walking 
>>>>>>> pagetable to split
>>>>>>>            block/section mapping to non block/section 
>>>>>>> mapping(normally 4K) [[[only]]]
>>>>>>>            for crashkernel mem.
>>>>>>>
>>>>>>>        This already happens when ZONE_DMA/ZONE_DMA32 are 
>>>>>>> disabled. Please explain
>>>>>>>        why is it Ok to change the way the memory is mapped with
>>>>>>>        ZONE_DMA/ZONE_DMA32 enabled.
>>>>>>>
>>>>>>> In short:
>>>>>>>
>>>>>>> 1.building all avail mem with block/section mapping(normally 
>>>>>>> 1G/2M) without
>>>>>>> inspecting crashkernel
>>>>>>> 2. Reserve crashkernel mem as same as previous doing
>>>>>>> 3. only change the crashkernle mem mapping to normal 
>>>>>>> mapping(normally 4k).
>>>>>>> With this method, there are block/section mapping as more as 
>>>>>>> possible.
>>>>>>
>>>>>> This does not answer the question why changing the way the memory 
>>>>>> is mapped
>>>>>> when there is ZONE_DMA/DMA32 and crashkernel won't cause a 
>>>>>> regression.
>>>>>>
>>>>> 1.Quoted messages from arch/arm64/mm/init.c
>>>>>
>>>>> "Memory reservation for crash kernel either done early or deferred
>>>>> depending on DMA memory zones configs (ZONE_DMA) --
>>>>>
>>>>> In absence of ZONE_DMA configs arm64_dma_phys_limit initialized
>>>>> here instead of max_zone_phys().  This lets early reservation of
>>>>> crash kernel memory which has a dependency on arm64_dma_phys_limit.
>>>>> Reserving memory early for crash kernel allows linear creation of 
>>>>> block
>>>>> mappings (greater than page-granularity) for all the memory bank 
>>>>> rangs.
>>>>> In this scheme a comparatively quicker boot is observed.
>>>>>
>>>>> If ZONE_DMA configs are defined, crash kernel memory reservation
>>>>> is delayed until DMA zone memory range size initialization 
>>>>> performed in
>>>>> zone_sizes_init().  The defer is necessary to steer clear of DMA zone
>>>>> memory range to avoid overlap allocation.  So crash kernel memory 
>>>>> boundaries are not known when mapping all bank memory ranges, which 
>>>>> otherwise means not possible to exclude crash kernel range from 
>>>>> creating block mappings so page-granularity mappings are created 
>>>>> for the entire memory range."
>>>>>
>>>>> Namely, the init order: memblock init--->linear mem mapping(4k 
>>>>> mapping for crashkernel, requirinig page-granularity 
>>>>> changing))--->zone dma limit--->reserve crashkernel.
>>>>> So when enable ZONE DMA and using crashkernel, the mem mapping 
>>>>> using 4k mapping.
>>>>>
>>>>> 2.As mentioned above, when linear mem use 4k mapping simply, there 
>>>>> is high dtlb miss(degrade performance).
>>>>> This patch use block/section mapping as far as possible with 
>>>>> performance improvement.
>>>>>
>>>>> 3.This patch reserve crashkernel as same as the history(ZONE DMA & 
>>>>> crashkernel reserving order), and only change the linear mem 
>>>>> mapping to block/section mapping.
>>>>> .
>>>>>
>>>>
>>>> I think Mike Rapoport is probably asking you to answer whether you've
>>>> taken things such as BBM (break-before-make) into account. For example,
>>>> in the following code we should prepare the next-level pgtable first,
>>>> then change the 2M block mapping to 4K page mappings, and flush the TLB
>>>> at the end.
>>>>> +static void init_crashkernel_pmd(pud_t *pudp, unsigned long addr,
>>>> +                 unsigned long end, phys_addr_t phys,
>>>> +                 pgprot_t prot,
>>>> +                 phys_addr_t (*pgtable_alloc)(int), int flags)
>>>> +{
>>>> +    phys_addr_t map_offset;
>>>> +    unsigned long next;
>>>> +    pmd_t *pmdp;
>>>> +    pmdval_t pmdval;
>>>> +
>>>> +    pmdp = pmd_offset(pudp, addr);
>>>> +    do {
>>>> +        next = pmd_addr_end(addr, end);
>>>> +        if (!pmd_none(*pmdp) && pmd_sect(*pmdp)) {
>>>> +            phys_addr_t pte_phys = pgtable_alloc(PAGE_SHIFT);
>>>> +            pmd_clear(pmdp);
>>>> +            pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN;
>>>> +            if (flags & NO_EXEC_MAPPINGS)
>>>> +                pmdval |= PMD_TABLE_PXN;
>>>> +            __pmd_populate(pmdp, pte_phys, pmdval);
>>>> +            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>>
>>>> The pgtable is empty now. However, memory other than crashkernel may 
>>>> be being accessed.
>>> 1.When reserving crashkernel and remapping linear mem mapping, there 
>>> is only one boot cpu running. There is no other cpu/thread running at 
>>> the same time.
>>
>> So, put this in the code comment?
> OK.
>>
>> If scalability matters and unpredictable changes occur in the future
>> (for example, other modules also needing this mapping function), it
>> would be better to handle BBM now and make this public.
> OK, could you give me some advice?
>>
>>
>>>
>>> 2. When clearing a block/section mapping, I flush the TLB with
>>> flush_tlb_kernel_range. Afterwards the 4k mapping is rebuilt (I think
>>> no TLB flush is needed for that).
>>
>>
>>>
>>>>
>>>> +
>>>> +            map_offset = addr - (addr & PMD_MASK);
>>>> +            if (map_offset)
>>>> +                alloc_init_cont_pte(pmdp, addr & PMD_MASK, addr,
>>>> +                        phys - map_offset, prot,
>>>> +                        pgtable_alloc, flags);
>>>> +
>>>> +            if (next < (addr & PMD_MASK) + PMD_SIZE)
>>>> +                alloc_init_cont_pte(pmdp, next, (addr & PUD_MASK) +
>>>> +                        PUD_SIZE, next - addr + phys,
>>>> +                        prot, pgtable_alloc, flags);
>>
>> Here and in alloc_crashkernel_pud(), the raw flags should be used; they
>> may not contain (NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS).
> Yes. The memory outside the crashkernel should use block/section mappings
> as far as possible, including the left and right margins.
> But when I tested that on HiSilicon Kunpeng 920-6426 (mapping the
> left/right margins without the NO_BLOCK_MAPPINGS/NO_CONT_MAPPINGS flags),
> I got a performance degradation.
> It's strange; could you give some advice? Maybe it works well on other arm
> platforms, just not on HiSilicon Kunpeng 920-6426.
The non-crashkernel memory should be split [[[ without ]]] the
NO_BLOCK_MAPPINGS/NO_CONT_MAPPINGS flags.

I have tested this on other arm platforms [[[ non HiSilicon arm platforms ]]]
and also got a great performance improvement there.

Could you help me check the difference between HiSilicon Kunpeng 920-6426
and other arm platforms regarding block/section mapping TLB support?

>>
>>>> +        }
>>>> +        alloc_crashkernel_cont_pte(pmdp, addr, next, phys, prot,
>>>> +                       pgtable_alloc, flags);
>>>> +        phys += next - addr;
>>>> +    } while (pmdp++, addr = next, addr != end);
>>>> +}
>>>>
>>> .
>>>
>>

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-06-28  7:54 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-26 11:10 [PATCH] arm64: mm: fix linear mapping mem access performace degradation Guanghui Feng
2022-06-27  6:34 ` Mike Rapoport
     [not found]   ` <4d18d303-aeed-0beb-a8a4-32893f2d438d@linux.alibaba.com>
2022-06-27  9:49     ` Mike Rapoport
2022-06-27 10:46       ` guanghui.fgh
2022-06-27 12:06         ` Leizhen (ThunderTown)
2022-06-27 12:25           ` guanghui.fgh
2022-06-28  1:34             ` Leizhen (ThunderTown)
2022-06-28  3:06               ` guanghui.fgh
2022-06-28  6:21                 ` Leizhen (ThunderTown)
2022-06-28  7:52                 ` guanghui.fgh
2022-06-27 16:49         ` Mike Rapoport
2022-06-28  1:50           ` Kefeng Wang
2022-06-28  3:22             ` guanghui.fgh
2022-06-28  2:55           ` guanghui.fgh
2022-06-28  1:40 ` Leizhen (ThunderTown)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).