linux-riscv.lists.infradead.org archive mirror
* [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime
@ 2022-10-03 13:47 panqinglin2020
  2022-10-03 13:47 ` [PATCH v5 1/4] mm: modify pte format for Svnapot panqinglin2020
                   ` (4 more replies)
  0 siblings, 5 replies; 26+ messages in thread
From: panqinglin2020 @ 2022-10-03 13:47 UTC (permalink / raw)
  To: palmer, linux-riscv; +Cc: jeff, xuyinan, Qinglin Pan

From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Svnapot is a RISC-V extension for marking a contiguous range of 4K pages
as a single non-4K page. This patch set uses Svnapot in the Linux
kernel's boot process and in hugetlb fs.

This patchset adds a Kconfig item for using Svnapot in
"Platform type"->"SVNAPOT extension support". Its default value is off,
and people can turn it on if they want to allow the kernel to detect
Svnapot hardware support and leverage it.

Tested on:
  - qemu rv64 with "Svnapot support" off and svnapot=true.
  - qemu rv64 with "Svnapot support" on and svnapot=true.
  - qemu rv64 with "Svnapot support" off and svnapot=false.
  - qemu rv64 with "Svnapot support" on and svnapot=false.


Qinglin Pan (4):
  mm: modify pte format for Svnapot
  mm: support Svnapot in physical page linear-mapping
  mm: support Svnapot in hugetlb page
  mm: support Svnapot in huge vmap

 arch/riscv/Kconfig                   |  17 +-
 arch/riscv/include/asm/errata_list.h |  23 ++-
 arch/riscv/include/asm/hugetlb.h     |  30 +++-
 arch/riscv/include/asm/hwcap.h       |   3 +-
 arch/riscv/include/asm/page.h        |   2 +-
 arch/riscv/include/asm/pgtable-64.h  |  13 ++
 arch/riscv/include/asm/pgtable.h     |  68 +++++++-
 arch/riscv/include/asm/vmalloc.h     |  28 +++
 arch/riscv/kernel/cpu.c              |   1 +
 arch/riscv/kernel/cpufeature.c       |  18 ++
 arch/riscv/mm/hugetlbpage.c          | 250 ++++++++++++++++++++++++++-
 arch/riscv/mm/init.c                 |  28 ++-
 12 files changed, 467 insertions(+), 14 deletions(-)

-- 
2.35.1



* [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-03 13:47 [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime panqinglin2020
@ 2022-10-03 13:47 ` panqinglin2020
  2022-10-04 17:00   ` Andrew Jones
  2022-10-03 13:47 ` [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping panqinglin2020
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 26+ messages in thread
From: panqinglin2020 @ 2022-10-03 13:47 UTC (permalink / raw)
  To: palmer, linux-riscv; +Cc: jeff, xuyinan, Qinglin Pan

From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

This commit adds two errata entries to enable/disable svnapot support,
patching code dynamically when "svnapot" is in the "riscv,isa" field of
the fdt and the SVNAPOT compile option is set. It influences the
behavior of the has_svnapot and pte_pfn functions. All code that depends
on svnapot should first make sure that has_svnapot returns true.

Also, this commit modifies the PTE definition for Svnapot, and creates
some functions in pgtable.h to mark a PTE as napot and check whether it
is a Svnapot PTE. Until now, only the 64KB napot size is supported in
the draft spec, so some macros have only a 64KB version.
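
As a worked illustration of the encoding (a sketch only; the pfn value
below is made up, and this is not part of the diff): with the 64KB napot
size, pte_mknapot() clears the low four bits of the PFN field, sets bit
3 of it to form the NAPOT pattern 0b1000, and sets bit 63 (_PAGE_NAPOT);
the same value is then written into all 16 PTEs covering the range:

	/*
	 * pte = pfn_pte(0x12340, prot);            pfn[3:0] = 0b0000
	 * pte = pte_mknapot(pte, NAPOT_CONT64KB_ORDER);
	 * now _PAGE_NAPOT (bit 63) is set and the pfn field reads
	 * 0x12348, where pfn[3:0] = 0b1000 encodes the 64KB size.
	 */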

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index d557cc50295d..4354024aae21 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -383,6 +383,20 @@ config RISCV_ISA_C
 
 	  If you don't know what to do here, say Y.
 
+config RISCV_ISA_SVNAPOT
+	bool "SVNAPOT extension support"
+	depends on 64BIT && MMU
+	select RISCV_ALTERNATIVE
+	default n
+	help
+	  Say Y if you want to allow the kernel to detect the SVNAPOT
+	  ISA extension dynamically at boot time and enable its usage.
+
+	  SVNAPOT extension helps to mark contiguous PTEs as a range
+	  of contiguous virtual-to-physical translations, with a naturally
+	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
+	  size.
+
 config RISCV_ISA_SVPBMT
 	bool "SVPBMT extension support"
 	depends on 64BIT && MMU
diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
index 19a771085781..f3aff5ef52e4 100644
--- a/arch/riscv/include/asm/errata_list.h
+++ b/arch/riscv/include/asm/errata_list.h
@@ -22,7 +22,8 @@
 
 #define	CPUFEATURE_SVPBMT 0
 #define	CPUFEATURE_ZICBOM 1
-#define	CPUFEATURE_NUMBER 2
+#define	CPUFEATURE_SVNAPOT 2
+#define	CPUFEATURE_NUMBER 3
 
 #ifdef __ASSEMBLY__
 
@@ -142,6 +143,26 @@ asm volatile(ALTERNATIVE_2(						\
 	    "r"((unsigned long)(_start) + (_size))			\
 	: "a0")
 
+#define ALT_SVNAPOT(_val)						\
+asm(ALTERNATIVE("li %0, 0", "li %0, 1", 0,				\
+		CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)		\
+		: "=r"(_val) :)
+
+#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
+asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
+		"srli %0, %0, %2\n\t"					\
+		__nops(3),						\
+		"srli t3, %0, %3\n\t"					\
+		"and %0, %0, %1\n\t"					\
+		"srli %0, %0, %2\n\t"					\
+		"sub  t4, %0, t3\n\t"					\
+		"and  %0, %0, t4",					\
+		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
+		: "+r"(_val)						\
+		: "r"(_pfn_mask),					\
+		  "i"(_pfn_shift),					\
+		  "i"(_napot_shift))
+
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index 6f59ec64175e..83b8e21a3573 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -54,10 +54,11 @@ extern unsigned long elf_hwcap;
  */
 enum riscv_isa_ext_id {
 	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
+	RISCV_ISA_EXT_SSTC,
+	RISCV_ISA_EXT_SVNAPOT,
 	RISCV_ISA_EXT_SVPBMT,
 	RISCV_ISA_EXT_ZICBOM,
 	RISCV_ISA_EXT_ZIHINTPAUSE,
-	RISCV_ISA_EXT_SSTC,
 	RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX,
 };
 
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index dc42375c2357..25ec541192e5 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -74,6 +74,19 @@ typedef struct {
  */
 #define _PAGE_PFN_MASK  GENMASK(53, 10)
 
+/*
+ * [63] Svnapot definitions:
+ * 0 Svnapot disabled
+ * 1 Svnapot enabled
+ */
+#define _PAGE_NAPOT_SHIFT 63
+#define _PAGE_NAPOT      BIT(_PAGE_NAPOT_SHIFT)
+#define NAPOT_CONT64KB_ORDER 4UL
+#define NAPOT_CONT64KB_SHIFT (NAPOT_CONT64KB_ORDER + PAGE_SHIFT)
+#define NAPOT_CONT64KB_SIZE BIT(NAPOT_CONT64KB_SHIFT)
+#define NAPOT_CONT64KB_MASK (NAPOT_CONT64KB_SIZE - 1UL)
+#define NAPOT_64KB_PTE_NUM BIT(NAPOT_CONT64KB_ORDER)
+
 /*
  * [62:61] Svpbmt Memory Type definitions:
  *
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 7ec936910a96..c3fc3c661699 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -264,10 +264,39 @@ static inline pte_t pud_pte(pud_t pud)
 	return __pte(pud_val(pud));
 }
 
+static inline bool has_svnapot(void)
+{
+	u64 _val;
+
+	ALT_SVNAPOT(_val);
+	return _val;
+}
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+
+static inline unsigned long pte_napot(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_NAPOT;
+}
+
+static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
+{
+	int pos = order - 1 + _PAGE_PFN_SHIFT;
+	unsigned long napot_bit = BIT(pos);
+	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
+
+	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
+}
+#endif /* CONFIG_RISCV_ISA_SVNAPOT */
+
 /* Yields the page frame number (PFN) of a page table entry */
 static inline unsigned long pte_pfn(pte_t pte)
 {
-	return __page_val_to_pfn(pte_val(pte));
+	unsigned long _val  = pte_val(pte);
+
+	ALT_SVNAPOT_PTE_PFN(_val, _PAGE_NAPOT_SHIFT,
+			    _PAGE_PFN_MASK, _PAGE_PFN_SHIFT);
+	return _val;
 }
 
 #define pte_page(x)     pfn_to_page(pte_pfn(x))
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index 0be8a2403212..d2a61122c595 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -96,6 +96,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
 	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
 	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
 	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
+	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
 	__RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX),
 };
 
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 553d755483ed..3557c5cc6f04 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -204,6 +204,7 @@ void __init riscv_fill_hwcap(void)
 				SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
 				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
 				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
+				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
 			}
 #undef SET_ISA_EXT_MAP
 		}
@@ -284,6 +285,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
 	return false;
 }
 
+static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
+{
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+	switch (stage) {
+	case RISCV_ALTERNATIVES_EARLY_BOOT:
+		return false;
+	default:
+		return riscv_isa_extension_available(NULL, SVNAPOT);
+	}
+#endif
+
+	return false;
+}
+
 /*
  * Probe presence of individual extensions.
  *
@@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
 	if (cpufeature_probe_zicbom(stage))
 		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);
 
+	if (cpufeature_probe_svnapot(stage))
+		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
+
 	return cpu_req_feature;
 }
 
-- 
2.35.1



* [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping
  2022-10-03 13:47 [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime panqinglin2020
  2022-10-03 13:47 ` [PATCH v5 1/4] mm: modify pte format for Svnapot panqinglin2020
@ 2022-10-03 13:47 ` panqinglin2020
  2022-10-04 18:40   ` Conor Dooley
  2022-10-05 11:19   ` Andrew Jones
  2022-10-03 13:47 ` [PATCH v5 3/4] mm: support Svnapot in hugetlb page panqinglin2020
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 26+ messages in thread
From: panqinglin2020 @ 2022-10-03 13:47 UTC (permalink / raw)
  To: palmer, linux-riscv; +Cc: jeff, xuyinan, Qinglin Pan

From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Svnapot is powerful when a physical region is going to be mapped to a
virtual region. The kernel does exactly this when mapping all allocable
physical pages to the kernel vm space. This commit modifies the
create_pte_mapping function used in the linear-mapping procedure, so
the kernel can use Svnapot when both the address and the length of the
physical region are 64KB-aligned. This code is executed only when no
other huge page size is suitable, so it serves as an addition to the
PMD_SIZE and PUD_SIZE mappings.

This commit also modifies the best_map_size function to return a
map_size for each chunk of a region instead of only once, so a memory
region can be mapped using both PMD_SIZE and the 64KB napot size.

It is tested by setting qemu's memory to a 262272k region, and the
kernel can boot successfully.
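
(For reference, 262272k = 256M + 128K. Assuming the region starts at
qemu's usual 2M-aligned RAM base of 0x80000000, the first 256M is then
covered by PMD_SIZE mappings and the trailing 128K by two 64KB napot
mappings, so this size exercises both paths.)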

Currently, the modified create_pte_mapping will never make use of SVNAPOT,
because this extension is detected in riscv_fill_hwcap and enabled in
apply_boot_alternatives (called from setup_arch), which is called
after setup_vm_final. We will need to support a function like
riscv_fill_hwcap_early to fill hardware capabilities earlier, and
to try to enable SVNAPOT earlier in apply_early_boot_alternatives,
so that we can determine SVNAPOT's presence during setup_vm_final.

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index b56a0a75533f..76317bb28f29 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -373,9 +373,21 @@ static void __init create_pte_mapping(pte_t *ptep,
 				      phys_addr_t sz, pgprot_t prot)
 {
 	uintptr_t pte_idx = pte_index(va);
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+	pte_t pte;
+
+	if (has_svnapot() && sz == NAPOT_CONT64KB_SIZE) {
+		do {
+			pte = pfn_pte(PFN_DOWN(pa), prot);
+			ptep[pte_idx] = pte_mknapot(pte, NAPOT_CONT64KB_ORDER);
+			pte_idx++;
+			sz -= PAGE_SIZE;
+		} while (sz > 0);
+		return;
+	}
+#endif
 
 	BUG_ON(sz != PAGE_SIZE);
-
 	if (pte_none(ptep[pte_idx]))
 		ptep[pte_idx] = pfn_pte(PFN_DOWN(pa), prot);
 }
@@ -673,10 +685,18 @@ void __init create_pgd_mapping(pgd_t *pgdp,
 static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
 {
 	/* Upgrade to PMD_SIZE mappings whenever possible */
-	if ((base & (PMD_SIZE - 1)) || (size & (PMD_SIZE - 1)))
+	base &= PMD_SIZE - 1;
+	if (!base && size >= PMD_SIZE)
+		return PMD_SIZE;
+
+	if (!has_svnapot())
 		return PAGE_SIZE;
 
-	return PMD_SIZE;
+	base &= NAPOT_CONT64KB_SIZE - 1;
+	if (!base && size >= NAPOT_CONT64KB_SIZE)
+		return NAPOT_CONT64KB_SIZE;
+
+	return PAGE_SIZE;
 }
 
 #ifdef CONFIG_XIP_KERNEL
@@ -1111,9 +1131,9 @@ static void __init setup_vm_final(void)
 		if (end >= __pa(PAGE_OFFSET) + memory_limit)
 			end = __pa(PAGE_OFFSET) + memory_limit;
 
-		map_size = best_map_size(start, end - start);
 		for (pa = start; pa < end; pa += map_size) {
 			va = (uintptr_t)__va(pa);
+			map_size = best_map_size(pa, end - pa);
 
 			create_pgd_mapping(swapper_pg_dir, va, pa, map_size,
 					   pgprot_from_va(va));
-- 
2.35.1



* [PATCH v5 3/4] mm: support Svnapot in hugetlb page
  2022-10-03 13:47 [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime panqinglin2020
  2022-10-03 13:47 ` [PATCH v5 1/4] mm: modify pte format for Svnapot panqinglin2020
  2022-10-03 13:47 ` [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping panqinglin2020
@ 2022-10-03 13:47 ` panqinglin2020
  2022-10-04 18:43   ` Conor Dooley
  2022-10-05 13:11   ` Andrew Jones
  2022-10-03 13:47 ` [PATCH v5 4/4] mm: support Svnapot in huge vmap panqinglin2020
  2022-10-03 13:53 ` [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime Qinglin Pan
  4 siblings, 2 replies; 26+ messages in thread
From: panqinglin2020 @ 2022-10-03 13:47 UTC (permalink / raw)
  To: palmer, linux-riscv; +Cc: jeff, xuyinan, Qinglin Pan

From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

Svnapot can be used to support 64KB hugetlb pages, so it can become a new
option when using hugetlbfs. This commit adds a basic implementation of
hugetlb pages, and supports 64KB as a size in it by using Svnapot.

For testing, boot the kernel with a command line that contains
"default_hugepagesz=64K hugepagesz=64K hugepages=20" and run a simple
test like this:

#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_64KB
/* 64KB huge page: log2(64 * 1024) = 16, MAP_HUGE_SHIFT = 26 */
#define MAP_HUGE_64KB (16 << 26)
#endif

int main(void) {
	void *addr;
	long *ptr;
	unsigned int i = 0;

	addr = mmap(NULL, 64 * 1024, PROT_WRITE | PROT_READ,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_64KB, -1, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("back from mmap\n");
	/* write a pattern to the first long of each 4KB page */
	ptr = (long *)addr;
	for (; i < 8 * 1024; i += 512) {
		printf("%p\n", (void *)ptr);
		*ptr = 0xdeafabcd12345678;
		ptr += 512;
	}
	/* read the pattern back from each page */
	ptr = (long *)addr;
	i = 0;
	for (; i < 8 * 1024; i += 512) {
		if (*ptr != 0xdeafabcd12345678) {
			printf("failed! 0x%lx\n", *ptr);
			break;
		}
		ptr += 512;
	}
	if (i == 8 * 1024)
		printf("simple test passed!\n");
	return 0;
}

It should pass.
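
One possible way to build and run it (assuming a riscv64 cross
toolchain; the toolchain name and guest setup may differ):

	riscv64-linux-gnu-gcc -static -o napot_test test.c
	# copy napot_test into the qemu guest, then run it there:
	./napot_test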

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4354024aae21..3d5ec1391046 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -43,7 +43,7 @@ config RISCV
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_FRAME_POINTERS
-	select ARCH_WANT_GENERAL_HUGETLB
+	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
 	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
 	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
 	select BUILDTIME_TABLE_SORT if MMU
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index a5c2ca1d1cd8..79cbb482f0a0 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -2,7 +2,35 @@
 #ifndef _ASM_RISCV_HUGETLB_H
 #define _ASM_RISCV_HUGETLB_H
 
-#include <asm-generic/hugetlb.h>
 #include <asm/page.h>
 
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
+#define arch_make_huge_pte arch_make_huge_pte
+#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
+void set_huge_pte_at(struct mm_struct *mm,
+		     unsigned long addr, pte_t *ptep, pte_t pte);
+#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
+pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+			    unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+			       unsigned long addr, pte_t *ptep,
+			       pte_t pte, int dirty);
+#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+			     unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_HUGE_PTE_CLEAR
+void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+		    pte_t *ptep, unsigned long sz);
+#define set_huge_swap_pte_at riscv_set_huge_swap_pte_at
+void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, unsigned long sz);
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+
+#include <asm-generic/hugetlb.h>
+
 #endif /* _ASM_RISCV_HUGETLB_H */
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index ac70b0fd9a9a..1ea06476902a 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -17,7 +17,7 @@
 #define PAGE_MASK	(~(PAGE_SIZE - 1))
 
 #ifdef CONFIG_64BIT
-#define HUGE_MAX_HSTATE		2
+#define HUGE_MAX_HSTATE		3
 #else
 #define HUGE_MAX_HSTATE		1
 #endif
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 932dadfdca54..faa207826260 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -2,6 +2,239 @@
 #include <linux/hugetlb.h>
 #include <linux/err.h>
 
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+		      struct vm_area_struct *vma,
+		      unsigned long addr,
+		      unsigned long sz)
+{
+	pgd_t *pgdp = pgd_offset(mm, addr);
+	p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
+	pud_t *pudp = pud_alloc(mm, p4dp, addr);
+	pmd_t *pmdp = pmd_alloc(mm, pudp, addr);
+
+	if (sz == NAPOT_CONT64KB_SIZE) {
+		if (!pmdp)
+			return NULL;
+		WARN_ON(addr & (sz - 1));
+		return pte_alloc_map(mm, pmdp, addr);
+	}
+
+	return NULL;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm,
+		       unsigned long addr,
+		       unsigned long sz)
+{
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep = NULL;
+
+	pgdp = pgd_offset(mm, addr);
+	if (!pgd_present(READ_ONCE(*pgdp)))
+		return NULL;
+
+	p4dp = p4d_offset(pgdp, addr);
+	if (!p4d_present(READ_ONCE(*p4dp)))
+		return NULL;
+
+	pudp = pud_offset(p4dp, addr);
+	if (!pud_present(READ_ONCE(*pudp)))
+		return NULL;
+
+	pmdp = pmd_offset(pudp, addr);
+	if (!pmd_present(READ_ONCE(*pmdp)))
+		return NULL;
+
+	if (sz == NAPOT_CONT64KB_SIZE)
+		ptep = pte_offset_kernel(pmdp, (addr & ~NAPOT_CONT64KB_MASK));
+
+	return ptep;
+}
+
+static int napot_pte_num(pte_t pte)
+{
+	if (pte_val(pte) & _PAGE_NAPOT)
+		return NAPOT_64KB_PTE_NUM;
+
+	pr_warn("%s: unrecognized napot pte size 0x%lx\n",
+		__func__, pte_val(pte));
+	return 1;
+}
+
+static pte_t get_clear_flush(struct mm_struct *mm,
+			     unsigned long addr,
+			     pte_t *ptep,
+			     unsigned long pte_num)
+{
+	pte_t orig_pte = huge_ptep_get(ptep);
+	bool valid = pte_val(orig_pte);
+	unsigned long i, saddr = addr;
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
+		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
+
+		if (pte_dirty(pte))
+			orig_pte = pte_mkdirty(orig_pte);
+
+		if (pte_young(pte))
+			orig_pte = pte_mkyoung(orig_pte);
+	}
+
+	if (valid) {
+		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+
+		flush_tlb_range(&vma, saddr, addr);
+	}
+	return orig_pte;
+}
+
+static void clear_flush(struct mm_struct *mm,
+			unsigned long addr,
+			pte_t *ptep,
+			unsigned long pte_num)
+{
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+	unsigned long i, saddr = addr;
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		pte_clear(mm, addr, ptep);
+
+	flush_tlb_range(&vma, saddr, addr);
+}
+
+pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
+{
+	if (shift == NAPOT_CONT64KB_SHIFT)
+		entry = pte_mknapot(entry, NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
+
+	return entry;
+}
+
+void set_huge_pte_at(struct mm_struct *mm,
+		     unsigned long addr,
+		     pte_t *ptep,
+		     pte_t pte)
+{
+	int i;
+	int pte_num;
+
+	if (!pte_napot(pte)) {
+		set_pte_at(mm, addr, ptep, pte);
+		return;
+	}
+
+	pte_num = napot_pte_num(pte);
+	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
+		set_pte_at(mm, addr, ptep, pte);
+}
+
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       pte_t *ptep,
+			       pte_t pte,
+			       int dirty)
+{
+	pte_t orig_pte;
+	int i;
+	int pte_num;
+
+	if (!pte_napot(pte))
+		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+	pte_num = napot_pte_num(pte);
+	ptep = huge_pte_offset(vma->vm_mm, addr, NAPOT_CONT64KB_SIZE);
+	orig_pte = huge_ptep_get(ptep);
+
+	if (pte_dirty(orig_pte))
+		pte = pte_mkdirty(pte);
+
+	if (pte_young(orig_pte))
+		pte = pte_mkyoung(pte);
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+	return true;
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+			      unsigned long addr,
+			      pte_t *ptep)
+{
+	int pte_num;
+	pte_t orig_pte = huge_ptep_get(ptep);
+
+	if (!pte_napot(orig_pte))
+		return ptep_get_and_clear(mm, addr, ptep);
+
+	pte_num = napot_pte_num(orig_pte);
+	return get_clear_flush(mm, addr, ptep, pte_num);
+}
+
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+			     unsigned long addr,
+			     pte_t *ptep)
+{
+	int i;
+	int pte_num;
+	pte_t pte = READ_ONCE(*ptep);
+
+	if (!pte_napot(pte))
+		return ptep_set_wrprotect(mm, addr, ptep);
+
+	pte_num = napot_pte_num(pte);
+	ptep = huge_pte_offset(mm, addr, NAPOT_CONT64KB_SIZE);
+
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		ptep_set_wrprotect(mm, addr, ptep);
+}
+
+pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+			    unsigned long addr,
+			    pte_t *ptep)
+{
+	int pte_num;
+	pte_t pte = READ_ONCE(*ptep);
+
+	if (!pte_napot(pte)) {
+		ptep_clear_flush(vma, addr, ptep);
+		return pte;
+	}
+
+	pte_num = napot_pte_num(pte);
+	clear_flush(vma->vm_mm, addr, ptep, pte_num);
+
+	return pte;
+}
+
+void huge_pte_clear(struct mm_struct *mm,
+		    unsigned long addr,
+		    pte_t *ptep,
+		    unsigned long sz)
+{
+	int i, pte_num;
+
+	pte_num = napot_pte_num(READ_ONCE(*ptep));
+	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+		pte_clear(mm, addr, ptep);
+}
+
+void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
+				pte_t *ptep, pte_t pte, unsigned long sz)
+{
+	int i, pte_num;
+
+	pte_num = napot_pte_num(READ_ONCE(*ptep));
+
+	for (i = 0; i < pte_num; i++, ptep++)
+		set_pte(ptep, pte);
+}
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+
 int pud_huge(pud_t pud)
 {
 	return pud_leaf(pud);
@@ -18,17 +251,26 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
 		return true;
 	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
 		return true;
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+	else if (has_svnapot() && size == NAPOT_CONT64KB_SIZE)
+		return true;
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
 	else
 		return false;
 }
 
-#ifdef CONFIG_CONTIG_ALLOC
-static __init int gigantic_pages_init(void)
+static __init int hugetlbpage_init(void)
 {
+#ifdef CONFIG_CONTIG_ALLOC
 	/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
 	if (IS_ENABLED(CONFIG_64BIT))
 		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+#endif /*CONFIG_CONTIG_ALLOC*/
+	hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+	if (has_svnapot())
+		hugetlb_add_hstate(NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
 	return 0;
 }
-arch_initcall(gigantic_pages_init);
-#endif
+arch_initcall(hugetlbpage_init);
-- 
2.35.1



* [PATCH v5 4/4] mm: support Svnapot in huge vmap
  2022-10-03 13:47 [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime panqinglin2020
                   ` (2 preceding siblings ...)
  2022-10-03 13:47 ` [PATCH v5 3/4] mm: support Svnapot in hugetlb page panqinglin2020
@ 2022-10-03 13:47 ` panqinglin2020
  2022-10-04 18:46   ` Conor Dooley
  2022-10-03 13:53 ` [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime Qinglin Pan
  4 siblings, 1 reply; 26+ messages in thread
From: panqinglin2020 @ 2022-10-03 13:47 UTC (permalink / raw)
  To: palmer, linux-riscv; +Cc: jeff, xuyinan, Qinglin Pan

From: Qinglin Pan <panqinglin2020@iscas.ac.cn>

The HAVE_ARCH_HUGE_VMAP option can be used to implement arch-specific
huge vmap sizes. This commit selects this option by default and
implements arch_vmap_pte_range_map_size for the Svnapot 64KB size.

It can be tested by booting the kernel in qemu with a pci device
attached, which makes the kernel call the pci driver through ioremap,
so the new function gets exercised.
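
For example (assuming the qemu "virt" machine, and picking an arbitrary
emulated PCI NIC), booting with something like

	qemu-system-riscv64 -M virt ... -device e1000

should make the NIC driver ioremap its BARs and exercise the path above,
provided the resulting mapping is large enough and 64KB-aligned.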

Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3d5ec1391046..571f77b16ee8 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -70,6 +70,7 @@ config RISCV
 	select GENERIC_TIME_VSYSCALL if MMU && 64BIT
 	select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL
 	select HAVE_ARCH_KASAN if MMU && 64BIT
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index c3fc3c661699..1740d859331a 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -748,6 +748,43 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
+
+static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+	return 0;
+}
+
+static inline void p4d_clear_huge(p4d_t *p4d) { }
+
+static inline int pud_clear_huge(pud_t *pud)
+{
+	return 0;
+}
+
+static inline int pmd_clear_huge(pmd_t *pmd)
+{
+	return 0;
+}
+
+static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
+{
+	return 0;
+}
+
+static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+	return 0;
+}
+
+static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+	return 0;
+}
+
 /*
  * Encode and decode a swap entry
  *
diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h
index ff9abc00d139..ecd1f784299b 100644
--- a/arch/riscv/include/asm/vmalloc.h
+++ b/arch/riscv/include/asm/vmalloc.h
@@ -1,4 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
 #ifndef _ASM_RISCV_VMALLOC_H
 #define _ASM_RISCV_VMALLOC_H
 
+#include <linux/pgtable.h>
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+#define arch_vmap_pte_range_map_size vmap_pte_range_map_size
+static inline unsigned long
+vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn,
+			unsigned int max_page_shift)
+{
+	if (!has_svnapot())
+		return PAGE_SIZE;
+
+	if (addr & NAPOT_CONT64KB_MASK)
+		return PAGE_SIZE;
+
+	if (pfn & (NAPOT_64KB_PTE_NUM - 1UL))
+		return PAGE_SIZE;
+
+	if ((end - addr) < NAPOT_CONT64KB_SIZE)
+		return PAGE_SIZE;
+
+	if (max_page_shift < NAPOT_CONT64KB_SHIFT)
+		return PAGE_SIZE;
+
+	return NAPOT_CONT64KB_SIZE;
+}
+#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+
 #endif /* _ASM_RISCV_VMALLOC_H */
-- 
2.35.1



* Re: [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime
  2022-10-03 13:47 [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime panqinglin2020
                   ` (3 preceding siblings ...)
  2022-10-03 13:47 ` [PATCH v5 4/4] mm: support Svnapot in huge vmap panqinglin2020
@ 2022-10-03 13:53 ` Qinglin Pan
  4 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-03 13:53 UTC (permalink / raw)
  To: palmer, linux-riscv; +Cc: jeff, xuyinan

Sorry for missing the information below.

Changes in v2:
   - detect Svnapot hardware support at boot time.
Changes in v3:
   - do linear mapping again if has_svnapot
Changes in v4:
   - fix some errors/warns reported by checkpatch.pl, thanks @Conor
Changes in v5:
   - modify code according to @Conor and @Heiko's suggestions.
     Thanks @Conor @Heiko


On 10/3/22 9:47 PM, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Svnapot is a RISC-V extension for marking a contiguous range of 4K pages
> as a single non-4K page. This patch set uses Svnapot in the Linux
> kernel's boot process and in hugetlb fs.
> 
> This patchset adds a Kconfig item for using Svnapot in
> "Platform type"->"SVNAPOT extension support". Its default value is off,
> and people can turn it on if they want to allow the kernel to detect
> Svnapot hardware support and leverage it.
> 
> Tested on:
>    - qemu rv64 with "Svnapot support" off and svnapot=true.
>    - qemu rv64 with "Svnapot support" on and svnapot=true.
>    - qemu rv64 with "Svnapot support" off and svnapot=false.
>    - qemu rv64 with "Svnapot support" on and svnapot=false.
> 
> 
> Qinglin Pan (4):
>    mm: modify pte format for Svnapot
>    mm: support Svnapot in physical page linear-mapping
>    mm: support Svnapot in hugetlb page
>    mm: support Svnapot in huge vmap
> 
>   arch/riscv/Kconfig                   |  17 +-
>   arch/riscv/include/asm/errata_list.h |  23 ++-
>   arch/riscv/include/asm/hugetlb.h     |  30 +++-
>   arch/riscv/include/asm/hwcap.h       |   3 +-
>   arch/riscv/include/asm/page.h        |   2 +-
>   arch/riscv/include/asm/pgtable-64.h  |  13 ++
>   arch/riscv/include/asm/pgtable.h     |  68 +++++++-
>   arch/riscv/include/asm/vmalloc.h     |  28 +++
>   arch/riscv/kernel/cpu.c              |   1 +
>   arch/riscv/kernel/cpufeature.c       |  18 ++
>   arch/riscv/mm/hugetlbpage.c          | 250 ++++++++++++++++++++++++++-
>   arch/riscv/mm/init.c                 |  28 ++-
>   12 files changed, 467 insertions(+), 14 deletions(-)
> 



* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-03 13:47 ` [PATCH v5 1/4] mm: modify pte format for Svnapot panqinglin2020
@ 2022-10-04 17:00   ` Andrew Jones
  2022-10-04 17:09     ` Conor Dooley
  2022-10-04 18:33     ` Conor Dooley
  0 siblings, 2 replies; 26+ messages in thread
From: Andrew Jones @ 2022-10-04 17:00 UTC (permalink / raw)
  To: panqinglin2020; +Cc: palmer, linux-riscv, jeff, xuyinan

On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> This commit adds two errata entries to enable/disable svnapot support,
> patching code dynamically when "svnapot" is in the "riscv,isa" field of
> the fdt and the SVNAPOT compile option is set. It influences the
> behavior of the has_svnapot and pte_pfn functions. All code that depends
> on svnapot should first make sure that has_svnapot returns true.
> 
> Also, this commit modifies the PTE definition for Svnapot, and creates
> some functions in pgtable.h to mark a PTE as napot and check whether it
> is a Svnapot PTE. Until now, only the 64KB napot size is supported in
> the draft spec, so some macros have only a 64KB version.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index d557cc50295d..4354024aae21 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -383,6 +383,20 @@ config RISCV_ISA_C
>  
>  	  If you don't know what to do here, say Y.
>  
> +config RISCV_ISA_SVNAPOT
> +	bool "SVNAPOT extension support"
> +	depends on 64BIT && MMU
> +	select RISCV_ALTERNATIVE
> +	default n

What's the reason for this to be default no? If there's a trade-off to be
made which requires opt-in, then it should be described in the text below.

> +	help
> +	  Say Y if you want to allow the kernel to detect the SVNAPOT
> +	  ISA extension dynamically at boot time and enable its usage.
> +
> +	  SVNAPOT extension helps to mark contiguous PTEs as a range
> +	  of contiguous virtual-to-physical translations, with a naturally
> +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
> +	  size.
> +
>  config RISCV_ISA_SVPBMT
>  	bool "SVPBMT extension support"
>  	depends on 64BIT && MMU
> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> index 19a771085781..f3aff5ef52e4 100644
> --- a/arch/riscv/include/asm/errata_list.h
> +++ b/arch/riscv/include/asm/errata_list.h
> @@ -22,7 +22,8 @@
>  
>  #define	CPUFEATURE_SVPBMT 0
>  #define	CPUFEATURE_ZICBOM 1
> -#define	CPUFEATURE_NUMBER 2
> +#define	CPUFEATURE_SVNAPOT 2
> +#define	CPUFEATURE_NUMBER 3

nit: Maybe take the opportunity to tab out the numbers to get them all
lined up. And tab enough for future longer names too.

>  
>  #ifdef __ASSEMBLY__
>  
> @@ -142,6 +143,26 @@ asm volatile(ALTERNATIVE_2(						\
>  	    "r"((unsigned long)(_start) + (_size))			\
>  	: "a0")
>  
> +#define ALT_SVNAPOT(_val)						\
> +asm(ALTERNATIVE("li %0, 0", "li %0, 1", 0,				\
> +		CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)		\
> +		: "=r"(_val) :)
> +
> +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
> +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
> +		"srli %0, %0, %2\n\t"					\
> +		__nops(3),						\
> +		"srli t3, %0, %3\n\t"					\
> +		"and %0, %0, %1\n\t"					\
> +		"srli %0, %0, %2\n\t"					\
> +		"sub  t4, %0, t3\n\t"					\
> +		"and  %0, %0, t4",					\

This implements

  temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
  pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));

which for a non-napot pte just returns the same as the non-napot
case would, but for a napot pte we return the same as the non-napot
case but with its first set bit cleared, wherever that first set
bit was. Can you explain to me why that's what we want to do?

> +		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
> +		: "+r"(_val)						\
> +		: "r"(_pfn_mask),					\
> +		  "i"(_pfn_shift),					\
> +		  "i"(_napot_shift))

Should add t3 and t4 to clobbers list.

> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif
> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> index 6f59ec64175e..83b8e21a3573 100644
> --- a/arch/riscv/include/asm/hwcap.h
> +++ b/arch/riscv/include/asm/hwcap.h
> @@ -54,10 +54,11 @@ extern unsigned long elf_hwcap;
>   */
>  enum riscv_isa_ext_id {
>  	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
> +	RISCV_ISA_EXT_SSTC,
> +	RISCV_ISA_EXT_SVNAPOT,
>  	RISCV_ISA_EXT_SVPBMT,
>  	RISCV_ISA_EXT_ZICBOM,
>  	RISCV_ISA_EXT_ZIHINTPAUSE,
> -	RISCV_ISA_EXT_SSTC,

Opportunistically alphabetizing an enum is a bit risky. It'd be better if
this was a separate patch because, even though nothing should care about
the order, if something does care about the order, it'd be easier to
debug if this change had been made alone.

>  	RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX,
>  };
>  
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index dc42375c2357..25ec541192e5 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -74,6 +74,19 @@ typedef struct {
>   */
>  #define _PAGE_PFN_MASK  GENMASK(53, 10)
>  
> +/*
> + * [63] Svnapot definitions:
> + * 0 Svnapot disabled
> + * 1 Svnapot enabled
> + */
> +#define _PAGE_NAPOT_SHIFT 63
> +#define _PAGE_NAPOT      BIT(_PAGE_NAPOT_SHIFT)
> +#define NAPOT_CONT64KB_ORDER 4UL
> +#define NAPOT_CONT64KB_SHIFT (NAPOT_CONT64KB_ORDER + PAGE_SHIFT)
> +#define NAPOT_CONT64KB_SIZE BIT(NAPOT_CONT64KB_SHIFT)
> +#define NAPOT_CONT64KB_MASK (NAPOT_CONT64KB_SIZE - 1UL)
> +#define NAPOT_64KB_PTE_NUM BIT(NAPOT_CONT64KB_ORDER)

Please make three lined up columns out of these defines.

#define DEF		VAL

> +
>  /*
>   * [62:61] Svpbmt Memory Type definitions:
>   *
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 7ec936910a96..c3fc3c661699 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -264,10 +264,39 @@ static inline pte_t pud_pte(pud_t pud)
>  	return __pte(pud_val(pud));
>  }
>  
> +static inline bool has_svnapot(void)
> +{
> +	u64 _val;
> +
> +	ALT_SVNAPOT(_val);

Why aren't we using the isa extension static key framework for this?

> +	return _val;
> +}
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +
> +static inline unsigned long pte_napot(pte_t pte)
> +{
> +	return pte_val(pte) & _PAGE_NAPOT;
> +}
> +
> +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> +{
> +	int pos = order - 1 + _PAGE_PFN_SHIFT;
> +	unsigned long napot_bit = BIT(pos);
> +	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> +
> +	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> +}
> +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
> +
>  /* Yields the page frame number (PFN) of a page table entry */
>  static inline unsigned long pte_pfn(pte_t pte)
>  {
> -	return __page_val_to_pfn(pte_val(pte));
> +	unsigned long _val  = pte_val(pte);
> +
> +	ALT_SVNAPOT_PTE_PFN(_val, _PAGE_NAPOT_SHIFT,
> +			    _PAGE_PFN_MASK, _PAGE_PFN_SHIFT);
> +	return _val;
>  }
>  
>  #define pte_page(x)     pfn_to_page(pte_pfn(x))
> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> index 0be8a2403212..d2a61122c595 100644
> --- a/arch/riscv/kernel/cpu.c
> +++ b/arch/riscv/kernel/cpu.c
> @@ -96,6 +96,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
>  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>  	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
>  	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
> +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),

If we want to do a separate isa extension alphabetizing patch, then this
array would be a good one to do since it's the output order.

>  	__RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX),
>  };
>  
> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> index 553d755483ed..3557c5cc6f04 100644
> --- a/arch/riscv/kernel/cpufeature.c
> +++ b/arch/riscv/kernel/cpufeature.c
> @@ -204,6 +204,7 @@ void __init riscv_fill_hwcap(void)
>  				SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
>  				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
>  				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
> +				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);

Could alphabetize this too, even though it doesn't matter.

>  			}
>  #undef SET_ISA_EXT_MAP
>  		}
> @@ -284,6 +285,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
>  	return false;
>  }
>  
> +static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
> +{
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	switch (stage) {
> +	case RISCV_ALTERNATIVES_EARLY_BOOT:
> +		return false;
> +	default:
> +		return riscv_isa_extension_available(NULL, SVNAPOT);
> +	}
> +#endif
> +
> +	return false;
> +}
> +
>  /*
>   * Probe presence of individual extensions.
>   *
> @@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
>  	if (cpufeature_probe_zicbom(stage))
>  		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);
>  
> +	if (cpufeature_probe_svnapot(stage))
> +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
> +
>  	return cpu_req_feature;
>  }
>  
> -- 
> 2.35.1
>

Thanks,
drew


* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-04 17:00   ` Andrew Jones
@ 2022-10-04 17:09     ` Conor Dooley
  2022-10-04 18:33     ` Conor Dooley
  1 sibling, 0 replies; 26+ messages in thread
From: Conor Dooley @ 2022-10-04 17:09 UTC (permalink / raw)
  To: Andrew Jones; +Cc: panqinglin2020, palmer, linux-riscv, jeff, xuyinan

On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
> On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
> > From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> > 
> > This commit adds two errata entries to enable/disable svnapot support,
> > patching code dynamically when "svnapot" is in the "riscv,isa" field of
> > the fdt and the SVNAPOT compile option is set. It influences the
> > behavior of the has_svnapot and pte_pfn functions. All code that depends
> > on svnapot should first make sure that has_svnapot returns true.
> > 
> > Also, this commit modifies the PTE definition for Svnapot, and creates
> > some functions in pgtable.h to mark a PTE as napot and check whether it
> > is a Svnapot PTE. Until now, only the 64KB napot size is supported in
> > the draft spec, so some macros have only a 64KB version.

> > diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> > index 0be8a2403212..d2a61122c595 100644
> > --- a/arch/riscv/kernel/cpu.c
> > +++ b/arch/riscv/kernel/cpu.c
> > @@ -96,6 +96,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
> >  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> >  	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
> >  	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
> > +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
> 
> If we want to do a separate isa extension alphabetizing patch, then this
> array would be a good one to do since it's the output order.

Just quickly jumping in to say that Palmer's already sent one:
https://lore.kernel.org/all/20220920204518.10988-1-palmer@rivosinc.com/

Thanks,
Conor.



* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-04 17:00   ` Andrew Jones
  2022-10-04 17:09     ` Conor Dooley
@ 2022-10-04 18:33     ` Conor Dooley
  2022-10-05  4:43       ` Qinglin Pan
  2022-10-05  9:52       ` Heiko Stübner
  1 sibling, 2 replies; 26+ messages in thread
From: Conor Dooley @ 2022-10-04 18:33 UTC (permalink / raw)
  To: Andrew Jones; +Cc: panqinglin2020, palmer, linux-riscv, jeff, xuyinan

On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
> On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
> > From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> > 
> > This commit adds two errata entries to enable/disable svnapot support,
> > patching code dynamically when "svnapot" is in the "riscv,isa" field of
> > the fdt and the SVNAPOT compile option is set. It influences the
> > behavior of the has_svnapot and pte_pfn functions. All code that depends
> > on svnapot should first make sure that has_svnapot returns true.
> > 
> > Also, this commit modifies the PTE definition for Svnapot, and creates
> > some functions in pgtable.h to mark a PTE as napot and check whether it
> > is a Svnapot PTE. Until now, only the 64KB napot size is supported in
> > the draft spec, so some macros have only a 64KB version.
> > 
> > Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> > 
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index d557cc50295d..4354024aae21 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -383,6 +383,20 @@ config RISCV_ISA_C
> >  
> >  	  If you don't know what to do here, say Y.
> >  
> > +config RISCV_ISA_SVNAPOT
> > +	bool "SVNAPOT extension support"
> > +	depends on 64BIT && MMU
> > +	select RISCV_ALTERNATIVE
> > +	default n
> 
> What's the reason for this to be default no? If there's a trade-off to be
> made which requires opt-in, then it should be described in the text below.

In addition, existing extensions are default y unless they depend on a
compiler option.

> > +	help
> > +	  Say Y if you want to allow the kernel to detect the SVNAPOT
> > +	  ISA extension dynamically at boot time and enable its usage.
> > +
> > +	  SVNAPOT extension helps to mark contiguous PTEs as a range
> > +	  of contiguous virtual-to-physical translations, with a naturally
> > +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
> > +	  size.
> > +
> >  config RISCV_ISA_SVPBMT
> >  	bool "SVPBMT extension support"
> >  	depends on 64BIT && MMU
> > diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> > index 19a771085781..f3aff5ef52e4 100644
> > --- a/arch/riscv/include/asm/errata_list.h
> > +++ b/arch/riscv/include/asm/errata_list.h
> > @@ -22,7 +22,8 @@
> >  
> >  #define	CPUFEATURE_SVPBMT 0
> >  #define	CPUFEATURE_ZICBOM 1
> > -#define	CPUFEATURE_NUMBER 2
> > +#define	CPUFEATURE_SVNAPOT 2
> > +#define	CPUFEATURE_NUMBER 3
> 
> nit: Maybe take the opportunity to tab out the numbers to get them all
> lined up. And tab enough for future longer names too.

s/maybe// ;)

> 
> >  
> >  #ifdef __ASSEMBLY__
> >  
> > @@ -142,6 +143,26 @@ asm volatile(ALTERNATIVE_2(						\
> >  	    "r"((unsigned long)(_start) + (_size))			\
> >  	: "a0")
> >  
> > +#define ALT_SVNAPOT(_val)						\
> > +asm(ALTERNATIVE("li %0, 0", "li %0, 1", 0,				\
> > +		CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)		\
> > +		: "=r"(_val) :)
> > +
> > +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
> > +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
> > +		"srli %0, %0, %2\n\t"					\
> > +		__nops(3),						\
> > +		"srli t3, %0, %3\n\t"					\
> > +		"and %0, %0, %1\n\t"					\
> > +		"srli %0, %0, %2\n\t"					\
> > +		"sub  t4, %0, t3\n\t"					\
> > +		"and  %0, %0, t4",					\
> 
> This implements
> 
>   temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
>   pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));
> 
> which for a non-napot pte just returns the same as the non-napot
> case would, but for a napot pte we return the same as the non-napot
> case but with its first set bit cleared, wherever that first set
> bit was. Can you explain to me why that's what we want to do?
> 
> > +		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
> > +		: "+r"(_val)						\
> > +		: "r"(_pfn_mask),					\
> > +		  "i"(_pfn_shift),					\
> > +		  "i"(_napot_shift))
> 
> Should add t3 and t4 to clobbers list.
> 
> > +
> >  #endif /* __ASSEMBLY__ */
> >  
> >  #endif
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 6f59ec64175e..83b8e21a3573 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -54,10 +54,11 @@ extern unsigned long elf_hwcap;
> >   */
> >  enum riscv_isa_ext_id {
> >  	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
> > +	RISCV_ISA_EXT_SSTC,
> > +	RISCV_ISA_EXT_SVNAPOT,
> >  	RISCV_ISA_EXT_SVPBMT,
> >  	RISCV_ISA_EXT_ZICBOM,
> >  	RISCV_ISA_EXT_ZIHINTPAUSE,
> > -	RISCV_ISA_EXT_SSTC,
> 
> Opportunistically alphabetizing an enum is a bit risky. It'd be better if
> this was a separate patch because, even though nothing should care about
> the order, if something does care about the order, it'd be easier to
> debug if this change had been made alone.

I think for these things it is s/alphabetical/canonical order, given the
patch from Palmer I linked in my previous mail. Although thinking about
it, that means V before S, so SV before SS? Which his patch did not do:
https://lore.kernel.org/linux-riscv/20220920204518.10988-1-palmer@rivosinc.com/

> 
> >  	RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX,
> >  };
> >  
> > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> > index dc42375c2357..25ec541192e5 100644
> > --- a/arch/riscv/include/asm/pgtable-64.h
> > +++ b/arch/riscv/include/asm/pgtable-64.h
> > @@ -74,6 +74,19 @@ typedef struct {
> >   */
> >  #define _PAGE_PFN_MASK  GENMASK(53, 10)
> >  
> > +/*
> > + * [63] Svnapot definitions:
> > + * 0 Svnapot disabled
> > + * 1 Svnapot enabled
> > + */
> > +#define _PAGE_NAPOT_SHIFT 63
> > +#define _PAGE_NAPOT      BIT(_PAGE_NAPOT_SHIFT)
> > +#define NAPOT_CONT64KB_ORDER 4UL
> > +#define NAPOT_CONT64KB_SHIFT (NAPOT_CONT64KB_ORDER + PAGE_SHIFT)
> > +#define NAPOT_CONT64KB_SIZE BIT(NAPOT_CONT64KB_SHIFT)
> > +#define NAPOT_CONT64KB_MASK (NAPOT_CONT64KB_SIZE - 1UL)
> > +#define NAPOT_64KB_PTE_NUM BIT(NAPOT_CONT64KB_ORDER)
> 
> Please make three lined up columns out of these defines.
> 
> #define DEF		VAL
> 
> > +
> >  /*
> >   * [62:61] Svpbmt Memory Type definitions:
> >   *
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index 7ec936910a96..c3fc3c661699 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -264,10 +264,39 @@ static inline pte_t pud_pte(pud_t pud)
> >  	return __pte(pud_val(pud));
> >  }
> >  
> > +static inline bool has_svnapot(void)
> > +{
> > +	u64 _val;
> > +
> > +	ALT_SVNAPOT(_val);
> 
> Why aren't we using the isa extension static key framework for this?
> 
> > +	return _val;
> > +}
> > +
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > +
> > +static inline unsigned long pte_napot(pte_t pte)
> > +{
> > +	return pte_val(pte) & _PAGE_NAPOT;
> > +}
> > +
> > +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> > +{
> > +	int pos = order - 1 + _PAGE_PFN_SHIFT;
> > +	unsigned long napot_bit = BIT(pos);
> > +	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> > +
> > +	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> > +}
> > +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
> > +
> >  /* Yields the page frame number (PFN) of a page table entry */
> >  static inline unsigned long pte_pfn(pte_t pte)
> >  {
> > -	return __page_val_to_pfn(pte_val(pte));
> > +	unsigned long _val  = pte_val(pte);
> > +
> > +	ALT_SVNAPOT_PTE_PFN(_val, _PAGE_NAPOT_SHIFT,
> > +			    _PAGE_PFN_MASK, _PAGE_PFN_SHIFT);
> > +	return _val;
> >  }
> >  
> >  #define pte_page(x)     pfn_to_page(pte_pfn(x))
> > diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> > index 0be8a2403212..d2a61122c595 100644
> > --- a/arch/riscv/kernel/cpu.c
> > +++ b/arch/riscv/kernel/cpu.c
> > @@ -96,6 +96,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
> >  	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
> >  	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
> >  	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
> > +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
> 
> If we want to do a separate isa extension alphabetizing patch, then this
> array would be a good one to do since it's the output order.
> 
> >  	__RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX),
> >  };
> >  
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index 553d755483ed..3557c5cc6f04 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -204,6 +204,7 @@ void __init riscv_fill_hwcap(void)
> >  				SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
> >  				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
> >  				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
> > +				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
> 
> Could alphabetize this too, even though it doesn't matter.
> 
> >  			}
> >  #undef SET_ISA_EXT_MAP
> >  		}
> > @@ -284,6 +285,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
> >  	return false;
> >  }
> >  
> > +static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
> > +{
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT

Is there a reason that this cannot be if IS_ENABLED(RISCV_ISA_SVNAPOT)?

> > +	switch (stage) {
> > +	case RISCV_ALTERNATIVES_EARLY_BOOT:
> > +		return false;
> > +	default:
> > +		return riscv_isa_extension_available(NULL, SVNAPOT);
> > +	}
> > +#endif
> > +
> > +	return false;
> > +}
> > +
> >  /*
> >   * Probe presence of individual extensions.
> >   *
> > @@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
> >  	if (cpufeature_probe_zicbom(stage))
> >  		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);

While in the area, could this be converted to BIT(CPUFEATURE_ZICBOM)?

> >  
> > +	if (cpufeature_probe_svnapot(stage))
> > +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
> > +
> >  	return cpu_req_feature;
> >  }
> >  
> > -- 
> > 2.35.1
> >
> 
> Thanks,
> drew
> 

* Re: [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping
  2022-10-03 13:47 ` [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping panqinglin2020
@ 2022-10-04 18:40   ` Conor Dooley
  2022-10-05  2:43     ` Qinglin Pan
  2022-10-05 11:19   ` Andrew Jones
  1 sibling, 1 reply; 26+ messages in thread
From: Conor Dooley @ 2022-10-04 18:40 UTC (permalink / raw)
  To: panqinglin2020; +Cc: palmer, linux-riscv, jeff, xuyinan


Hey Qinglin Pan,

Other comments aside, it'd be good to add previous reviewers to the CC
list on follow-up versions.

On Mon, Oct 03, 2022 at 09:47:19PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> mm: modify pte format for Svnapot

"riscv: mm: foo" please.

> 
> Svnapot is powerful when a physical region is going to be mapped to a
> virtual region. The kernel does exactly this when mapping all allocable
> physical pages to the kernel vm space. This commit modifies the

s/This commit modifies/Modify

> create_pte_mapping function used in the linear-mapping procedure, so
> the kernel can use Svnapot when both the address and the length of the
> physical region are 64KB-aligned. This code is executed only when no
> other huge page size is suitable, so it serves as an addition to the
> PMD_SIZE and PUD_SIZE mappings.
> 
> This commit also modifies the best_map_size function to return a

s/This commit also modifies/Modify/

Although, given the "also", should this be two patches, or is there a
compile-time dependency?

> map_size for each chunk of a region instead of only once, so a memory
> region can be mapped using both PMD_SIZE and the 64KB napot size.
> 
> It is tested by setting qemu's memory to a 262272k region, and the
> kernel can boot successfully.
> 
> Currently, the modified create_pte_mapping will never make use of SVNAPOT,
> because this extension is detected in riscv_fill_hwcap and enabled in
> apply_boot_alternatives (called from setup_arch), which is called
> after setup_vm_final. We will need to support a function like

Out of curiosity, why doesn't this series add that support?
Do you intend to send a follow-up series?

> riscv_fill_hwcap_early to fill hardware capabilities earlier, and
> to try to enable SVNAPOT earlier in apply_early_boot_alternatives,
> so that we can determine SVNAPOT's presence during setup_vm_final.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index b56a0a75533f..76317bb28f29 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -373,9 +373,21 @@ static void __init create_pte_mapping(pte_t *ptep,
>  				      phys_addr_t sz, pgprot_t prot)
>  {
>  	uintptr_t pte_idx = pte_index(va);
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT

Would using IS_ENABLED() cause problems here?

> +	pte_t pte;
> +
> +	if (has_svnapot() && sz == NAPOT_CONT64KB_SIZE) {
> +		do {
> +			pte = pfn_pte(PFN_DOWN(pa), prot);
> +			ptep[pte_idx] = pte_mknapot(pte, NAPOT_CONT64KB_ORDER);
> +			pte_idx++;
> +			sz -= PAGE_SIZE;
> +		} while (sz > 0);
> +		return;
> +	}
> +#endif
>  
>  	BUG_ON(sz != PAGE_SIZE);
> -
>  	if (pte_none(ptep[pte_idx]))
>  		ptep[pte_idx] = pfn_pte(PFN_DOWN(pa), prot);
>  }
> @@ -673,10 +685,18 @@ void __init create_pgd_mapping(pgd_t *pgdp,
>  static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
>  {
>  	/* Upgrade to PMD_SIZE mappings whenever possible */
> -	if ((base & (PMD_SIZE - 1)) || (size & (PMD_SIZE - 1)))
> +	base &= PMD_SIZE - 1;
> +	if (!base && size >= PMD_SIZE)
> +		return PMD_SIZE;
> +
> +	if (!has_svnapot())
>  		return PAGE_SIZE;
>  
> -	return PMD_SIZE;
> +	base &= NAPOT_CONT64KB_SIZE - 1;
> +	if (!base && size >= NAPOT_CONT64KB_SIZE)
> +		return NAPOT_CONT64KB_SIZE;
> +
> +	return PAGE_SIZE;
>  }
>  
>  #ifdef CONFIG_XIP_KERNEL
> @@ -1111,9 +1131,9 @@ static void __init setup_vm_final(void)
>  		if (end >= __pa(PAGE_OFFSET) + memory_limit)
>  			end = __pa(PAGE_OFFSET) + memory_limit;
>  
> -		map_size = best_map_size(start, end - start);
>  		for (pa = start; pa < end; pa += map_size) {
>  			va = (uintptr_t)__va(pa);
> +			map_size = best_map_size(pa, end - pa);
>  
>  			create_pgd_mapping(swapper_pg_dir, va, pa, map_size,
>  					   pgprot_from_va(va));
> -- 
> 2.35.1
> 
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 3/4] mm: support Svnapot in hugetlb page
  2022-10-03 13:47 ` [PATCH v5 3/4] mm: support Svnapot in hugetlb page panqinglin2020
@ 2022-10-04 18:43   ` Conor Dooley
  2022-10-05  2:31     ` Qinglin Pan
  2022-10-05 13:11   ` Andrew Jones
  1 sibling, 1 reply; 26+ messages in thread
From: Conor Dooley @ 2022-10-04 18:43 UTC (permalink / raw)
  To: panqinglin2020; +Cc: palmer, linux-riscv, jeff, xuyinan

On Mon, Oct 03, 2022 at 09:47:20PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Svnapot can be used to support 64KB hugetlb pages, so it can become a new
> option when using hugetlbfs. This commit adds a basic implementation of

s/this commit adds/add

> hugetlb pages, and supports 64KB as a size in it by using Svnapot.
> 

I think the test could be kept out of the commit message & under a
--- line.

> For test, boot the kernel with a command line containing "default_hugepagesz=64K
> hugepagesz=64K hugepages=20" and run a simple test like this:
> 
> #include <stdio.h>
> #include <sys/mman.h>
> #include <linux/mman.h>
> 
> int main() {
> 	void *addr;
> 
> 	/* map one 64KB huge page */
> 	addr = mmap(NULL, 64 * 1024, PROT_WRITE | PROT_READ,
> 		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_64KB, -1, 0);
> 	printf("back from mmap\n");
> 	long *ptr = (long *)addr;
> 	unsigned int i = 0;
> 
> 	/* touch one long in each 4KB page of the 64KB mapping */
> 	for (; i < 8 * 1024; i += 512) {
> 		printf("%p\n", (void *)ptr);
> 		*ptr = 0xdeafabcd12345678;
> 		ptr += 512;
> 	}
> 	ptr = (long *)addr;
> 	i = 0;
> 
> 	/* read the pattern back to verify the mapping */
> 	for (; i < 8 * 1024; i += 512) {
> 		if (*ptr != 0xdeafabcd12345678) {
> 			printf("failed! 0x%lx\n", *ptr);
> 			break;
> 		}
> 		ptr += 512;
> 	}
> 	if (i == 8 * 1024)
> 		printf("simple test passed!\n");
> 	return 0;
> }
> 
> And it should pass.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 4354024aae21..3d5ec1391046 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -43,7 +43,7 @@ config RISCV
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>  	select ARCH_WANT_FRAME_POINTERS
> -	select ARCH_WANT_GENERAL_HUGETLB
> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>  	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>  	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
>  	select BUILDTIME_TABLE_SORT if MMU
> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
> index a5c2ca1d1cd8..79cbb482f0a0 100644
> --- a/arch/riscv/include/asm/hugetlb.h
> +++ b/arch/riscv/include/asm/hugetlb.h
> @@ -2,7 +2,35 @@
>  #ifndef _ASM_RISCV_HUGETLB_H
>  #define _ASM_RISCV_HUGETLB_H
>  
> -#include <asm-generic/hugetlb.h>
>  #include <asm/page.h>
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
> +#define arch_make_huge_pte arch_make_huge_pte
> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr, pte_t *ptep, pte_t pte);
> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr, pte_t *ptep);
> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr, pte_t *ptep);
> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr, pte_t *ptep,
> +			       pte_t pte, int dirty);
> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr, pte_t *ptep);
> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
> +		    pte_t *ptep, unsigned long sz);
> +#define set_huge_swap_pte_at riscv_set_huge_swap_pte_at
> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
> +				pte_t *ptep, pte_t pte, unsigned long sz);
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/

Maybe I am just getting old or something, but some whitespace would go a
long way here I think.

I'm not qualified for any further comments on this patch however :/
Conor.

> +
> +#include <asm-generic/hugetlb.h>
> +
>  #endif /* _ASM_RISCV_HUGETLB_H */
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index ac70b0fd9a9a..1ea06476902a 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -17,7 +17,7 @@
>  #define PAGE_MASK	(~(PAGE_SIZE - 1))
>  
>  #ifdef CONFIG_64BIT
> -#define HUGE_MAX_HSTATE		2
> +#define HUGE_MAX_HSTATE		3
>  #else
>  #define HUGE_MAX_HSTATE		1
>  #endif
> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> index 932dadfdca54..faa207826260 100644
> --- a/arch/riscv/mm/hugetlbpage.c
> +++ b/arch/riscv/mm/hugetlbpage.c
> @@ -2,6 +2,239 @@
>  #include <linux/hugetlb.h>
>  #include <linux/err.h>
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +pte_t *huge_pte_alloc(struct mm_struct *mm,
> +		      struct vm_area_struct *vma,
> +		      unsigned long addr,
> +		      unsigned long sz)
> +{
> +	pgd_t *pgdp = pgd_offset(mm, addr);
> +	p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
> +	pud_t *pudp = pud_alloc(mm, p4dp, addr);
> +	pmd_t *pmdp = pmd_alloc(mm, pudp, addr);
> +
> +	if (sz == NAPOT_CONT64KB_SIZE) {
> +		if (!pmdp)
> +			return NULL;
> +		WARN_ON(addr & (sz - 1));
> +		return pte_alloc_map(mm, pmdp, addr);
> +	}
> +
> +	return NULL;
> +}
> +
> +pte_t *huge_pte_offset(struct mm_struct *mm,
> +		       unsigned long addr,
> +		       unsigned long sz)
> +{
> +	pgd_t *pgdp;
> +	p4d_t *p4dp;
> +	pud_t *pudp;
> +	pmd_t *pmdp;
> +	pte_t *ptep = NULL;
> +
> +	pgdp = pgd_offset(mm, addr);
> +	if (!pgd_present(READ_ONCE(*pgdp)))
> +		return NULL;
> +
> +	p4dp = p4d_offset(pgdp, addr);
> +	if (!p4d_present(READ_ONCE(*p4dp)))
> +		return NULL;
> +
> +	pudp = pud_offset(p4dp, addr);
> +	if (!pud_present(READ_ONCE(*pudp)))
> +		return NULL;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +	if (!pmd_present(READ_ONCE(*pmdp)))
> +		return NULL;
> +
> +	if (sz == NAPOT_CONT64KB_SIZE)
> +		ptep = pte_offset_kernel(pmdp, (addr & ~NAPOT_CONT64KB_MASK));
> +
> +	return ptep;
> +}
> +
> +static int napot_pte_num(pte_t pte)
> +{
> +	if (pte_val(pte) & _PAGE_NAPOT)
> +		return NAPOT_64KB_PTE_NUM;
> +
> +	pr_warn("%s: unrecognized napot pte size 0x%lx\n",
> +		__func__, pte_val(pte));
> +	return 1;
> +}
> +
> +static pte_t get_clear_flush(struct mm_struct *mm,
> +			     unsigned long addr,
> +			     pte_t *ptep,
> +			     unsigned long pte_num)
> +{
> +	pte_t orig_pte = huge_ptep_get(ptep);
> +	bool valid = pte_val(orig_pte);
> +	unsigned long i, saddr = addr;
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
> +
> +		if (pte_dirty(pte))
> +			orig_pte = pte_mkdirty(orig_pte);
> +
> +		if (pte_young(pte))
> +			orig_pte = pte_mkyoung(orig_pte);
> +	}
> +
> +	if (valid) {
> +		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +
> +		flush_tlb_range(&vma, saddr, addr);
> +	}
> +	return orig_pte;
> +}
> +
> +static void clear_flush(struct mm_struct *mm,
> +			unsigned long addr,
> +			pte_t *ptep,
> +			unsigned long pte_num)
> +{
> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +	unsigned long i, saddr = addr;
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		pte_clear(mm, addr, ptep);
> +
> +	flush_tlb_range(&vma, saddr, addr);
> +}
> +
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
> +{
> +	if (shift == NAPOT_CONT64KB_SHIFT)
> +		entry = pte_mknapot(entry, NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
> +
> +	return entry;
> +}
> +
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr,
> +		     pte_t *ptep,
> +		     pte_t pte)
> +{
> +	int i;
> +	int pte_num;
> +
> +	if (!pte_napot(pte)) {
> +		set_pte_at(mm, addr, ptep, pte);
> +		return;
> +	}
> +
> +	pte_num = napot_pte_num(pte);
> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
> +		set_pte_at(mm, addr, ptep, pte);
> +}
> +
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       pte_t *ptep,
> +			       pte_t pte,
> +			       int dirty)
> +{
> +	pte_t orig_pte;
> +	int i;
> +	int pte_num;
> +
> +	if (!pte_napot(pte))
> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> +
> +	pte_num = napot_pte_num(pte);
> +	ptep = huge_pte_offset(vma->vm_mm, addr, NAPOT_CONT64KB_SIZE);
> +	orig_pte = huge_ptep_get(ptep);
> +
> +	if (pte_dirty(orig_pte))
> +		pte = pte_mkdirty(pte);
> +
> +	if (pte_young(orig_pte))
> +		pte = pte_mkyoung(pte);
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> +
> +	return true;
> +}
> +
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr,
> +			      pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t orig_pte = huge_ptep_get(ptep);
> +
> +	if (!pte_napot(orig_pte))
> +		return ptep_get_and_clear(mm, addr, ptep);
> +
> +	pte_num = napot_pte_num(orig_pte);
> +	return get_clear_flush(mm, addr, ptep, pte_num);
> +}
> +
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr,
> +			     pte_t *ptep)
> +{
> +	int i;
> +	int pte_num;
> +	pte_t pte = READ_ONCE(*ptep);
> +
> +	if (!pte_napot(pte))
> +		return ptep_set_wrprotect(mm, addr, ptep);
> +
> +	pte_num = napot_pte_num(pte);
> +	ptep = huge_pte_offset(mm, addr, NAPOT_CONT64KB_SIZE);
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		ptep_set_wrprotect(mm, addr, ptep);
> +}
> +
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr,
> +			    pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t pte = READ_ONCE(*ptep);
> +
> +	if (!pte_napot(pte)) {
> +		ptep_clear_flush(vma, addr, ptep);
> +		return pte;
> +	}
> +
> +	pte_num = napot_pte_num(pte);
> +	clear_flush(vma->vm_mm, addr, ptep, pte_num);
> +
> +	return pte;
> +}
> +
> +void huge_pte_clear(struct mm_struct *mm,
> +		    unsigned long addr,
> +		    pte_t *ptep,
> +		    unsigned long sz)
> +{
> +	int i, pte_num;
> +
> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		pte_clear(mm, addr, ptep);
> +}
> +
> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
> +				pte_t *ptep, pte_t pte, unsigned long sz)
> +{
> +	int i, pte_num;
> +
> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
> +
> +	for (i = 0; i < pte_num; i++, ptep++)
> +		set_pte(ptep, pte);
> +}
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
>  int pud_huge(pud_t pud)
>  {
>  	return pud_leaf(pud);
> @@ -18,17 +251,26 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>  		return true;
>  	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>  		return true;
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	else if (has_svnapot() && size == NAPOT_CONT64KB_SIZE)
> +		return true;
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>  	else
>  		return false;
>  }
>  
> -#ifdef CONFIG_CONTIG_ALLOC
> -static __init int gigantic_pages_init(void)
> +static __init int hugetlbpage_init(void)
>  {
> +#ifdef CONFIG_CONTIG_ALLOC
>  	/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
>  	if (IS_ENABLED(CONFIG_64BIT))
>  		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
> +#endif /*CONFIG_CONTIG_ALLOC*/
> +	hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	if (has_svnapot())
> +		hugetlb_add_hstate(NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>  	return 0;
>  }
> -arch_initcall(gigantic_pages_init);
> -#endif
> +arch_initcall(hugetlbpage_init);
> -- 
> 2.35.1
> 
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 4/4] mm: support Svnapot in huge vmap
  2022-10-03 13:47 ` [PATCH v5 4/4] mm: support Svnapot in huge vmap panqinglin2020
@ 2022-10-04 18:46   ` Conor Dooley
  2022-10-05  4:44     ` Qinglin Pan
  0 siblings, 1 reply; 26+ messages in thread
From: Conor Dooley @ 2022-10-04 18:46 UTC (permalink / raw)
  To: panqinglin2020; +Cc: palmer, linux-riscv, jeff, xuyinan

On Mon, Oct 03, 2022 at 09:47:21PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> The HAVE_ARCH_HUGE_VMAP option can be used to help implement arch-specific
> huge vmap sizes. This commit selects this option by default and
> re-writes arch_vmap_pte_range_map_size for the Svnapot 64KB size.
> 
> It can be tested by booting the kernel in qemu with a pci device, which
> makes the kernel call the pci driver using ioremap, so the
> re-written function will be called.
> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 3d5ec1391046..571f77b16ee8 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -70,6 +70,7 @@ config RISCV
>  	select GENERIC_TIME_VSYSCALL if MMU && 64BIT
>  	select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
>  	select HAVE_ARCH_AUDITSYSCALL
> +	select HAVE_ARCH_HUGE_VMAP

Maybe you should take a look at the following patchset and see how your
code interacts with it:
https://lore.kernel.org/linux-riscv/20220915065027.3501044-1-liushixin2@huawei.com/

Possible you may have some feedback for Liu Shixin or get some ideas :)

Thanks,
Conor.

>  	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
>  	select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL
>  	select HAVE_ARCH_KASAN if MMU && 64BIT
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index c3fc3c661699..1740d859331a 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -748,6 +748,43 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
>  }
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>  
> +static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
> +{
> +	return 0;
> +}
> +
> +static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
> +{
> +	return 0;
> +}
> +
> +static inline void p4d_clear_huge(p4d_t *p4d) { }
> +
> +static inline int pud_clear_huge(pud_t *pud)
> +{
> +	return 0;
> +}
> +
> +static inline int pmd_clear_huge(pmd_t *pmd)
> +{
> +	return 0;
> +}
> +
> +static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
> +{
> +	return 0;
> +}
> +
> +static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
> +{
> +	return 0;
> +}
> +
> +static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
> +{
> +	return 0;
> +}
> +
>  /*
>   * Encode and decode a swap entry
>   *
> diff --git a/arch/riscv/include/asm/vmalloc.h b/arch/riscv/include/asm/vmalloc.h
> index ff9abc00d139..ecd1f784299b 100644
> --- a/arch/riscv/include/asm/vmalloc.h
> +++ b/arch/riscv/include/asm/vmalloc.h
> @@ -1,4 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
>  #ifndef _ASM_RISCV_VMALLOC_H
>  #define _ASM_RISCV_VMALLOC_H
>  
> +#include <linux/pgtable.h>
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +#define arch_vmap_pte_range_map_size vmap_pte_range_map_size
> +static inline unsigned long
> +vmap_pte_range_map_size(unsigned long addr, unsigned long end, u64 pfn,
> +			unsigned int max_page_shift)
> +{
> +	if (!has_svnapot())
> +		return PAGE_SIZE;
> +
> +	if (addr & NAPOT_CONT64KB_MASK)
> +		return PAGE_SIZE;
> +
> +	if (pfn & (NAPOT_64KB_PTE_NUM - 1UL))
> +		return PAGE_SIZE;
> +
> +	if ((end - addr) < NAPOT_CONT64KB_SIZE)
> +		return PAGE_SIZE;
> +
> +	if (max_page_shift < NAPOT_CONT64KB_SHIFT)
> +		return PAGE_SIZE;
> +
> +	return NAPOT_CONT64KB_SIZE;
> +}
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
>  #endif /* _ASM_RISCV_VMALLOC_H */
> -- 
> 2.35.1
> 
> 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 3/4] mm: support Svnapot in hugetlb page
  2022-10-04 18:43   ` Conor Dooley
@ 2022-10-05  2:31     ` Qinglin Pan
  0 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05  2:31 UTC (permalink / raw)
  To: Conor Dooley; +Cc: palmer, linux-riscv, jeff, xuyinan

Hi Conor,

Will fix all of these in the next version :)
Thanks for your suggestions!

Qinglin.

On 10/5/22 2:43 AM, Conor Dooley wrote:
> On Mon, Oct 03, 2022 at 09:47:20PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> Svnapot can be used to support 64KB hugetlb pages, so it can become a new
>> option when using hugetlbfs. This commit adds a basic implementation of
> 
> s/this commit adds/add
> 
>> hugetlb pages, and supports 64KB as a size in it by using Svnapot.
>>
> 
> I think the test could be kept out of the commit message & under a
> --- line.
> 
>> For test, boot the kernel with a command line containing "default_hugepagesz=64K
>> hugepagesz=64K hugepages=20" and run a simple test like this:
>>
>> #include <stdio.h>
>> #include <sys/mman.h>
>> #include <linux/mman.h>
>>
>> int main() {
>> 	void *addr;
>>
>> 	/* map one 64KB huge page */
>> 	addr = mmap(NULL, 64 * 1024, PROT_WRITE | PROT_READ,
>> 		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_64KB, -1, 0);
>> 	printf("back from mmap\n");
>> 	long *ptr = (long *)addr;
>> 	unsigned int i = 0;
>>
>> 	/* touch one long in each 4KB page of the 64KB mapping */
>> 	for (; i < 8 * 1024; i += 512) {
>> 		printf("%p\n", (void *)ptr);
>> 		*ptr = 0xdeafabcd12345678;
>> 		ptr += 512;
>> 	}
>> 	ptr = (long *)addr;
>> 	i = 0;
>>
>> 	/* read the pattern back to verify the mapping */
>> 	for (; i < 8 * 1024; i += 512) {
>> 		if (*ptr != 0xdeafabcd12345678) {
>> 			printf("failed! 0x%lx\n", *ptr);
>> 			break;
>> 		}
>> 		ptr += 512;
>> 	}
>> 	if (i == 8 * 1024)
>> 		printf("simple test passed!\n");
>> 	return 0;
>> }
>>
>> And it should pass.
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 4354024aae21..3d5ec1391046 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -43,7 +43,7 @@ config RISCV
>>   	select ARCH_USE_QUEUED_RWLOCKS
>>   	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>>   	select ARCH_WANT_FRAME_POINTERS
>> -	select ARCH_WANT_GENERAL_HUGETLB
>> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>>   	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>>   	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
>>   	select BUILDTIME_TABLE_SORT if MMU
>> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
>> index a5c2ca1d1cd8..79cbb482f0a0 100644
>> --- a/arch/riscv/include/asm/hugetlb.h
>> +++ b/arch/riscv/include/asm/hugetlb.h
>> @@ -2,7 +2,35 @@
>>   #ifndef _ASM_RISCV_HUGETLB_H
>>   #define _ASM_RISCV_HUGETLB_H
>>   
>> -#include <asm-generic/hugetlb.h>
>>   #include <asm/page.h>
>>   
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
>> +#define arch_make_huge_pte arch_make_huge_pte
>> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +		     unsigned long addr, pte_t *ptep, pte_t pte);
>> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +			      unsigned long addr, pte_t *ptep);
>> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +			    unsigned long addr, pte_t *ptep);
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +			       unsigned long addr, pte_t *ptep,
>> +			       pte_t pte, int dirty);
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +			     unsigned long addr, pte_t *ptep);
>> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
>> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>> +		    pte_t *ptep, unsigned long sz);
>> +#define set_huge_swap_pte_at riscv_set_huge_swap_pte_at
>> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
>> +				pte_t *ptep, pte_t pte, unsigned long sz);
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> 
> Maybe I am just getting old or something, but some whitespace would go a
> long way here I think.
> 
> I'm not qualified for any further comments on this patch however :/
> Conor.
> 
>> +
>> +#include <asm-generic/hugetlb.h>
>> +
>>   #endif /* _ASM_RISCV_HUGETLB_H */
>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>> index ac70b0fd9a9a..1ea06476902a 100644
>> --- a/arch/riscv/include/asm/page.h
>> +++ b/arch/riscv/include/asm/page.h
>> @@ -17,7 +17,7 @@
>>   #define PAGE_MASK	(~(PAGE_SIZE - 1))
>>   
>>   #ifdef CONFIG_64BIT
>> -#define HUGE_MAX_HSTATE		2
>> +#define HUGE_MAX_HSTATE		3
>>   #else
>>   #define HUGE_MAX_HSTATE		1
>>   #endif
>> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
>> index 932dadfdca54..faa207826260 100644
>> --- a/arch/riscv/mm/hugetlbpage.c
>> +++ b/arch/riscv/mm/hugetlbpage.c
>> @@ -2,6 +2,239 @@
>>   #include <linux/hugetlb.h>
>>   #include <linux/err.h>
>>   
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +pte_t *huge_pte_alloc(struct mm_struct *mm,
>> +		      struct vm_area_struct *vma,
>> +		      unsigned long addr,
>> +		      unsigned long sz)
>> +{
>> +	pgd_t *pgdp = pgd_offset(mm, addr);
>> +	p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
>> +	pud_t *pudp = pud_alloc(mm, p4dp, addr);
>> +	pmd_t *pmdp = pmd_alloc(mm, pudp, addr);
>> +
>> +	if (sz == NAPOT_CONT64KB_SIZE) {
>> +		if (!pmdp)
>> +			return NULL;
>> +		WARN_ON(addr & (sz - 1));
>> +		return pte_alloc_map(mm, pmdp, addr);
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +pte_t *huge_pte_offset(struct mm_struct *mm,
>> +		       unsigned long addr,
>> +		       unsigned long sz)
>> +{
>> +	pgd_t *pgdp;
>> +	p4d_t *p4dp;
>> +	pud_t *pudp;
>> +	pmd_t *pmdp;
>> +	pte_t *ptep = NULL;
>> +
>> +	pgdp = pgd_offset(mm, addr);
>> +	if (!pgd_present(READ_ONCE(*pgdp)))
>> +		return NULL;
>> +
>> +	p4dp = p4d_offset(pgdp, addr);
>> +	if (!p4d_present(READ_ONCE(*p4dp)))
>> +		return NULL;
>> +
>> +	pudp = pud_offset(p4dp, addr);
>> +	if (!pud_present(READ_ONCE(*pudp)))
>> +		return NULL;
>> +
>> +	pmdp = pmd_offset(pudp, addr);
>> +	if (!pmd_present(READ_ONCE(*pmdp)))
>> +		return NULL;
>> +
>> +	if (sz == NAPOT_CONT64KB_SIZE)
>> +		ptep = pte_offset_kernel(pmdp, (addr & ~NAPOT_CONT64KB_MASK));
>> +
>> +	return ptep;
>> +}
>> +
>> +static int napot_pte_num(pte_t pte)
>> +{
>> +	if (pte_val(pte) & _PAGE_NAPOT)
>> +		return NAPOT_64KB_PTE_NUM;
>> +
>> +	pr_warn("%s: unrecognized napot pte size 0x%lx\n",
>> +		__func__, pte_val(pte));
>> +	return 1;
>> +}
>> +
>> +static pte_t get_clear_flush(struct mm_struct *mm,
>> +			     unsigned long addr,
>> +			     pte_t *ptep,
>> +			     unsigned long pte_num)
>> +{
>> +	pte_t orig_pte = huge_ptep_get(ptep);
>> +	bool valid = pte_val(orig_pte);
>> +	unsigned long i, saddr = addr;
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
>> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
>> +
>> +		if (pte_dirty(pte))
>> +			orig_pte = pte_mkdirty(orig_pte);
>> +
>> +		if (pte_young(pte))
>> +			orig_pte = pte_mkyoung(orig_pte);
>> +	}
>> +
>> +	if (valid) {
>> +		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
>> +
>> +		flush_tlb_range(&vma, saddr, addr);
>> +	}
>> +	return orig_pte;
>> +}
>> +
>> +static void clear_flush(struct mm_struct *mm,
>> +			unsigned long addr,
>> +			pte_t *ptep,
>> +			unsigned long pte_num)
>> +{
>> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
>> +	unsigned long i, saddr = addr;
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		pte_clear(mm, addr, ptep);
>> +
>> +	flush_tlb_range(&vma, saddr, addr);
>> +}
>> +
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
>> +{
>> +	if (shift == NAPOT_CONT64KB_SHIFT)
>> +		entry = pte_mknapot(entry, NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
>> +
>> +	return entry;
>> +}
>> +
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +		     unsigned long addr,
>> +		     pte_t *ptep,
>> +		     pte_t pte)
>> +{
>> +	int i;
>> +	int pte_num;
>> +
>> +	if (!pte_napot(pte)) {
>> +		set_pte_at(mm, addr, ptep, pte);
>> +		return;
>> +	}
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
>> +		set_pte_at(mm, addr, ptep, pte);
>> +}
>> +
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +			       unsigned long addr,
>> +			       pte_t *ptep,
>> +			       pte_t pte,
>> +			       int dirty)
>> +{
>> +	pte_t orig_pte;
>> +	int i;
>> +	int pte_num;
>> +
>> +	if (!pte_napot(pte))
>> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	ptep = huge_pte_offset(vma->vm_mm, addr, NAPOT_CONT64KB_SIZE);
>> +	orig_pte = huge_ptep_get(ptep);
>> +
>> +	if (pte_dirty(orig_pte))
>> +		pte = pte_mkdirty(pte);
>> +
>> +	if (pte_young(orig_pte))
>> +		pte = pte_mkyoung(pte);
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>> +
>> +	return true;
>> +}
>> +
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +			      unsigned long addr,
>> +			      pte_t *ptep)
>> +{
>> +	int pte_num;
>> +	pte_t orig_pte = huge_ptep_get(ptep);
>> +
>> +	if (!pte_napot(orig_pte))
>> +		return ptep_get_and_clear(mm, addr, ptep);
>> +
>> +	pte_num = napot_pte_num(orig_pte);
>> +	return get_clear_flush(mm, addr, ptep, pte_num);
>> +}
>> +
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +			     unsigned long addr,
>> +			     pte_t *ptep)
>> +{
>> +	int i;
>> +	int pte_num;
>> +	pte_t pte = READ_ONCE(*ptep);
>> +
>> +	if (!pte_napot(pte))
>> +		return ptep_set_wrprotect(mm, addr, ptep);
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	ptep = huge_pte_offset(mm, addr, NAPOT_CONT64KB_SIZE);
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		ptep_set_wrprotect(mm, addr, ptep);
>> +}
>> +
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +			    unsigned long addr,
>> +			    pte_t *ptep)
>> +{
>> +	int pte_num;
>> +	pte_t pte = READ_ONCE(*ptep);
>> +
>> +	if (!pte_napot(pte)) {
>> +		ptep_clear_flush(vma, addr, ptep);
>> +		return pte;
>> +	}
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	clear_flush(vma->vm_mm, addr, ptep, pte_num);
>> +
>> +	return pte;
>> +}
>> +
>> +void huge_pte_clear(struct mm_struct *mm,
>> +		    unsigned long addr,
>> +		    pte_t *ptep,
>> +		    unsigned long sz)
>> +{
>> +	int i, pte_num;
>> +
>> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		pte_clear(mm, addr, ptep);
>> +}
>> +
>> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
>> +				pte_t *ptep, pte_t pte, unsigned long sz)
>> +{
>> +	int i, pte_num;
>> +
>> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
>> +
>> +	for (i = 0; i < pte_num; i++, ptep++)
>> +		set_pte(ptep, pte);
>> +}
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>>   int pud_huge(pud_t pud)
>>   {
>>   	return pud_leaf(pud);
>> @@ -18,17 +251,26 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>>   		return true;
>>   	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>>   		return true;
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +	else if (has_svnapot() && size == NAPOT_CONT64KB_SIZE)
>> +		return true;
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>>   	else
>>   		return false;
>>   }
>>   
>> -#ifdef CONFIG_CONTIG_ALLOC
>> -static __init int gigantic_pages_init(void)
>> +static __init int hugetlbpage_init(void)
>>   {
>> +#ifdef CONFIG_CONTIG_ALLOC
>>   	/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
>>   	if (IS_ENABLED(CONFIG_64BIT))
>>   		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
>> +#endif /*CONFIG_CONTIG_ALLOC*/
>> +	hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +	if (has_svnapot())
>> +		hugetlb_add_hstate(NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>>   	return 0;
>>   }
>> -arch_initcall(gigantic_pages_init);
>> -#endif
>> +arch_initcall(hugetlbpage_init);
>> -- 
>> 2.35.1
>>
>>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping
  2022-10-04 18:40   ` Conor Dooley
@ 2022-10-05  2:43     ` Qinglin Pan
  0 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05  2:43 UTC (permalink / raw)
  To: Conor Dooley; +Cc: palmer, linux-riscv, jeff, xuyinan

Hi Conor,

On 10/5/22 2:40 AM, Conor Dooley wrote:
> 
> Hey Qinglin Pan,
> 
> Other comments aside, it'd be good to add previous reviewers to the CC
> list on follow-up versions.
> 

Will do it in the next version:)

> On Mon, Oct 03, 2022 at 09:47:19PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>> mm: modify pte format for Svnapot
> 
> "riscv: mm: foo" please.
> 
>>
>> Svnapot is powerful when a physical region is going to be mapped to a
>> virtual region. The kernel does this when mapping all allocable
>> physical pages to kernel vm space. This commit modifies the
> 
> s/This commit modifies/Modify
> 
>> create_pte_mapping function used in the linear-mapping procedure, so the
>> kernel can use Svnapot when both the address and length of the
>> physical region are 64KB-aligned. Code here will be executed only when
>> no other huge page size is suitable, so it can be an addition to the
>> PMD_SIZE and PUD_SIZE mappings.
>>
>> This commit also modifies the best_map_size function to return a map_size
> 
> s/This commit also modifies/Modify/
> 
> Although, with the "also" should this be two patches or is there a
> compile time dependency?
> 

The above typo will be repaired in the next version.

>> many times instead of only once, so a memory region can be mapped with
>> both PMD_SIZE and the 64KB napot size.
>>
>> It is tested by setting qemu's memory to a 262272k region, and the
>> kernel boots successfully.
>>
>> Currently, the modified create_pte_mapping will never make use of SVNAPOT,
>> because this extension is detected in riscv_fill_hwcap and enabled in
>> apply_boot_alternatives (called from setup_arch), which is called
>> after setup_vm_final. We will need to support a function like
> 
> Out of curiosity, why doesn't this series add the support?

Because I am not familiar with parsing the fdt without memory allocation :(
It may delay the merging of this patchset. I will try to do this in
a follow-up series :)
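
For what it's worth, a rough sketch of the kind of early probe I have in
mind, assuming the flattened-DT helpers are usable at that point (which
still needs checking); the hook name riscv_early_svnapot is made up for
illustration, and the strstr() match is a simplification of proper
multi-letter extension parsing:

#include <linux/of_fdt.h>
#include <linux/string.h>

static bool early_svnapot __initdata;

static int __init dt_scan_svnapot(unsigned long node, const char *uname,
				  int depth, void *data)
{
	/* cpu nodes are the only ones carrying a riscv,isa string */
	const char *isa = of_get_flat_dt_prop(node, "riscv,isa", NULL);

	/* simplification: real code must split the string on '_' */
	if (isa && strstr(isa, "svnapot"))
		early_svnapot = true;

	return 0;
}

/* hypothetical hook, called before setup_vm_final() */
static void __init riscv_early_svnapot(void)
{
	of_scan_flat_dt(dt_scan_svnapot, NULL);
}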

> Do you intend sending a follow up series?
> 
>> riscv_fill_hwcap_early to fill hardware capabilities earlier, and
>> try to enable SVNAPOT earlier in apply_early_boot_alternatives,
>> so that we can determine SVNAPOT's presence during setup_vm_final.
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index b56a0a75533f..76317bb28f29 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -373,9 +373,21 @@ static void __init create_pte_mapping(pte_t *ptep,
>>   				      phys_addr_t sz, pgprot_t prot)
>>   {
>>   	uintptr_t pte_idx = pte_index(va);
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> 
> Would using IS_ENABLED() cause problems here?

Yes, will use IS_ENABLED() in the next version.

Qinglin



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-04 18:33     ` Conor Dooley
@ 2022-10-05  4:43       ` Qinglin Pan
  2022-10-05  6:57         ` Conor Dooley
  2022-10-05  7:15         ` Andrew Jones
  2022-10-05  9:52       ` Heiko Stübner
  1 sibling, 2 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05  4:43 UTC (permalink / raw)
  To: Conor Dooley, Andrew Jones; +Cc: palmer, linux-riscv, jeff, xuyinan

Hi Conor and Andrew,

On 10/5/22 2:33 AM, Conor Dooley wrote:
> On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
>> On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
>>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>>
>>> This commit adds two errata to enable/disable svnapot support, and patches
>>> code dynamically when "svnapot" is in the "riscv,isa" field of the fdt and
>>> the SVNAPOT compile option is set. It will influence the behavior of
>>> the has_svnapot function and the pte_pfn function. All code dependent on svnapot
>>> should first make sure that has_svnapot returns true.
>>>
>>> Also, this commit modifies the PTE definition for Svnapot, and creates some
>>> functions in pgtable.h to mark a PTE as napot and check if it is a Svnapot
>>> PTE. Until now, only the 64KB napot size is supported in the draft spec, so some
>>> macros have only a 64KB version.
>>>
>>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>>
>>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>>> index d557cc50295d..4354024aae21 100644
>>> --- a/arch/riscv/Kconfig
>>> +++ b/arch/riscv/Kconfig
>>> @@ -383,6 +383,20 @@ config RISCV_ISA_C
>>>   
>>>   	  If you don't know what to do here, say Y.
>>>   
>>> +config RISCV_ISA_SVNAPOT
>>> +	bool "SVNAPOT extension support"
>>> +	depends on 64BIT && MMU
>>> +	select RISCV_ALTERNATIVE
>>> +	default n
>>
>> What's the reason for this to be default no? If there's a trade-off to be
>> made which requires opt-in, then it should be described in the text below.
> 
> In addition, existing extensions are default y unless they depend on a
> compiler option.

Got it. I will set the default value to y :)

> 
>>> +	help
>>> +	  Say Y if you want to allow kernel to detect SVNAPOT ISA-extension
>>> +	  dynamically in boot time and enable its usage.
>>> +
>>> +	  SVNAPOT extension helps to mark contiguous PTEs as a range
>>> +	  of contiguous virtual-to-physical translations, with a naturally
>>> +	  aligned power-of-2 (NAPOT) granularity larger than the base 4KB page
>>> +	  size.
>>> +
>>>   config RISCV_ISA_SVPBMT
>>>   	bool "SVPBMT extension support"
>>>   	depends on 64BIT && MMU
>>> diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
>>> index 19a771085781..f3aff5ef52e4 100644
>>> --- a/arch/riscv/include/asm/errata_list.h
>>> +++ b/arch/riscv/include/asm/errata_list.h
>>> @@ -22,7 +22,8 @@
>>>   
>>>   #define	CPUFEATURE_SVPBMT 0
>>>   #define	CPUFEATURE_ZICBOM 1
>>> -#define	CPUFEATURE_NUMBER 2
>>> +#define	CPUFEATURE_SVNAPOT 2
>>> +#define	CPUFEATURE_NUMBER 3
>>
>> nit: Maybe take the opportunity to tab out the numbers to get them all
>> lined up. And tab enough for future longer names too.
> 
> s/maybe// ;)
> 

Thanks:)
Will fix this lining-up and the one below in the next version.

>>
>>>   
>>>   #ifdef __ASSEMBLY__
>>>   
>>> @@ -142,6 +143,26 @@ asm volatile(ALTERNATIVE_2(						\
>>>   	    "r"((unsigned long)(_start) + (_size))			\
>>>   	: "a0")
>>>   
>>> +#define ALT_SVNAPOT(_val)						\
>>> +asm(ALTERNATIVE("li %0, 0", "li %0, 1", 0,				\
>>> +		CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)		\
>>> +		: "=r"(_val) :)
>>> +
>>> +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
>>> +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
>>> +		"srli %0, %0, %2\n\t"					\
>>> +		__nops(3),						\
>>> +		"srli t3, %0, %3\n\t"					\
>>> +		"and %0, %0, %1\n\t"					\
>>> +		"srli %0, %0, %2\n\t"					\
>>> +		"sub  t4, %0, t3\n\t"					\
>>> +		"and  %0, %0, t4",					\
>>
>> This implements
>>
>>    temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
>>    pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));
>>
>> which for a non-napot pte just returns the same as the non-napot
>> case would, but for a napot pte we return the same as the non-napot
>> case but with its first set bit cleared, wherever that first set
>> bit was. Can you explain to me why that's what we want to do?
>>

For a 64KB napot pte, (pte >> _PAGE_NAPOT_SHIFT) will be 1, and temp will
be something like 0xabcdabcdab8, but the correct pfn we expect is
0xabcdabcdab0. We can get it by calculating (temp & (temp - 1)).
So temp & (temp - (pte >> _PAGE_NAPOT_SHIFT)) gives the correct pfn
in both the non-napot and napot cases :)
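
To make the arithmetic concrete, here is a quick user-space sketch of the
same computation (not kernel code, riscv64 constants inlined by hand):

#include <stdio.h>

#define PFN_SHIFT	10	/* _PAGE_PFN_SHIFT */
#define NAPOT_SHIFT	63	/* _PAGE_NAPOT_SHIFT */
#define PFN_MASK	(((1UL << 44) - 1) << PFN_SHIFT) /* GENMASK(53, 10) */

static unsigned long pte_pfn(unsigned long pte)
{
	unsigned long temp = (pte & PFN_MASK) >> PFN_SHIFT;

	/* (pte >> NAPOT_SHIFT) is 1 for napot ptes, 0 otherwise */
	return temp & (temp - (pte >> NAPOT_SHIFT));
}

int main(void)
{
	unsigned long pfn = 0xabcdabcdab0UL;
	/* 64KB napot pte: bit 63 set, ppn[3] set by pte_mknapot() */
	unsigned long napot = (1UL << NAPOT_SHIFT) | ((pfn | 0x8) << PFN_SHIFT);
	unsigned long plain = pfn << PFN_SHIFT;

	printf("plain: 0x%lx\n", pte_pfn(plain)); /* 0xabcdabcdab0 */
	printf("napot: 0x%lx\n", pte_pfn(napot)); /* 0xabcdabcdab0 */
	return 0;
}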

>>> +		0, CPUFEATURE_SVNAPOT, CONFIG_RISCV_ISA_SVNAPOT)	\
>>> +		: "+r"(_val)						\
>>> +		: "r"(_pfn_mask),					\
>>> +		  "i"(_pfn_shift),					\
>>> +		  "i"(_napot_shift))
>>
>> Should add t3 and t4 to clobbers list.
>>

Will do it in the next version.

>>> +
>>>   #endif /* __ASSEMBLY__ */
>>>   
>>>   #endif
>>> diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
>>> index 6f59ec64175e..83b8e21a3573 100644
>>> --- a/arch/riscv/include/asm/hwcap.h
>>> +++ b/arch/riscv/include/asm/hwcap.h
>>> @@ -54,10 +54,11 @@ extern unsigned long elf_hwcap;
>>>    */
>>>   enum riscv_isa_ext_id {
>>>   	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
>>> +	RISCV_ISA_EXT_SSTC,
>>> +	RISCV_ISA_EXT_SVNAPOT,
>>>   	RISCV_ISA_EXT_SVPBMT,
>>>   	RISCV_ISA_EXT_ZICBOM,
>>>   	RISCV_ISA_EXT_ZIHINTPAUSE,
>>> -	RISCV_ISA_EXT_SSTC,
>>
>> Opportunistic alphabetizing an enum is a bit risky. It'd be better if this
>> was a separate patch, because, even though nothing should care about the
>> order, if something does care about the order, then it'd be easier to
>> debug if this were changed alone.
> 
> I think for these things it is s/alphabetical/canonical order, given the
> patch from Palmer I linked in my previous mail. Although thinking about
> it, that means V before S, so SV before SS? Which his patch did not do:
> https://lore.kernel.org/linux-riscv/20220920204518.10988-1-palmer@rivosinc.com/
> 

Not very sure about how to place it in canonical order :(
I will avoid reordering other enum variants in this patch, and only add
SVNAPOT just before SVPBMT. Another patch to reorder them is welcome :)

>>
>>>   	RISCV_ISA_EXT_ID_MAX = RISCV_ISA_EXT_MAX,
>>>   };
>>>   
>>> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
>>> index dc42375c2357..25ec541192e5 100644
>>> --- a/arch/riscv/include/asm/pgtable-64.h
>>> +++ b/arch/riscv/include/asm/pgtable-64.h
>>> @@ -74,6 +74,19 @@ typedef struct {
>>>    */
>>>   #define _PAGE_PFN_MASK  GENMASK(53, 10)
>>>   
>>> +/*
>>> + * [63] Svnapot definitions:
>>> + * 0 Svnapot disabled
>>> + * 1 Svnapot enabled
>>> + */
>>> +#define _PAGE_NAPOT_SHIFT 63
>>> +#define _PAGE_NAPOT      BIT(_PAGE_NAPOT_SHIFT)
>>> +#define NAPOT_CONT64KB_ORDER 4UL
>>> +#define NAPOT_CONT64KB_SHIFT (NAPOT_CONT64KB_ORDER + PAGE_SHIFT)
>>> +#define NAPOT_CONT64KB_SIZE BIT(NAPOT_CONT64KB_SHIFT)
>>> +#define NAPOT_CONT64KB_MASK (NAPOT_CONT64KB_SIZE - 1UL)
>>> +#define NAPOT_64KB_PTE_NUM BIT(NAPOT_CONT64KB_ORDER)
>>
>> Please make three lined up columns out of these defines.
>>
>> #define DEF		VAL
>>
>>> +
>>>   /*
>>>    * [62:61] Svpbmt Memory Type definitions:
>>>    *
>>> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
>>> index 7ec936910a96..c3fc3c661699 100644
>>> --- a/arch/riscv/include/asm/pgtable.h
>>> +++ b/arch/riscv/include/asm/pgtable.h
>>> @@ -264,10 +264,39 @@ static inline pte_t pud_pte(pud_t pud)
>>>   	return __pte(pud_val(pud));
>>>   }
>>>   
>>> +static inline bool has_svnapot(void)
>>> +{
>>> +	u64 _val;
>>> +
>>> +	ALT_SVNAPOT(_val);
>>
>> Why aren't we using the isa extension static key framework for this?

I am not very sure why a static key is better than an errata. If it is
really necessary, I will convert it to a static key in the next version.

>>
>>> +	return _val;
>>> +}
>>> +
>>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>>> +
>>> +static inline unsigned long pte_napot(pte_t pte)
>>> +{
>>> +	return pte_val(pte) & _PAGE_NAPOT;
>>> +}
>>> +
>>> +static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
>>> +{
>>> +	int pos = order - 1 + _PAGE_PFN_SHIFT;
>>> +	unsigned long napot_bit = BIT(pos);
>>> +	unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
>>> +
>>> +	return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
>>> +}
>>> +#endif /* CONFIG_RISCV_ISA_SVNAPOT */
>>> +
>>>   /* Yields the page frame number (PFN) of a page table entry */
>>>   static inline unsigned long pte_pfn(pte_t pte)
>>>   {
>>> -	return __page_val_to_pfn(pte_val(pte));
>>> +	unsigned long _val  = pte_val(pte);
>>> +
>>> +	ALT_SVNAPOT_PTE_PFN(_val, _PAGE_NAPOT_SHIFT,
>>> +			    _PAGE_PFN_MASK, _PAGE_PFN_SHIFT);
>>> +	return _val;
>>>   }
>>>   
>>>   #define pte_page(x)     pfn_to_page(pte_pfn(x))
>>> diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
>>> index 0be8a2403212..d2a61122c595 100644
>>> --- a/arch/riscv/kernel/cpu.c
>>> +++ b/arch/riscv/kernel/cpu.c
>>> @@ -96,6 +96,7 @@ static struct riscv_isa_ext_data isa_ext_arr[] = {
>>>   	__RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM),
>>>   	__RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE),
>>>   	__RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
>>> +	__RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
>>
>> If we want to do a separate isa extension alphabetizing patch, then this
>> array would be a good one to do since it's the output order.
>>
>>>   	__RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX),
>>>   };
>>>   
>>> diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
>>> index 553d755483ed..3557c5cc6f04 100644
>>> --- a/arch/riscv/kernel/cpufeature.c
>>> +++ b/arch/riscv/kernel/cpufeature.c
>>> @@ -204,6 +204,7 @@ void __init riscv_fill_hwcap(void)
>>>   				SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
>>>   				SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE);
>>>   				SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC);
>>> +				SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT);
>>
>> Could alphabetize this too, even though it doesn't matter.
>>
>>>   			}
>>>   #undef SET_ISA_EXT_MAP
>>>   		}
>>> @@ -284,6 +285,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
>>>   	return false;
>>>   }
>>>   
>>> +static bool __init_or_module cpufeature_probe_svnapot(unsigned int stage)
>>> +{
>>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> 
> Is there a reason that this cannot be if IS_ENABLED(RISCV_ISA_SVNAPOT)?

Will use IS_ENABLED() in the next version.

> 
>>> +	switch (stage) {
>>> +	case RISCV_ALTERNATIVES_EARLY_BOOT:
>>> +		return false;
>>> +	default:
>>> +		return riscv_isa_extension_available(NULL, SVNAPOT);
>>> +	}
>>> +#endif
>>> +
>>> +	return false;
>>> +}
>>> +
>>>   /*
>>>    * Probe presence of individual extensions.
>>>    *
>>> @@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
>>>   	if (cpufeature_probe_zicbom(stage))
>>>   		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);
> 
> While in the area, could this be converted to BIT(CPUFEATURE_ZICBOM)?

This zicbom-related code is not introduced by this patchset. It may be
better to convert it to BIT() in another patch.

Qinglin.

> 
>>>   
>>> +	if (cpufeature_probe_svnapot(stage))
>>> +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);
>>> +
>>>   	return cpu_req_feature;
>>>   }
>>>   
>>> -- 
>>> 2.35.1
>>>
>>
>> Thanks,
>> drew
>>

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 4/4] mm: support Svnapot in huge vmap
  2022-10-04 18:46   ` Conor Dooley
@ 2022-10-05  4:44     ` Qinglin Pan
  0 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05  4:44 UTC (permalink / raw)
  To: Conor Dooley; +Cc: palmer, linux-riscv, jeff, xuyinan

Hi Conor,

On 10/5/22 2:46 AM, Conor Dooley wrote:
> On Mon, Oct 03, 2022 at 09:47:21PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> The HAVE_ARCH_HUGE_VMAP option can be used to help implement arch-specific
>> huge vmap sizes. This commit selects this option by default and
>> re-writes arch_vmap_pte_range_map_size for the Svnapot 64KB size.
>>
>> It can be tested by booting the kernel in qemu with a pci device, which
>> makes the kernel call the pci driver using ioremap, so the
>> re-written function will be called.
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 3d5ec1391046..571f77b16ee8 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -70,6 +70,7 @@ config RISCV
>>   	select GENERIC_TIME_VSYSCALL if MMU && 64BIT
>>   	select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
>>   	select HAVE_ARCH_AUDITSYSCALL
>> +	select HAVE_ARCH_HUGE_VMAP
> 
> Maybe you should take a look at the following patchset and see how your
> code interacts with it:
> https://lore.kernel.org/linux-riscv/20220915065027.3501044-1-liushixin2@huawei.com/
> 
> Possible you may have some feedback for Liu Shixin or get some ideas :)
> 
> Thanks,
> Conor.
> 

Will have a look at it and try to interact with it :)
Thanks for your information again :)

Qinglin.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-05  4:43       ` Qinglin Pan
@ 2022-10-05  6:57         ` Conor Dooley
  2022-10-05  7:15         ` Andrew Jones
  1 sibling, 0 replies; 26+ messages in thread
From: Conor Dooley @ 2022-10-05  6:57 UTC (permalink / raw)
  To: Qinglin Pan
  Cc: Conor Dooley, Andrew Jones, palmer, linux-riscv, jeff, xuyinan

On Wed, Oct 05, 2022 at 12:43:01PM +0800, Qinglin Pan wrote:
> Hi Conor and Andrew,
> 
> On 10/5/22 2:33 AM, Conor Dooley wrote:
> > > >   /*
> > > >    * Probe presence of individual extensions.
> > > >    *
> > > > @@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
> > > >   	if (cpufeature_probe_zicbom(stage))
> > > >   		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);
> > 
> > While in the area, could this be converted to BIT(CPUFEATURE_ZICBOM)?
> 
> This zicbom-related code is not introduced by this patchset. It may be
> better to convert it to BIT() in another patch.
> 

Yeah, it'd be another patch - but I realised last night that Heiko
actually sent the conversion already so don't worry about it ;)
https://lore.kernel.org/all/20220905111027.2463297-1-heiko@sntech.de/

Thanks,
Conor.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-05  4:43       ` Qinglin Pan
  2022-10-05  6:57         ` Conor Dooley
@ 2022-10-05  7:15         ` Andrew Jones
  2022-10-05 12:41           ` Qinglin Pan
  1 sibling, 1 reply; 26+ messages in thread
From: Andrew Jones @ 2022-10-05  7:15 UTC (permalink / raw)
  To: Qinglin Pan; +Cc: Conor Dooley, palmer, linux-riscv, jeff, xuyinan

On Wed, Oct 05, 2022 at 12:43:01PM +0800, Qinglin Pan wrote:
> Hi Conor and Andrew,
> 
> On 10/5/22 2:33 AM, Conor Dooley wrote:
> > On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
> > > On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:

> > > > +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
> > > > +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
> > > > +		"srli %0, %0, %2\n\t"					\
> > > > +		__nops(3),						\
> > > > +		"srli t3, %0, %3\n\t"					\
> > > > +		"and %0, %0, %1\n\t"					\
> > > > +		"srli %0, %0, %2\n\t"					\
> > > > +		"sub  t4, %0, t3\n\t"					\
> > > > +		"and  %0, %0, t4",					\
> > > 
> > > This implements
> > > 
> > >    temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
> > >    pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));
> > > 
> > > which for a non-napot pte just returns the same as the non-napot
> > > case would, but for a napot pte we return the same as the non-napot
> > > case but with its first set bit cleared, wherever that first set
> > > bit was. Can you explain to me why that's what we want to do?
> > > 
> 
> For a 64KB napot pte, (pte >> _PAGE_NAPOT_SHIFT) will be 1, and temp will be
> something like 0xabcdabcdab8, but the correct pfn we expect is
> 0xabcdabcdab0. We can get it by calculating (temp & (temp - 1)).
> So temp & (temp - (pte >> _PAGE_NAPOT_SHIFT)) gives the correct pfn
> in both the non-napot and napot cases :)

I understood that and it makes sense to me for your example, where
temp = 0xabcdabcdab8, as we effectively clear the lower four bits as
expected (I think) for napot-order = 4. But, what if temp = 0xabcdabcdab0,
then we'll get 0xabcdabcdaa0 for the napot case. Is that still correct?
With the (temp & (temp - 1)) approach we'll always clear the first set
bit, wherever it is, e.g. 0xabcd0000800 would be 0xabcd0000000.  Am I
missing something about the expectations of the lower PPN bits of the PTE?
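
A tiny user-space illustration of what I mean (values hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned long ok  = 0xabcdabcdab8UL; /* ppn[3] set: valid 64KB napot */
	unsigned long bad = 0xabcdabcdab0UL; /* ppn[3] clear: encoding violated */

	printf("0x%lx\n", ok & (ok - 1));   /* 0xabcdabcdab0, expected base */
	printf("0x%lx\n", bad & (bad - 1)); /* 0xabcdabcdaa0, bogus pfn */
	return 0;
}

So, if I read it right, the scheme relies on pte_mknapot() guaranteeing
that ppn[order - 1] is always set in a napot pte.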

> > > >    */
> > > >   enum riscv_isa_ext_id {
> > > >   	RISCV_ISA_EXT_SSCOFPMF = RISCV_ISA_EXT_BASE,
> > > > +	RISCV_ISA_EXT_SSTC,
> > > > +	RISCV_ISA_EXT_SVNAPOT,
> > > >   	RISCV_ISA_EXT_SVPBMT,
> > > >   	RISCV_ISA_EXT_ZICBOM,
> > > >   	RISCV_ISA_EXT_ZIHINTPAUSE,
> > > > -	RISCV_ISA_EXT_SSTC,
> > > 
> > > Opportunistic alphabetizing an enum is a bit risky. It'd be better if this
> > > was a separate patch, because, even though nothing should care about the
> > > order, if something does care about the order, then it'd be easier to
> > > debug if this were changed alone.
> > 
> > I think for these things it is s/alphabetical/canonical order, given the
> > patch from Palmer I linked in my previous mail. Although thinking about
> > it, that means V before S, so SV before SS? Which his patch did not do:
> > https://lore.kernel.org/linux-riscv/20220920204518.10988-1-palmer@rivosinc.com/
> > 

I wonder if we should just go with alphabetical order (or no order) for
places that the order doesn't matter. In my experience it's hard to
enforce even alphabetical order when nothing breaks when the order is
"wrong". Where the order does matter we should enforce whatever that order
is supposed to be, of course. So maybe Palmer's patch isn't quite right,
but the canonical order, which is almost-alphabetic, is going to make my
head explode trying to use it...

> 
> Not very sure about how to place it in canonical order :(
> I will avoid reordering other enum variants in this patch, and only add
> SVNAPOT just before SVPBMT. Another patch to reorder them is welcome :)

Or even just put SVNAPOT at the bottom of the enum for this patch.

> > > > +
> > > >   /*
> > > >    * [62:61] Svpbmt Memory Type definitions:
> > > >    *
> > > > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > > > index 7ec936910a96..c3fc3c661699 100644
> > > > --- a/arch/riscv/include/asm/pgtable.h
> > > > +++ b/arch/riscv/include/asm/pgtable.h
> > > > @@ -264,10 +264,39 @@ static inline pte_t pud_pte(pud_t pud)
> > > >   	return __pte(pud_val(pud));
> > > >   }
> > > > +static inline bool has_svnapot(void)
> > > > +{
> > > > +	u64 _val;
> > > > +
> > > > +	ALT_SVNAPOT(_val);
> > > 
> > > Why aren't we using the isa extension static key framework for this?
> 
> I am not very sure why a static key is better than an errata. If it is
> really necessary, I will convert it to a static key in the next version.

Well the errata framework is for errata and the static key framework is
for zero-cost CPU feature checks like has_svnapot() is implementing. The
question shouldn't be why a static key is better, but rather why we would
need to abuse the errata framework for a CPU feature check.
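
To illustrate, a minimal sketch of the shape I have in mind using the
generic jump-label API; the key name is hypothetical, not an existing
symbol:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(riscv_svnapot_available);

static __always_inline bool has_svnapot(void)
{
	/* compiles to a patchable jump, so the check is effectively free */
	return static_branch_likely(&riscv_svnapot_available);
}

/* flipped once, e.g. from riscv_fill_hwcap() when "svnapot" is found */
static void __init riscv_enable_svnapot(void)
{
	static_branch_enable(&riscv_svnapot_available);
}

That gives the same zero-cost check without routing a CPU feature through
the alternatives/errata machinery.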

> > > >   /*
> > > >    * Probe presence of individual extensions.
> > > >    *
> > > > @@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
> > > >   	if (cpufeature_probe_zicbom(stage))
> > > >   		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);
> > 
> > While in the area, could this be converted to BIT(CPUFEATURE_ZICBOM)?
> 
> This zicbom-related code is not introduced by this patchset. It may be
> better to convert it to BIT() in another patch.
> 

Actually, to be type-pedantic we shouldn't use BIT() below, but rather use
the (1U << ...) like above. cpu_req_feature is a u32.
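
Concretely, BIT(nr) expands to (1UL << (nr)) in include/linux/bits.h, so
mixing it into a u32 accumulator relies on implicit truncation:

	u32 cpu_req_feature = 0;

	cpu_req_feature |= (1U << CPUFEATURE_SVNAPOT); /* u32 throughout */
	cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);    /* unsigned long,
							* truncated back to
							* u32 on assignment */

It's harmless for these small bit numbers, but the (1U << ...) form
matches the type of the variable.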

> 
> > 
> > > > +	if (cpufeature_probe_svnapot(stage))
> > > > +		cpu_req_feature |= BIT(CPUFEATURE_SVNAPOT);

Thanks,
drew


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-04 18:33     ` Conor Dooley
  2022-10-05  4:43       ` Qinglin Pan
@ 2022-10-05  9:52       ` Heiko Stübner
  1 sibling, 0 replies; 26+ messages in thread
From: Heiko Stübner @ 2022-10-05  9:52 UTC (permalink / raw)
  To: Andrew Jones, linux-riscv
  Cc: panqinglin2020, palmer, linux-riscv, jeff, xuyinan, Conor Dooley

Am Dienstag, 4. Oktober 2022, 20:33:05 CEST schrieb Conor Dooley:
> On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
> > On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
> > > From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> > > 
> > > This commit adds two errata to enable/disable svnapot support, and patches
> > > code dynamically when "svnapot" is in the "riscv,isa" field of the fdt and
> > > the SVNAPOT compile option is set. It will influence the behavior of
> > > the has_svnapot function and the pte_pfn function. All code dependent on svnapot
> > > should first make sure that has_svnapot returns true.
> > >
> > > Also, this commit modifies the PTE definition for Svnapot, and creates some
> > > functions in pgtable.h to mark a PTE as napot and check if it is a Svnapot
> > > PTE. Until now, only the 64KB napot size is supported in the draft spec, so some
> > > macros have only a 64KB version.
> > > 
> > > Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>

[...]

> > > +	switch (stage) {
> > > +	case RISCV_ALTERNATIVES_EARLY_BOOT:
> > > +		return false;
> > > +	default:
> > > +		return riscv_isa_extension_available(NULL, SVNAPOT);
> > > +	}
> > > +#endif
> > > +
> > > +	return false;
> > > +}
> > > +
> > >  /*
> > >   * Probe presence of individual extensions.
> > >   *
> > > @@ -301,6 +316,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
> > >  	if (cpufeature_probe_zicbom(stage))
> > >  		cpu_req_feature |= (1U << CPUFEATURE_ZICBOM);
> 
> While in the area, could this be converted to BIT(CPUFEATURE_ZICBOM)?

It already is [0], although it looks like I need to ping Palmer about
that series.


Heiko

[0] https://lore.kernel.org/all/20220905111027.2463297-5-heiko@sntech.de/




* Re: [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping
  2022-10-03 13:47 ` [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping panqinglin2020
  2022-10-04 18:40   ` Conor Dooley
@ 2022-10-05 11:19   ` Andrew Jones
  2022-10-05 12:45     ` Qinglin Pan
  1 sibling, 1 reply; 26+ messages in thread
From: Andrew Jones @ 2022-10-05 11:19 UTC (permalink / raw)
  To: panqinglin2020; +Cc: palmer, linux-riscv, jeff, xuyinan

On Mon, Oct 03, 2022 at 09:47:19PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Svnapot is powerful when a physical region is going to be mapped to a
> virtual region. The kernel does this when mapping all allocatable
> physical pages into the kernel vm space. This commit modifies the
> create_pte_mapping function used in the linear-mapping procedure, so
> that the kernel is able to use Svnapot when both the address and the
> length of a physical region are 64KB aligned. This code is executed
> only when no larger huge page size is suitable, so it complements the
> PMD_SIZE and PUD_SIZE mappings.
> 
> This commit also modifies the best_map_size function to return a
> map_size once per mapped chunk instead of only once overall, so a
> memory region can be mapped with both PMD_SIZE and the 64KB napot size.
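
A minimal sketch of such a per-chunk best_map_size() (the signature and
alignment checks here are assumptions for illustration, not the patch's
code):

  static uintptr_t best_map_size(phys_addr_t pa, uintptr_t va,
                                 phys_addr_t size)
  {
          if (!(pa & (PMD_SIZE - 1)) && !(va & (PMD_SIZE - 1)) &&
              size >= PMD_SIZE)
                  return PMD_SIZE;

          if (has_svnapot() &&
              !(pa & (NAPOT_CONT64KB_SIZE - 1)) &&
              !(va & (NAPOT_CONT64KB_SIZE - 1)) &&
              size >= NAPOT_CONT64KB_SIZE)
                  return NAPOT_CONT64KB_SIZE;

          return PAGE_SIZE;
  }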

I'd prefer to see the best_map_size() change for PMD_SIZE and PAGE_SIZE
be a separate patch. Then, the NAPOT_CONT64KB_SIZE support can be added
on to a ready best_map_size(). In fact, I'd prefer this patch be dropped
from this series and the best_map_size() for PMD_SIZE and PAGE_SIZE patch
be either posted as a first patch of the "use svnapot in early boot"
series or be posted alone, as it's already applicable.

> 
> It is tested by setting qemu's memory to a 262272k region, and the
> kernel can boot successfully.
> 
> Currently, the modified create_pte_mapping will never make use of
> SVNAPOT, because this extension is detected in riscv_fill_hwcap and
> enabled in apply_boot_alternatives (called from setup_arch), which runs
> after setup_vm_final. We will need a function like
> riscv_fill_hwcap_early to fill in hardware capabilities earlier, and to
> enable SVNAPOT earlier in apply_early_boot_alternatives, so that we can
> determine SVNAPOT's presence during setup_vm_final.

Thanks,
drew


* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-05  7:15         ` Andrew Jones
@ 2022-10-05 12:41           ` Qinglin Pan
  2022-10-05 13:25             ` Andrew Jones
  0 siblings, 1 reply; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05 12:41 UTC (permalink / raw)
  To: Andrew Jones; +Cc: Conor Dooley, palmer, linux-riscv, jeff, xuyinan

On 10/5/22 3:15 PM, Andrew Jones wrote:
> On Wed, Oct 05, 2022 at 12:43:01PM +0800, Qinglin Pan wrote:
>> Hi Conor and Andrew,
>>
>> On 10/5/22 2:33 AM, Conor Dooley wrote:
>>> On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
>>>> On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
> 
>>>>> +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
>>>>> +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
>>>>> +		"srli %0, %0, %2\n\t"					\
>>>>> +		__nops(3),						\
>>>>> +		"srli t3, %0, %3\n\t"					\
>>>>> +		"and %0, %0, %1\n\t"					\
>>>>> +		"srli %0, %0, %2\n\t"					\
>>>>> +		"sub  t4, %0, t3\n\t"					\
>>>>> +		"and  %0, %0, t4",					\
>>>>
>>>> This implements
>>>>
>>>>     temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
>>>>     pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));
>>>>
>>>> which for a non-napot pte just returns the same as the non-napot
>>>> case would, but for a napot pte we return the same as the non-napot
>>>> case but with its first set bit cleared, wherever that first set
>>>> bit was. Can you explain to me why that's what we want to do?
>>>>
>>
>> For 64KB napot pte, (pte >> _PAGE_NAPOT_SHIFT) will get 1, and temp will be
>> something like 0xabcdabcdab8, but the correct pfn we expect should be
>> 0xabcdabcdab0. We can get it by calculating (temp & (temp - 1)).
>> So temp & (temp - (pte >> _PAGE_NAPOT_SHIFT)) will give correct pfn
>> both in non-napot and napot case:)
> 
> I understood that and it makes sense to me for your example, where
> temp = 0xabcdabcdab8, as we effectively clear the lower four bits as
> expected (I think) for napot-order = 4. But, what if temp = 0xabcdabcdab0,
> then we'll get 0xabcdabcdaa0 for the napot case. Is that still correct?
> With the (temp & (temp - 1)) approach we'll always clear the first set
> bit, wherever it is, e.g. 0xabcd0000800 would be 0xabcd0000000.  Am I
> missing something about the expectations of the lower PPN bits of the PTE?

According to the spec, when napot-order=4, the low 3 bits of temp will
be 0, and the fourth bit from the bottom must be 1. All 16 PTEs will be
the same. We'll always need to clear the first set bit of a napot PTE's
pfn; the first set bit is used by the MMU to determine the PTE's
napot-order.
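
A quick worked check of the trick with the value above:

  unsigned long temp = 0xabcdabcdab8UL;     /* ppn bits of a 64KB napot pte */
  unsigned long pfn  = temp & (temp - 1UL); /* 0xabcdabcdab0: clears the
                                               lowest set (order-marker) bit */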

Thanks,
Qinglin



* Re: [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping
  2022-10-05 11:19   ` Andrew Jones
@ 2022-10-05 12:45     ` Qinglin Pan
  0 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05 12:45 UTC (permalink / raw)
  To: Andrew Jones; +Cc: palmer, linux-riscv, jeff, xuyinan

Hi Andrew,

On 10/5/22 7:19 PM, Andrew Jones wrote:
> On Mon, Oct 03, 2022 at 09:47:19PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> Svnapot is powerful when a physical region is going to be mapped to a
>> virtual region. The kernel does this when mapping all allocatable
>> physical pages into the kernel vm space. This commit modifies the
>> create_pte_mapping function used in the linear-mapping procedure, so
>> that the kernel is able to use Svnapot when both the address and the
>> length of a physical region are 64KB aligned. This code is executed
>> only when no larger huge page size is suitable, so it complements the
>> PMD_SIZE and PUD_SIZE mappings.
>>
>> This commit also modifies the best_map_size function to return a
>> map_size once per mapped chunk instead of only once overall, so a
>> memory region can be mapped with both PMD_SIZE and the 64KB napot size.
> 
> I'd prefer to see the best_map_size() change for PMD_SIZE and PAGE_SIZE
> be a separate patch. Then, the NAPOT_CONT64KB_SIZE support can be added
> on to a ready best_map_size(). In fact, I'd prefer this patch be dropped
> from this series and the best_map_size() for PMD_SIZE and PAGE_SIZE patch
> be either posted as a first patch of the "use svnapot in early boot"
> series or be posted alone, as it's already applicable.
> 

I agree with you. I will drop this commit from the series and post the 
best_map_size modification for PMD_SIZE and PAGE_SIZE as a standalone 
patch. I will do this in this series' v7.

Thanks,
Qinglin

>>
>> It is tested by setting qemu's memory to a 262272k region, and the
>> kernel can boot successfully.
>>
>> Currently, the modified create_pte_mapping will never make use of
>> SVNAPOT, because this extension is detected in riscv_fill_hwcap and
>> enabled in apply_boot_alternatives (called from setup_arch), which runs
>> after setup_vm_final. We will need a function like
>> riscv_fill_hwcap_early to fill in hardware capabilities earlier, and to
>> enable SVNAPOT earlier in apply_early_boot_alternatives, so that we can
>> determine SVNAPOT's presence during setup_vm_final.
> 
> Thanks,
> drew



* Re: [PATCH v5 3/4] mm: support Svnapot in hugetlb page
  2022-10-03 13:47 ` [PATCH v5 3/4] mm: support Svnapot in hugetlb page panqinglin2020
  2022-10-04 18:43   ` Conor Dooley
@ 2022-10-05 13:11   ` Andrew Jones
  2022-10-05 14:54     ` Qinglin Pan
  1 sibling, 1 reply; 26+ messages in thread
From: Andrew Jones @ 2022-10-05 13:11 UTC (permalink / raw)
  To: panqinglin2020; +Cc: palmer, linux-riscv, jeff, xuyinan

On Mon, Oct 03, 2022 at 09:47:20PM +0800, panqinglin2020@iscas.ac.cn wrote:
> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> Svnapot can be used to support 64KB hugetlb pages, so it can become a
> new option when using hugetlbfs. This commit adds a basic
> implementation of hugetlb pages and supports 64KB as a size in it by
> using Svnapot.
> 
> For testing, boot the kernel with a command line containing
> "default_hugepagesz=64K hugepagesz=64K hugepages=20" and run a simple
> test like this:
> 
> #include <stdio.h>
> #include <sys/mman.h>
> #include <linux/mman.h>	/* MAP_HUGE_64KB */
> 
> int main() {
> 	void *addr;
> 	addr = mmap(NULL, 64 * 1024, PROT_WRITE | PROT_READ,
> 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_64KB, -1, 0);
> 	printf("back from mmap \n");
> 	long *ptr = (long *)addr;
> 	unsigned int i = 0;
> 	for (; i < 8 * 1024; i += 512) {
> 		printf("%p \n", (void *)ptr);
> 		*ptr = 0xdeafabcd12345678UL;
> 		ptr += 512;
> 	}
> 	ptr = (long *)addr;
> 	i = 0;
> 	for (; i < 8 * 1024; i += 512) {
> 		if (*ptr != 0xdeafabcd12345678UL) {
> 			printf("failed! 0x%lx \n", *ptr);
> 			break;
> 		}
> 		ptr += 512;
> 	}
> 	if (i == 8 * 1024)
> 		printf("simple test passed!\n");
> 	return 0;
> }
> 
> And it should pass.

I think the above test is nearly equivalent to running

  tools/testing/selftests/vm/map_hugetlb 1 16

> 
> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
> 
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 4354024aae21..3d5ec1391046 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -43,7 +43,7 @@ config RISCV
>  	select ARCH_USE_QUEUED_RWLOCKS
>  	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>  	select ARCH_WANT_FRAME_POINTERS
> -	select ARCH_WANT_GENERAL_HUGETLB
> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>  	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>  	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
>  	select BUILDTIME_TABLE_SORT if MMU
> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
> index a5c2ca1d1cd8..79cbb482f0a0 100644
> --- a/arch/riscv/include/asm/hugetlb.h
> +++ b/arch/riscv/include/asm/hugetlb.h
> @@ -2,7 +2,35 @@
>  #ifndef _ASM_RISCV_HUGETLB_H
>  #define _ASM_RISCV_HUGETLB_H
>  
> -#include <asm-generic/hugetlb.h>
>  #include <asm/page.h>
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
> +#define arch_make_huge_pte arch_make_huge_pte

The above define should be moved above its function.

> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr, pte_t *ptep, pte_t pte);
> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr, pte_t *ptep);
> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr, pte_t *ptep);
> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr, pte_t *ptep,
> +			       pte_t pte, int dirty);
> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr, pte_t *ptep);
> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
> +		    pte_t *ptep, unsigned long sz);
> +#define set_huge_swap_pte_at riscv_set_huge_swap_pte_at
> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
> +				pte_t *ptep, pte_t pte, unsigned long sz);

Why the riscv_ prefix on this one? Wait, what is set_huge_swap_pte_at()?
It's not in asm-generic/hugetlb.h.

> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
> +#include <asm-generic/hugetlb.h>
> +
>  #endif /* _ASM_RISCV_HUGETLB_H */
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index ac70b0fd9a9a..1ea06476902a 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -17,7 +17,7 @@
>  #define PAGE_MASK	(~(PAGE_SIZE - 1))
>  
>  #ifdef CONFIG_64BIT
> -#define HUGE_MAX_HSTATE		2
> +#define HUGE_MAX_HSTATE		3
>  #else
>  #define HUGE_MAX_HSTATE		1
>  #endif
> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> index 932dadfdca54..faa207826260 100644
> --- a/arch/riscv/mm/hugetlbpage.c
> +++ b/arch/riscv/mm/hugetlbpage.c
> @@ -2,6 +2,239 @@
>  #include <linux/hugetlb.h>
>  #include <linux/err.h>
>  
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +pte_t *huge_pte_alloc(struct mm_struct *mm,
> +		      struct vm_area_struct *vma,
> +		      unsigned long addr,
> +		      unsigned long sz)
> +{
> +	pgd_t *pgdp = pgd_offset(mm, addr);
> +	p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
> +	pud_t *pudp = pud_alloc(mm, p4dp, addr);
> +	pmd_t *pmdp = pmd_alloc(mm, pudp, addr);

We should be checking all the *_alloc's for NULL returns before
progressing to the next one.
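
Something along these lines (sketch):

  pgd_t *pgdp = pgd_offset(mm, addr);
  p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
  pud_t *pudp;
  pmd_t *pmdp;

  if (!p4dp)
          return NULL;
  pudp = pud_alloc(mm, p4dp, addr);
  if (!pudp)
          return NULL;
  pmdp = pmd_alloc(mm, pudp, addr);
  if (!pmdp)
          return NULL;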

> +
> +	if (sz == NAPOT_CONT64KB_SIZE) {
> +		if (!pmdp)
> +			return NULL;
> +		WARN_ON(addr & (sz - 1));
> +		return pte_alloc_map(mm, pmdp, addr);
> +	}

What happened to the PUD_SIZE and PMD_SIZE handling that the
generic huge_pte_alloc() does? I'm pretty sure we need to
duplicate it here, at least it appears arm64 does it that way.
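
That is, roughly (a sketch of the generic behaviour, not arm64's exact
code):

  if (sz == PUD_SIZE)
          return (pte_t *)pudp;
  if (sz == PMD_SIZE)
          return (pte_t *)pmdp;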

> +
> +	return NULL;
> +}
> +
> +pte_t *huge_pte_offset(struct mm_struct *mm,
> +		       unsigned long addr,
> +		       unsigned long sz)
> +{
> +	pgd_t *pgdp;
> +	p4d_t *p4dp;
> +	pud_t *pudp;
> +	pmd_t *pmdp;
> +	pte_t *ptep = NULL;
> +
> +	pgdp = pgd_offset(mm, addr);
> +	if (!pgd_present(READ_ONCE(*pgdp)))
> +		return NULL;
> +
> +	p4dp = p4d_offset(pgdp, addr);
> +	if (!p4d_present(READ_ONCE(*p4dp)))
> +		return NULL;
> +
> +	pudp = pud_offset(p4dp, addr);
> +	if (!pud_present(READ_ONCE(*pudp)))
> +		return NULL;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +	if (!pmd_present(READ_ONCE(*pmdp)))
> +		return NULL;
> +
> +	if (sz == NAPOT_CONT64KB_SIZE)
> +		ptep = pte_offset_kernel(pmdp, (addr & ~NAPOT_CONT64KB_MASK));

Same question as above; where's the PUD and PMD size handling?

> +
> +	return ptep;
> +}
> +
> +static int napot_pte_num(pte_t pte)
> +{
> +	if (pte_val(pte) & _PAGE_NAPOT)
> +		return NAPOT_64KB_PTE_NUM;
> +
> +	pr_warn("%s: unrecognized napot pte size 0x%lx\n",
> +		__func__, pte_val(pte));

If we should never call this function with a non-napot pte, then
maybe this should be

 BUG_ON(!(pte_val(pte) & _PAGE_NAPOT));
 return NAPOT_64KB_PTE_NUM;

> +	return 1;
> +}
> +
> +static pte_t get_clear_flush(struct mm_struct *mm,
> +			     unsigned long addr,
> +			     pte_t *ptep,
> +			     unsigned long pte_num)
> +{
> +	pte_t orig_pte = huge_ptep_get(ptep);

It seems like we'd only want orig_pte = ptep_get(ptep) here (even
though I know for riscv they're the same).

> +	bool valid = pte_val(orig_pte);

I think !pte_none(orig_pte) is more idiomatic.

> +	unsigned long i, saddr = addr;
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
> +
> +		if (pte_dirty(pte))
> +			orig_pte = pte_mkdirty(orig_pte);
> +
> +		if (pte_young(pte))
> +			orig_pte = pte_mkyoung(orig_pte);
> +	}
> +
> +	if (valid) {

> I'm not sure why we only flush all the cleared ptes when the head pte
> wasn't originally cleared. And what if orig_pte had dirty and young
> bits propagated to it even though it was originally cleared?

> +		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +
> +		flush_tlb_range(&vma, saddr, addr);
> +	}

blank line here, please

> +	return orig_pte;
> +}
> +
> +static void clear_flush(struct mm_struct *mm,
> +			unsigned long addr,
> +			pte_t *ptep,
> +			unsigned long pte_num)
> +{
> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
> +	unsigned long i, saddr = addr;
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		pte_clear(mm, addr, ptep);
> +
> +	flush_tlb_range(&vma, saddr, addr);
> +}
> +
> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
> +{

riscv doesn't do anything in pte_mkhuge(), but I think I'd still prefer to
see a

  entry = pte_mkhuge(entry);

here.

> +	if (shift == NAPOT_CONT64KB_SHIFT)
> +		entry = pte_mknapot(entry, NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
> +
> +	return entry;
> +}
> +
> +void set_huge_pte_at(struct mm_struct *mm,
> +		     unsigned long addr,
> +		     pte_t *ptep,
> +		     pte_t pte)
> +{
> +	int i;
> +	int pte_num;
> +
> +	if (!pte_napot(pte)) {
> +		set_pte_at(mm, addr, ptep, pte);
> +		return;
> +	}
> +
> +	pte_num = napot_pte_num(pte);
> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
> +		set_pte_at(mm, addr, ptep, pte);
> +}
> +
> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
> +			       unsigned long addr,
> +			       pte_t *ptep,
> +			       pte_t pte,
> +			       int dirty)
> +{
> +	pte_t orig_pte;
> +	int i;
> +	int pte_num;
> +
> +	if (!pte_napot(pte))
> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> +
> +	pte_num = napot_pte_num(pte);
> +	ptep = huge_pte_offset(vma->vm_mm, addr, NAPOT_CONT64KB_SIZE);

How about replacing NAPOT_CONT64KB_SIZE with

  napot_pte_num(pte) << PAGE_SHIFT

which should allow it to work with other napot sizes?

> +	orig_pte = huge_ptep_get(ptep);
> +
> +	if (pte_dirty(orig_pte))
> +		pte = pte_mkdirty(pte);
> +
> +	if (pte_young(orig_pte))
> +		pte = pte_mkyoung(pte);
> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		ptep_set_access_flags(vma, addr, ptep, pte, dirty);
> +
> +	return true;
> +}
> +
> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
> +			      unsigned long addr,
> +			      pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t orig_pte = huge_ptep_get(ptep);
> +
> +	if (!pte_napot(orig_pte))
> +		return ptep_get_and_clear(mm, addr, ptep);
> +
> +	pte_num = napot_pte_num(orig_pte);
> +	return get_clear_flush(mm, addr, ptep, pte_num);
> +}
> +
> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
> +			     unsigned long addr,
> +			     pte_t *ptep)
> +{
> +	int i;
> +	int pte_num;
> +	pte_t pte = READ_ONCE(*ptep);

ptep_get() ?

> +
> +	if (!pte_napot(pte))
> +		return ptep_set_wrprotect(mm, addr, ptep);

ptep_set_wrprotect is a void function so this should be
written as

  ptep_set_wrprotect(...);
  return;

even though it's valid to return void from a void function...

> +
> +	pte_num = napot_pte_num(pte);
> +	ptep = huge_pte_offset(mm, addr, NAPOT_CONT64KB_SIZE);
  
Change NAPOT_CONT64KB_SIZE to napot_pte_num(pte) << PAGE_SHIFT ?

> +
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		ptep_set_wrprotect(mm, addr, ptep);
> +}
> +
> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
> +			    unsigned long addr,
> +			    pte_t *ptep)
> +{
> +	int pte_num;
> +	pte_t pte = READ_ONCE(*ptep);

ptep_get() ?

> +
> +	if (!pte_napot(pte)) {
> +		ptep_clear_flush(vma, addr, ptep);
> +		return pte;

This would look nicer as

  if (!pte_napot(pte))
     return ptep_clear_flush(vma, addr, ptep);

> +	}
> +
> +	pte_num = napot_pte_num(pte);
> +	clear_flush(vma->vm_mm, addr, ptep, pte_num);
> +
> +	return pte;

arm64 ensures the pte returned from this function has had its
dirty and young bits updated, but this function does not?

> +}
> +
> +void huge_pte_clear(struct mm_struct *mm,
> +		    unsigned long addr,
> +		    pte_t *ptep,
> +		    unsigned long sz)
> +{
> +	int i, pte_num;

No check that the pte is a napot pte?

> +
> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
> +		pte_clear(mm, addr, ptep);
> +}
> +
> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
> +				pte_t *ptep, pte_t pte, unsigned long sz)
> +{
> +	int i, pte_num;

No check that the pte is a napot pte?

> +
> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
> +
> +	for (i = 0; i < pte_num; i++, ptep++)
> +		set_pte(ptep, pte);
> +}
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
> +
>  int pud_huge(pud_t pud)
>  {
>  	return pud_leaf(pud);
> @@ -18,17 +251,26 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>  		return true;
>  	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>  		return true;
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	else if (has_svnapot() && size == NAPOT_CONT64KB_SIZE)

I think it'd be best if we avoid using explicit 64KB constants and instead
define, e.g. napot_cont_size(pte), macros which, for now, will always
return 64KB-based values.
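
For example, something like this (illustrative only; with a single
supported order these collapse back to the 64KB constants):

  #define napot_cont_shift(pte)   NAPOT_CONT64KB_SHIFT
  #define napot_cont_size(pte)    BIT(napot_cont_shift(pte))
  #define napot_cont_mask(pte)    (~(napot_cont_size(pte) - 1UL))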

> +		return true;
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>  	else
>  		return false;
>  }
>  
> -#ifdef CONFIG_CONTIG_ALLOC
> -static __init int gigantic_pages_init(void)
> +static __init int hugetlbpage_init(void)
>  {
> +#ifdef CONFIG_CONTIG_ALLOC
>  	/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
>  	if (IS_ENABLED(CONFIG_64BIT))
>  		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
> +#endif /*CONFIG_CONTIG_ALLOC*/
> +	hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);

> Where'd this come from? It looks like there should be another patch
> that just does the change of this gigantic_pages_init() function to
> a hugetlbpage function that does both gigantic (PUD) pages and
> PMD-sized pages.

> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +	if (has_svnapot())
> +		hugetlb_add_hstate(NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>  	return 0;
>  }
> -arch_initcall(gigantic_pages_init);
> -#endif
> +arch_initcall(hugetlbpage_init);
> -- 
> 2.35.1
>

Thanks,
drew


* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-05 12:41           ` Qinglin Pan
@ 2022-10-05 13:25             ` Andrew Jones
  2022-10-05 14:50               ` Qinglin Pan
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Jones @ 2022-10-05 13:25 UTC (permalink / raw)
  To: Qinglin Pan; +Cc: Conor Dooley, palmer, linux-riscv, jeff, xuyinan

On Wed, Oct 05, 2022 at 08:41:02PM +0800, Qinglin Pan wrote:
> On 10/5/22 3:15 PM, Andrew Jones wrote:
> > On Wed, Oct 05, 2022 at 12:43:01PM +0800, Qinglin Pan wrote:
> > > Hi Conor and Andrew,
> > > 
> > > On 10/5/22 2:33 AM, Conor Dooley wrote:
> > > > On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
> > > > > On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
> > 
> > > > > > +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
> > > > > > +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
> > > > > > +		"srli %0, %0, %2\n\t"					\
> > > > > > +		__nops(3),						\
> > > > > > +		"srli t3, %0, %3\n\t"					\
> > > > > > +		"and %0, %0, %1\n\t"					\
> > > > > > +		"srli %0, %0, %2\n\t"					\
> > > > > > +		"sub  t4, %0, t3\n\t"					\
> > > > > > +		"and  %0, %0, t4",					\
> > > > > 
> > > > > This implements
> > > > > 
> > > > >     temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
> > > > >     pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));
> > > > > 
> > > > > which for a non-napot pte just returns the same as the non-napot
> > > > > case would, but for a napot pte we return the same as the non-napot
> > > > > case but with its first set bit cleared, wherever that first set
> > > > > bit was. Can you explain to me why that's what we want to do?
> > > > > 
> > > 
> > > For 64KB napot pte, (pte >> _PAGE_NAPOT_SHIFT) will get 1, and temp will be
> > > something like 0xabcdabcdab8, but the correct pfn we expect should be
> > > 0xabcdabcdab0. We can get it by calculating (temp & (temp - 1)).
> > > So temp & (temp - (pte >> _PAGE_NAPOT_SHIFT)) will give correct pfn
> > > both in non-napot and napot case:)
> > 
> > I understood that and it makes sense to me for your example, where
> > temp = 0xabcdabcdab8, as we effectively clear the lower four bits as
> > expected (I think) for napot-order = 4. But, what if temp = 0xabcdabcdab0,
> > then we'll get 0xabcdabcdaa0 for the napot case. Is that still correct?
> > With the (temp & (temp - 1)) approach we'll always clear the first set
> > bit, wherever it is, e.g. 0xabcd0000800 would be 0xabcd0000000.  Am I
> > missing something about the expectations of the lower PPN bits of the PTE?
> 
> According to the spec, when napot-order=4, the low 3 bits of temp will
> be 0, and the fourth bit from the bottom must be 1. All 16 PTEs will be
> the same. We'll always need to clear the first set bit of a napot PTE's
> pfn; the first set bit is used by the MMU to determine the PTE's
> napot-order.

Thank you for the clarification. My first reading of the spec left me
confused, so I was hoping your answer would help me understand it and
it did. The assembly you've pulled together is indeed a clever way to
handle this :-)

Thanks,
drew


* Re: [PATCH v5 1/4] mm: modify pte format for Svnapot
  2022-10-05 13:25             ` Andrew Jones
@ 2022-10-05 14:50               ` Qinglin Pan
  0 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05 14:50 UTC (permalink / raw)
  To: Andrew Jones; +Cc: Conor Dooley, palmer, linux-riscv, jeff, xuyinan

On 10/5/22 9:25 PM, Andrew Jones wrote:
> On Wed, Oct 05, 2022 at 08:41:02PM +0800, Qinglin Pan wrote:
>> On 10/5/22 3:15 PM, Andrew Jones wrote:
>>> On Wed, Oct 05, 2022 at 12:43:01PM +0800, Qinglin Pan wrote:
>>>> Hi Conor and Andrew,
>>>>
>>>> On 10/5/22 2:33 AM, Conor Dooley wrote:
>>>>> On Tue, Oct 04, 2022 at 07:00:27PM +0200, Andrew Jones wrote:
>>>>>> On Mon, Oct 03, 2022 at 09:47:18PM +0800, panqinglin2020@iscas.ac.cn wrote:
>>>
>>>>>>> +#define ALT_SVNAPOT_PTE_PFN(_val, _napot_shift, _pfn_mask, _pfn_shift)	\
>>>>>>> +asm(ALTERNATIVE("and %0, %0, %1\n\t"					\
>>>>>>> +		"srli %0, %0, %2\n\t"					\
>>>>>>> +		__nops(3),						\
>>>>>>> +		"srli t3, %0, %3\n\t"					\
>>>>>>> +		"and %0, %0, %1\n\t"					\
>>>>>>> +		"srli %0, %0, %2\n\t"					\
>>>>>>> +		"sub  t4, %0, t3\n\t"					\
>>>>>>> +		"and  %0, %0, t4",					\
>>>>>>
>>>>>> This implements
>>>>>>
>>>>>>      temp = ((pte & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT);
>>>>>>      pfn = temp & (temp - (pte >> _PAGE_NAPOT_SHIFT));
>>>>>>
>>>>>> which for a non-napot pte just returns the same as the non-napot
>>>>>> case would, but for a napot pte we return the same as the non-napot
>>>>>> case but with its first set bit cleared, wherever that first set
>>>>>> bit was. Can you explain to me why that's what we want to do?
>>>>>>
>>>>
>>>> For 64KB napot pte, (pte >> _PAGE_NAPOT_SHIFT) will get 1, and temp will be
>>>> something like 0xabcdabcdab8, but the correct pfn we expect should be
>>>> 0xabcdabcdab0. We can get it by calculating (temp & (temp - 1)).
>>>> So temp & (temp - (pte >> _PAGE_NAPOT_SHIFT)) will give correct pfn
>>>> both in non-napot and napot case:)
>>>
>>> I understood that and it makes sense to me for your example, where
>>> temp = 0xabcdabcdab8, as we effectively clear the lower four bits as
>>> expected (I think) for napot-order = 4. But, what if temp = 0xabcdabcdab0,
>>> then we'll get 0xabcdabcdaa0 for the napot case. Is that still correct?
>>> With the (temp & (temp - 1)) approach we'll always clear the first set
>>> bit, wherever it is, e.g. 0xabcd0000800 would be 0xabcd0000000.  Am I
>>> missing something about the expectations of the lower PPN bits of the PTE?
>>
>> According to the spec, when napot-order=4, the low 3 bits of temp will
>> be 0, and the fourth bit from the bottom must be 1. All 16 PTEs will be
>> the same. We'll always need to clear the first set bit of a napot PTE's
>> pfn; the first set bit is used by the MMU to determine the PTE's
>> napot-order.
> 
> Thank you for the clarification. My first reading of the spec left me
> confused, so I was hoping your answer would help me understand it and
> it did. The assembly you've pulled together is indeed a clever way to
> handle this :-)
> 

You are welcome :-)

Yours,
Qinglin

> Thanks,
> drew
> 

* Re: [PATCH v5 3/4] mm: support Svnapot in hugetlb page
  2022-10-05 13:11   ` Andrew Jones
@ 2022-10-05 14:54     ` Qinglin Pan
  0 siblings, 0 replies; 26+ messages in thread
From: Qinglin Pan @ 2022-10-05 14:54 UTC (permalink / raw)
  To: Andrew Jones; +Cc: palmer, linux-riscv, jeff, xuyinan

Hi Andrew,

Thanks for so many suggestions :)
It seems that the code base has changed and my code has not kept up
with it. I will check these in detail according to your comments.

Thanks,
Qinglin

On 10/5/22 9:11 PM, Andrew Jones wrote:
> On Mon, Oct 03, 2022 at 09:47:20PM +0800, panqinglin2020@iscas.ac.cn wrote:
>> From: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> Svnapot can be used to support 64KB hugetlb pages, so it can become a
>> new option when using hugetlbfs. This commit adds a basic
>> implementation of hugetlb pages and supports 64KB as a size in it by
>> using Svnapot.
>>
>> For testing, boot the kernel with a command line containing
>> "default_hugepagesz=64K hugepagesz=64K hugepages=20" and run a simple
>> test like this:
>>
>> #include <stdio.h>
>> #include <sys/mman.h>
>> #include <linux/mman.h>	/* MAP_HUGE_64KB */
>>
>> int main() {
>> 	void *addr;
>> 	addr = mmap(NULL, 64 * 1024, PROT_WRITE | PROT_READ,
>> 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_64KB, -1, 0);
>> 	printf("back from mmap \n");
>> 	long *ptr = (long *)addr;
>> 	unsigned int i = 0;
>> 	for (; i < 8 * 1024; i += 512) {
>> 		printf("%p \n", (void *)ptr);
>> 		*ptr = 0xdeafabcd12345678UL;
>> 		ptr += 512;
>> 	}
>> 	ptr = (long *)addr;
>> 	i = 0;
>> 	for (; i < 8 * 1024; i += 512) {
>> 		if (*ptr != 0xdeafabcd12345678UL) {
>> 			printf("failed! 0x%lx \n", *ptr);
>> 			break;
>> 		}
>> 		ptr += 512;
>> 	}
>> 	if (i == 8 * 1024)
>> 		printf("simple test passed!\n");
>> 	return 0;
>> }
>>
>> And it should pass.
> 
> I think the above test is nearly equivalent to running
> 
>    tools/testing/selftests/vm/map_hugetlb 1 16
> 
>>
>> Signed-off-by: Qinglin Pan <panqinglin2020@iscas.ac.cn>
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 4354024aae21..3d5ec1391046 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -43,7 +43,7 @@ config RISCV
>>   	select ARCH_USE_QUEUED_RWLOCKS
>>   	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
>>   	select ARCH_WANT_FRAME_POINTERS
>> -	select ARCH_WANT_GENERAL_HUGETLB
>> +	select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
>>   	select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
>>   	select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
>>   	select BUILDTIME_TABLE_SORT if MMU
>> diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
>> index a5c2ca1d1cd8..79cbb482f0a0 100644
>> --- a/arch/riscv/include/asm/hugetlb.h
>> +++ b/arch/riscv/include/asm/hugetlb.h
>> @@ -2,7 +2,35 @@
>>   #ifndef _ASM_RISCV_HUGETLB_H
>>   #define _ASM_RISCV_HUGETLB_H
>>   
>> -#include <asm-generic/hugetlb.h>
>>   #include <asm/page.h>
>>   
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
>> +#define arch_make_huge_pte arch_make_huge_pte
> 
> The above define should be moved above its function.
> 
>> +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +		     unsigned long addr, pte_t *ptep, pte_t pte);
>> +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +			      unsigned long addr, pte_t *ptep);
>> +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +			    unsigned long addr, pte_t *ptep);
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +			       unsigned long addr, pte_t *ptep,
>> +			       pte_t pte, int dirty);
>> +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +			     unsigned long addr, pte_t *ptep);
>> +#define __HAVE_ARCH_HUGE_PTE_CLEAR
>> +void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
>> +		    pte_t *ptep, unsigned long sz);
>> +#define set_huge_swap_pte_at riscv_set_huge_swap_pte_at
>> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
>> +				pte_t *ptep, pte_t pte, unsigned long sz);
> 
> Why the riscv_ prefix on this one? Wait, what is set_huge_swap_pte_at()?
> It's not in asm-generic/hugetlb.h.
> 
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>> +#include <asm-generic/hugetlb.h>
>> +
>>   #endif /* _ASM_RISCV_HUGETLB_H */
>> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
>> index ac70b0fd9a9a..1ea06476902a 100644
>> --- a/arch/riscv/include/asm/page.h
>> +++ b/arch/riscv/include/asm/page.h
>> @@ -17,7 +17,7 @@
>>   #define PAGE_MASK	(~(PAGE_SIZE - 1))
>>   
>>   #ifdef CONFIG_64BIT
>> -#define HUGE_MAX_HSTATE		2
>> +#define HUGE_MAX_HSTATE		3
>>   #else
>>   #define HUGE_MAX_HSTATE		1
>>   #endif
>> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
>> index 932dadfdca54..faa207826260 100644
>> --- a/arch/riscv/mm/hugetlbpage.c
>> +++ b/arch/riscv/mm/hugetlbpage.c
>> @@ -2,6 +2,239 @@
>>   #include <linux/hugetlb.h>
>>   #include <linux/err.h>
>>   
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +pte_t *huge_pte_alloc(struct mm_struct *mm,
>> +		      struct vm_area_struct *vma,
>> +		      unsigned long addr,
>> +		      unsigned long sz)
>> +{
>> +	pgd_t *pgdp = pgd_offset(mm, addr);
>> +	p4d_t *p4dp = p4d_alloc(mm, pgdp, addr);
>> +	pud_t *pudp = pud_alloc(mm, p4dp, addr);
>> +	pmd_t *pmdp = pmd_alloc(mm, pudp, addr);
> 
> We should be checking all the *_alloc's for NULL returns before
> progressing to the next one.
> 
>> +
>> +	if (sz == NAPOT_CONT64KB_SIZE) {
>> +		if (!pmdp)
>> +			return NULL;
>> +		WARN_ON(addr & (sz - 1));
>> +		return pte_alloc_map(mm, pmdp, addr);
>> +	}
> 
> What happened to the PUD_SIZE and PMD_SIZE handling that the
> generic huge_pte_alloc() does? I'm pretty sure we need to
> duplicate it here, at least it appears arm64 does it that way.
> 
>> +
>> +	return NULL;
>> +}
>> +
>> +pte_t *huge_pte_offset(struct mm_struct *mm,
>> +		       unsigned long addr,
>> +		       unsigned long sz)
>> +{
>> +	pgd_t *pgdp;
>> +	p4d_t *p4dp;
>> +	pud_t *pudp;
>> +	pmd_t *pmdp;
>> +	pte_t *ptep = NULL;
>> +
>> +	pgdp = pgd_offset(mm, addr);
>> +	if (!pgd_present(READ_ONCE(*pgdp)))
>> +		return NULL;
>> +
>> +	p4dp = p4d_offset(pgdp, addr);
>> +	if (!p4d_present(READ_ONCE(*p4dp)))
>> +		return NULL;
>> +
>> +	pudp = pud_offset(p4dp, addr);
>> +	if (!pud_present(READ_ONCE(*pudp)))
>> +		return NULL;
>> +
>> +	pmdp = pmd_offset(pudp, addr);
>> +	if (!pmd_present(READ_ONCE(*pmdp)))
>> +		return NULL;
>> +
>> +	if (sz == NAPOT_CONT64KB_SIZE)
>> +		ptep = pte_offset_kernel(pmdp, (addr & ~NAPOT_CONT64KB_MASK));
> 
> Same question as above; where's the PUD and PMD size handling?
> 
>> +
>> +	return ptep;
>> +}
>> +
>> +static int napot_pte_num(pte_t pte)
>> +{
>> +	if (pte_val(pte) & _PAGE_NAPOT)
>> +		return NAPOT_64KB_PTE_NUM;
>> +
>> +	pr_warn("%s: unrecognized napot pte size 0x%lx\n",
>> +		__func__, pte_val(pte));
> 
> If we should never call this function with a non-napot pte, then
> maybe this should be
> 
>   BUG_ON(!(pte_val(pte) & _PAGE_NAPOT));
>   return NAPOT_64KB_PTE_NUM;
> 
>> +	return 1;
>> +}
>> +
>> +static pte_t get_clear_flush(struct mm_struct *mm,
>> +			     unsigned long addr,
>> +			     pte_t *ptep,
>> +			     unsigned long pte_num)
>> +{
>> +	pte_t orig_pte = huge_ptep_get(ptep);
> 
> It seems like we'd only want orig_pte = ptep_get(ptep) here (even
> though I know for riscv they're the same).
> 
>> +	bool valid = pte_val(orig_pte);
> 
> I think !pte_none(orig_pte) is more idiomatic.
> 
>> +	unsigned long i, saddr = addr;
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
>> +		pte_t pte = ptep_get_and_clear(mm, addr, ptep);
>> +
>> +		if (pte_dirty(pte))
>> +			orig_pte = pte_mkdirty(orig_pte);
>> +
>> +		if (pte_young(pte))
>> +			orig_pte = pte_mkyoung(orig_pte);
>> +	}
>> +
>> +	if (valid) {
> 
> I'm not sure why we only flush all the cleared ptes when the head pte
> wasn't originally cleared. And what if orig_pte had dirty and young
> bits propagated to it even though it was originally cleared?
> 
>> +		struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
>> +
>> +		flush_tlb_range(&vma, saddr, addr);
>> +	}
> 
> blank line here, please
> 
>> +	return orig_pte;
>> +}
>> +
>> +static void clear_flush(struct mm_struct *mm,
>> +			unsigned long addr,
>> +			pte_t *ptep,
>> +			unsigned long pte_num)
>> +{
>> +	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
>> +	unsigned long i, saddr = addr;
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		pte_clear(mm, addr, ptep);
>> +
>> +	flush_tlb_range(&vma, saddr, addr);
>> +}
>> +
>> +pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
>> +{
> 
> riscv doesn't do anything in pte_mkhuge(), but I think I'd still prefer to
> see a
> 
>    entry = pte_mkhuge(entry);
> 
> here.
> 
>> +	if (shift == NAPOT_CONT64KB_SHIFT)
>> +		entry = pte_mknapot(entry, NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
>> +
>> +	return entry;
>> +}
>> +
>> +void set_huge_pte_at(struct mm_struct *mm,
>> +		     unsigned long addr,
>> +		     pte_t *ptep,
>> +		     pte_t pte)
>> +{
>> +	int i;
>> +	int pte_num;
>> +
>> +	if (!pte_napot(pte)) {
>> +		set_pte_at(mm, addr, ptep, pte);
>> +		return;
>> +	}
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	for (i = 0; i < pte_num; i++, ptep++, addr += PAGE_SIZE)
>> +		set_pte_at(mm, addr, ptep, pte);
>> +}
>> +
>> +int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>> +			       unsigned long addr,
>> +			       pte_t *ptep,
>> +			       pte_t pte,
>> +			       int dirty)
>> +{
>> +	pte_t orig_pte;
>> +	int i;
>> +	int pte_num;
>> +
>> +	if (!pte_napot(pte))
>> +		return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	ptep = huge_pte_offset(vma->vm_mm, addr, NAPOT_CONT64KB_SIZE);
> 
> How about replacing NAPOT_CONT64KB_SIZE with
> 
>    napot_pte_num(pte) << PAGE_SHIFT
> 
> which should allow it to work with other napot sizes?
> 
>> +	orig_pte = huge_ptep_get(ptep);
>> +
>> +	if (pte_dirty(orig_pte))
>> +		pte = pte_mkdirty(pte);
>> +
>> +	if (pte_young(orig_pte))
>> +		pte = pte_mkyoung(pte);
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		ptep_set_access_flags(vma, addr, ptep, pte, dirty);
>> +
>> +	return true;
>> +}
>> +
>> +pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
>> +			      unsigned long addr,
>> +			      pte_t *ptep)
>> +{
>> +	int pte_num;
>> +	pte_t orig_pte = huge_ptep_get(ptep);
>> +
>> +	if (!pte_napot(orig_pte))
>> +		return ptep_get_and_clear(mm, addr, ptep);
>> +
>> +	pte_num = napot_pte_num(orig_pte);
>> +	return get_clear_flush(mm, addr, ptep, pte_num);
>> +}
>> +
>> +void huge_ptep_set_wrprotect(struct mm_struct *mm,
>> +			     unsigned long addr,
>> +			     pte_t *ptep)
>> +{
>> +	int i;
>> +	int pte_num;
>> +	pte_t pte = READ_ONCE(*ptep);
> 
> ptep_get() ?
> 
>> +
>> +	if (!pte_napot(pte))
>> +		return ptep_set_wrprotect(mm, addr, ptep);
> 
> ptep_set_wrprotect is a void function so this should be
> written as
> 
>    ptep_set_wrprotect(...);
>    return;
> 
> even though it's valid to return void from a void function...
> 
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	ptep = huge_pte_offset(mm, addr, NAPOT_CONT64KB_SIZE);
>    
> Change NAPOT_CONT64KB_SIZE to napot_pte_num(pte) << PAGE_SHIFT ?
> 
>> +
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		ptep_set_wrprotect(mm, addr, ptep);
>> +}
>> +
>> +pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
>> +			    unsigned long addr,
>> +			    pte_t *ptep)
>> +{
>> +	int pte_num;
>> +	pte_t pte = READ_ONCE(*ptep);
> 
> ptep_get() ?
> 
>> +
>> +	if (!pte_napot(pte)) {
>> +		ptep_clear_flush(vma, addr, ptep);
>> +		return pte;
> 
> This would look nicer as
> 
>    if (!pte_napot(pte))
>       return ptep_clear_flush(vma, addr, ptep);
> 
>> +	}
>> +
>> +	pte_num = napot_pte_num(pte);
>> +	clear_flush(vma->vm_mm, addr, ptep, pte_num);
>> +
>> +	return pte;
> 
> arm64 ensures the pte returned from this function has had its
> dirty and young bits updated, but this function does not?
> 
>> +}
>> +
>> +void huge_pte_clear(struct mm_struct *mm,
>> +		    unsigned long addr,
>> +		    pte_t *ptep,
>> +		    unsigned long sz)
>> +{
>> +	int i, pte_num;
> 
> No check that the pte is a napot pte?
> 
>> +
>> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
>> +	for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
>> +		pte_clear(mm, addr, ptep);
>> +}
>> +
>> +void riscv_set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
>> +				pte_t *ptep, pte_t pte, unsigned long sz)
>> +{
>> +	int i, pte_num;
> 
> No check that the pte is a napot pte?
> 
>> +
>> +	pte_num = napot_pte_num(READ_ONCE(*ptep));
>> +
>> +	for (i = 0; i < pte_num; i++, ptep++)
>> +		set_pte(ptep, pte);
>> +}
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>> +
>>   int pud_huge(pud_t pud)
>>   {
>>   	return pud_leaf(pud);
>> @@ -18,17 +251,26 @@ bool __init arch_hugetlb_valid_size(unsigned long size)
>>   		return true;
>>   	else if (IS_ENABLED(CONFIG_64BIT) && size == PUD_SIZE)
>>   		return true;
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +	else if (has_svnapot() && size == NAPOT_CONT64KB_SIZE)
> 
> I think it'd be best if we avoid using explicit 64KB constants and instead
> define, e.g. napot_cont_size(pte), macros which, for now, will always
> return 64KB-based values.
> 
>> +		return true;
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>>   	else
>>   		return false;
>>   }
>>   
>> -#ifdef CONFIG_CONTIG_ALLOC
>> -static __init int gigantic_pages_init(void)
>> +static __init int hugetlbpage_init(void)
>>   {
>> +#ifdef CONFIG_CONTIG_ALLOC
>>   	/* With CONTIG_ALLOC, we can allocate gigantic pages at runtime */
>>   	if (IS_ENABLED(CONFIG_64BIT))
>>   		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
>> +#endif /*CONFIG_CONTIG_ALLOC*/
>> +	hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
> 
> Where'd this come from? It looks like there should be another patch
> that just does the change of this gigantic_pages_init() function to
> a hugetlbpage function that does both gigantic (PUD) pages and
> PMD-sized pages.
> 
>> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
>> +	if (has_svnapot())
>> +		hugetlb_add_hstate(NAPOT_CONT64KB_SHIFT - PAGE_SHIFT);
>> +#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
>>   	return 0;
>>   }
>> -arch_initcall(gigantic_pages_init);
>> -#endif
>> +arch_initcall(hugetlbpage_init);
>> -- 
>> 2.35.1
>>
> 
> Thanks,
> drew



end of thread, other threads:[~2022-10-05 14:54 UTC | newest]

Thread overview: 26+ messages
2022-10-03 13:47 [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime panqinglin2020
2022-10-03 13:47 ` [PATCH v5 1/4] mm: modify pte format for Svnapot panqinglin2020
2022-10-04 17:00   ` Andrew Jones
2022-10-04 17:09     ` Conor Dooley
2022-10-04 18:33     ` Conor Dooley
2022-10-05  4:43       ` Qinglin Pan
2022-10-05  6:57         ` Conor Dooley
2022-10-05  7:15         ` Andrew Jones
2022-10-05 12:41           ` Qinglin Pan
2022-10-05 13:25             ` Andrew Jones
2022-10-05 14:50               ` Qinglin Pan
2022-10-05  9:52       ` Heiko Stübner
2022-10-03 13:47 ` [PATCH v5 2/4] mm: support Svnapot in physical page linear-mapping panqinglin2020
2022-10-04 18:40   ` Conor Dooley
2022-10-05  2:43     ` Qinglin Pan
2022-10-05 11:19   ` Andrew Jones
2022-10-05 12:45     ` Qinglin Pan
2022-10-03 13:47 ` [PATCH v5 3/4] mm: support Svnapot in hugetlb page panqinglin2020
2022-10-04 18:43   ` Conor Dooley
2022-10-05  2:31     ` Qinglin Pan
2022-10-05 13:11   ` Andrew Jones
2022-10-05 14:54     ` Qinglin Pan
2022-10-03 13:47 ` [PATCH v5 4/4] mm: support Svnapot in huge vmap panqinglin2020
2022-10-04 18:46   ` Conor Dooley
2022-10-05  4:44     ` Qinglin Pan
2022-10-03 13:53 ` [PATCH v5 0/4] riscv, mm: detect svnapot cpu support at runtime Qinglin Pan
