linux-mm.kvack.org archive mirror
* [RFC PATCH v3 0/2] arm64: tlb: add support for TLBI RANGE instructions
@ 2020-04-14 11:28 Zhenyu Ye
  2020-04-14 11:28 ` [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature Zhenyu Ye
  2020-04-14 11:28 ` [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64 Zhenyu Ye
  0 siblings, 2 replies; 12+ messages in thread
From: Zhenyu Ye @ 2020-04-14 11:28 UTC (permalink / raw)
  To: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof
  Cc: yezhenyu2, linux-arm-kernel, linux-kernel, linux-arch, linux-mm,
	arm, xiexiangyou, prime.zeng, zhangshaokun, kuhn.chenqun

ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
range of input addresses. This series adds support for this feature.

--
ChangeList:
v3:
rebase this series on Linux 5.7-rc1.

v2:
Link: https://lkml.org/lkml/2019/11/11/348

Zhenyu Ye (2):
  arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
  arm64: tlb: Use the TLBI RANGE feature in arm64

 arch/arm64/include/asm/cpucaps.h  |   3 +-
 arch/arm64/include/asm/sysreg.h   |   4 ++
 arch/arm64/include/asm/tlbflush.h | 108 +++++++++++++++++++++++++++++-
 arch/arm64/kernel/cpufeature.c    |  11 +++
 4 files changed, 124 insertions(+), 2 deletions(-)

-- 
2.19.1





* [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
  2020-04-14 11:28 [RFC PATCH v3 0/2] arm64: tlb: add support for TLBI RANGE instructions Zhenyu Ye
@ 2020-04-14 11:28 ` Zhenyu Ye
  2020-05-05 10:14   ` Mark Rutland
  2020-04-14 11:28 ` [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64 Zhenyu Ye
  1 sibling, 1 reply; 12+ messages in thread
From: Zhenyu Ye @ 2020-04-14 11:28 UTC (permalink / raw)
  To: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof
  Cc: yezhenyu2, linux-arm-kernel, linux-kernel, linux-arch, linux-mm,
	arm, xiexiangyou, prime.zeng, zhangshaokun, kuhn.chenqun

ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
range of input addresses. This patch detects this feature.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/include/asm/sysreg.h  |  4 ++++
 arch/arm64/kernel/cpufeature.c   | 11 +++++++++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 8eb5a088ae65..950095a72617 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -61,7 +61,8 @@
 #define ARM64_HAS_AMU_EXTN			51
 #define ARM64_HAS_ADDRESS_AUTH			52
 #define ARM64_HAS_GENERIC_AUTH			53
+#define ARM64_HAS_TLBI_RANGE			54
 
-#define ARM64_NCAPS				54
+#define ARM64_NCAPS				55
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index ebc622432831..ac1b98650234 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -592,6 +592,7 @@
 
 /* id_aa64isar0 */
 #define ID_AA64ISAR0_RNDR_SHIFT		60
+#define ID_AA64ISAR0_TLBI_RANGE_SHIFT	56
 #define ID_AA64ISAR0_TS_SHIFT		52
 #define ID_AA64ISAR0_FHM_SHIFT		48
 #define ID_AA64ISAR0_DP_SHIFT		44
@@ -605,6 +606,9 @@
 #define ID_AA64ISAR0_SHA1_SHIFT		8
 #define ID_AA64ISAR0_AES_SHIFT		4
 
+#define ID_AA64ISAR0_TLBI_RANGE_NI	0x0
+#define ID_AA64ISAR0_TLBI_RANGE		0x2
+
 /* id_aa64isar1 */
 #define ID_AA64ISAR1_I8MM_SHIFT		52
 #define ID_AA64ISAR1_DGH_SHIFT		48
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9fac745aa7bb..31bcfd0722b5 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -124,6 +124,7 @@ static bool __system_matches_cap(unsigned int n);
  */
 static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLBI_RANGE_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0),
@@ -1779,6 +1780,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.min_field_value = 1,
 	},
 #endif
+	{
+		.desc = "TLB range maintenance instruction",
+		.capability = ARM64_HAS_TLBI_RANGE,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		.sys_reg = SYS_ID_AA64ISAR0_EL1,
+		.field_pos = ID_AA64ISAR0_TLBI_RANGE_SHIFT,
+		.sign = FTR_UNSIGNED,
+		.min_field_value = ID_AA64ISAR0_TLBI_RANGE,
+	},
 	{},
 };
 
-- 
2.19.1





* [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
  2020-04-14 11:28 [RFC PATCH v3 0/2] arm64: tlb: add support for TLBI RANGE instructions Zhenyu Ye
  2020-04-14 11:28 ` [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature Zhenyu Ye
@ 2020-04-14 11:28 ` Zhenyu Ye
  2020-05-14 15:28   ` Catalin Marinas
  1 sibling, 1 reply; 12+ messages in thread
From: Zhenyu Ye @ 2020-04-14 11:28 UTC (permalink / raw)
  To: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof
  Cc: yezhenyu2, linux-arm-kernel, linux-kernel, linux-arch, linux-mm,
	arm, xiexiangyou, prime.zeng, zhangshaokun, kuhn.chenqun

Add the __TLBI_VADDR_RANGE macro and the __flush_tlb_range_directly()
interface, which uses the TLBI RANGE instructions to invalidate TLB
entries for a range of addresses directly.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
 arch/arm64/include/asm/tlb.h      |   7 +-
 arch/arm64/include/asm/tlbflush.h | 114 +++++++++++++++++++++++++++++-
 2 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index b76df828e6b7..3a1816770bd1 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -38,7 +38,12 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		return;
 	}
 
-	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
+	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
+		__flush_tlb_range_directly(&vma, tlb->start, tlb->end,
+					   stride, last_level);
+	else
+		__flush_tlb_range(&vma, tlb->start, tlb->end,
+				  stride, last_level);
 }
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bc3949064725..a482188ea563 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -59,6 +59,44 @@
 		__ta;						\
 	})
 
+/*
+ * This macro creates a properly formatted VA operand for the TLBI RANGE.
+ * The value bit assignments are:
+ *
+ * +----------+------+-------+-------+-------+----------------------+
+ * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
+ * +----------+------+-------+-------+-------+----------------------+
+ * |63      48|47  46|45   44|43   39|38   37|36                   0|
+ *
+ * The address range is determined by below formula:
+ * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
+ *
+ */
+#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)	\
+	({							\
+		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
+		__ta &= GENMASK_ULL(36, 0);			\
+		__ta |= (unsigned long)(ttl) << 37;		\
+		__ta |= (unsigned long)(num) << 39;		\
+		__ta |= (unsigned long)(scale) << 44;		\
+		__ta |= (unsigned long)(tg) << 46;		\
+		__ta |= (unsigned long)(asid) << 48;		\
+		__ta;						\
+	})
+
+#define TLB_RANGE_MASK_SHIFT 5
+#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0)
+
+/*
+ * __TG defines translation granule of the system, which is defined by
+ * PAGE_SHIFT.  Used by TTL.
+ *  - 4KB	: 1
+ *  - 16KB	: 2
+ *  - 64KB	: 3
+ */
+#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
+
+
 /*
  *	TLB Invalidation
  *	================
@@ -171,12 +209,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 	dsb(ish);
 }
 
+/* The maximum range size of one TLBI-RANGE instruction */
+#define MAX_TLBI_RANGE_SIZE	(1UL << 21)
+
+/*
+ * This interface uses the *rvale1is* instruction to flush TLBs
+ * in [start, end) directly.
+ * This instruction is supported from ARM v8.4.
+ */
+static inline void __flush_tlb_range_directly(struct vm_area_struct *vma,
+				unsigned long start, unsigned long end,
+				unsigned long stride, bool last_level)
+{
+	int num = 0;
+	int scale = 0;
+	unsigned long asid = ASID(vma->vm_mm);
+	unsigned long addr = 0;
+	unsigned long range_size;
+
+	start = round_down(start, stride);
+	end = round_up(end, stride);
+	range_size = (end - start) >> PAGE_SHIFT;
+
+	if (range_size > MAX_TLBI_RANGE_SIZE) {
+		flush_tlb_mm(vma->vm_mm);
+		return;
+	}
+
+	dsb(ishst);
+
+	/*
+	 * The minimum size of TLB RANGE is 2 PAGE;
+	 * Use normal TLB instruction to handle odd PAGEs
+	 */
+	if (range_size % 2 == 1) {
+		addr = __TLBI_VADDR(start, asid);
+		if (last_level) {
+			__tlbi(vale1is, addr);
+			__tlbi_user(vale1is, addr);
+		} else {
+			__tlbi(vae1is, addr);
+			__tlbi_user(vae1is, addr);
+		}
+		start += 1 << PAGE_SHIFT;
+		range_size -= 1;
+	}
+
+	range_size >>= 1;
+	while (range_size > 0) {
+		num = (range_size & TLB_RANGE_MASK) - 1;
+		if (num >= 0) {
+			addr = __TLBI_VADDR_RANGE(start, asid, __TG,
+						  scale, num, 0);
+			if (last_level) {
+				__tlbi(rvale1is, addr);
+				__tlbi_user(rvale1is, addr);
+			} else {
+				__tlbi(rvae1is, addr);
+				__tlbi_user(rvae1is, addr);
+			}
+			start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;
+		}
+		scale++;
+		range_size >>= TLB_RANGE_MASK_SHIFT;
+	}
+	dsb(ish);
+}
+
 /*
  * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
  * necessarily a performance improvement.
  */
 #define MAX_TLBI_OPS	PTRS_PER_PTE
 
+/*
+ * This interface uses the *vae1is* instruction to flush TLBs
+ * in [start, end) one by one.
+ */
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
 				     unsigned long stride, bool last_level)
@@ -218,7 +327,10 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 	 * We cannot use leaf-only invalidation here, since we may be invalidating
 	 * table entries as part of collapsing hugepages or moving page tables.
 	 */
-	__flush_tlb_range(vma, start, end, PAGE_SIZE, false);
+	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
+		__flush_tlb_range_directly(vma, start, end, PAGE_SIZE, false);
+	else
+		__flush_tlb_range(vma, start, end, PAGE_SIZE, false);
 }
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
-- 
2.19.1





* Re: [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
  2020-04-14 11:28 ` [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature Zhenyu Ye
@ 2020-05-05 10:14   ` Mark Rutland
  2020-05-11 12:25     ` Zhenyu Ye
  0 siblings, 1 reply; 12+ messages in thread
From: Mark Rutland @ 2020-05-05 10:14 UTC (permalink / raw)
  To: Zhenyu Ye
  Cc: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof, linux-arch, linux-kernel, xiexiangyou,
	zhangshaokun, linux-mm, arm, prime.zeng, kuhn.chenqun,
	linux-arm-kernel

On Tue, Apr 14, 2020 at 07:28:34PM +0800, Zhenyu Ye wrote:
> ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
> range of input addresses. This patch detects this feature.
> 
> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
> ---
>  arch/arm64/include/asm/cpucaps.h |  3 ++-
>  arch/arm64/include/asm/sysreg.h  |  4 ++++
>  arch/arm64/kernel/cpufeature.c   | 11 +++++++++++
>  3 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
> index 8eb5a088ae65..950095a72617 100644
> --- a/arch/arm64/include/asm/cpucaps.h
> +++ b/arch/arm64/include/asm/cpucaps.h
> @@ -61,7 +61,8 @@
>  #define ARM64_HAS_AMU_EXTN			51
>  #define ARM64_HAS_ADDRESS_AUTH			52
>  #define ARM64_HAS_GENERIC_AUTH			53
> +#define ARM64_HAS_TLBI_RANGE			54
>  
> -#define ARM64_NCAPS				54
> +#define ARM64_NCAPS				55
>  
>  #endif /* __ASM_CPUCAPS_H */
> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
> index ebc622432831..ac1b98650234 100644
> --- a/arch/arm64/include/asm/sysreg.h
> +++ b/arch/arm64/include/asm/sysreg.h
> @@ -592,6 +592,7 @@
>  
>  /* id_aa64isar0 */
>  #define ID_AA64ISAR0_RNDR_SHIFT		60
> +#define ID_AA64ISAR0_TLBI_RANGE_SHIFT	56
>  #define ID_AA64ISAR0_TS_SHIFT		52
>  #define ID_AA64ISAR0_FHM_SHIFT		48
>  #define ID_AA64ISAR0_DP_SHIFT		44
> @@ -605,6 +606,9 @@
>  #define ID_AA64ISAR0_SHA1_SHIFT		8
>  #define ID_AA64ISAR0_AES_SHIFT		4
>  
> +#define ID_AA64ISAR0_TLBI_RANGE_NI	0x0
> +#define ID_AA64ISAR0_TLBI_RANGE		0x2
> +
>  /* id_aa64isar1 */
>  #define ID_AA64ISAR1_I8MM_SHIFT		52
>  #define ID_AA64ISAR1_DGH_SHIFT		48
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 9fac745aa7bb..31bcfd0722b5 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -124,6 +124,7 @@ static bool __system_matches_cap(unsigned int n);
>   */
>  static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0),
> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLBI_RANGE_SHIFT, 4, 0),

This should be FTR_HIDDEN as userspace has no reason to see this.
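
A minimal sketch of the suggested change, assuming only the visibility
flag flips (not taken from any posted revision):

	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLBI_RANGE_SHIFT, 4, 0),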

Otherwise this all seems to match the ARM ARM.

Mark.

>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TS_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0),
>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0),
> @@ -1779,6 +1780,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>  		.min_field_value = 1,
>  	},
>  #endif
> +	{
> +		.desc = "TLB range maintenance instruction",
> +		.capability = ARM64_HAS_TLBI_RANGE,
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.matches = has_cpuid_feature,
> +		.sys_reg = SYS_ID_AA64ISAR0_EL1,
> +		.field_pos = ID_AA64ISAR0_TLBI_RANGE_SHIFT,
> +		.sign = FTR_UNSIGNED,
> +		.min_field_value = ID_AA64ISAR0_TLBI_RANGE,
> +	},
>  	{},
>  };
>  
> -- 
> 2.19.1
> 
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel



* Re: [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
  2020-05-05 10:14   ` Mark Rutland
@ 2020-05-11 12:25     ` Zhenyu Ye
  2020-05-18  4:22       ` Anshuman Khandual
  0 siblings, 1 reply; 12+ messages in thread
From: Zhenyu Ye @ 2020-05-11 12:25 UTC (permalink / raw)
  To: Mark Rutland
  Cc: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof, linux-arch, linux-kernel, xiexiangyou,
	zhangshaokun, linux-mm, arm, prime.zeng, kuhn.chenqun,
	linux-arm-kernel

On 2020/5/5 18:14, Mark Rutland wrote:
> On Tue, Apr 14, 2020 at 07:28:34PM +0800, Zhenyu Ye wrote:
>> ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
>> range of input addresses. This patch detects this feature.
>>
>> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
>> ---
>>  arch/arm64/include/asm/cpucaps.h |  3 ++-
>>  arch/arm64/include/asm/sysreg.h  |  4 ++++
>>  arch/arm64/kernel/cpufeature.c   | 11 +++++++++++
>>  3 files changed, 17 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
>> index 8eb5a088ae65..950095a72617 100644
>> --- a/arch/arm64/include/asm/cpucaps.h
>> +++ b/arch/arm64/include/asm/cpucaps.h
>> @@ -61,7 +61,8 @@
>>  #define ARM64_HAS_AMU_EXTN			51
>>  #define ARM64_HAS_ADDRESS_AUTH			52
>>  #define ARM64_HAS_GENERIC_AUTH			53
>> +#define ARM64_HAS_TLBI_RANGE			54
>>  
>> -#define ARM64_NCAPS				54
>> +#define ARM64_NCAPS				55
>>  
>>  #endif /* __ASM_CPUCAPS_H */
>> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
>> index ebc622432831..ac1b98650234 100644
>> --- a/arch/arm64/include/asm/sysreg.h
>> +++ b/arch/arm64/include/asm/sysreg.h
>> @@ -592,6 +592,7 @@
>>  
>>  /* id_aa64isar0 */
>>  #define ID_AA64ISAR0_RNDR_SHIFT		60
>> +#define ID_AA64ISAR0_TLBI_RANGE_SHIFT	56
>>  #define ID_AA64ISAR0_TS_SHIFT		52
>>  #define ID_AA64ISAR0_FHM_SHIFT		48
>>  #define ID_AA64ISAR0_DP_SHIFT		44
>> @@ -605,6 +606,9 @@
>>  #define ID_AA64ISAR0_SHA1_SHIFT		8
>>  #define ID_AA64ISAR0_AES_SHIFT		4
>>  
>> +#define ID_AA64ISAR0_TLBI_RANGE_NI	0x0
>> +#define ID_AA64ISAR0_TLBI_RANGE		0x2
>> +
>>  /* id_aa64isar1 */
>>  #define ID_AA64ISAR1_I8MM_SHIFT		52
>>  #define ID_AA64ISAR1_DGH_SHIFT		48
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 9fac745aa7bb..31bcfd0722b5 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -124,6 +124,7 @@ static bool __system_matches_cap(unsigned int n);
>>   */
>>  static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0),
>> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLBI_RANGE_SHIFT, 4, 0),
> 
> This should be FTR_HIDDEN as userspace has no reason to see this.
> 
> Otherwise this all seems to match the ARM ARM.
> 
> Mark.
> 

OK, I will change it to FTR_HIDDEN in the next version of this series.

Thanks,
Zhenyu





* Re: [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
  2020-04-14 11:28 ` [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64 Zhenyu Ye
@ 2020-05-14 15:28   ` Catalin Marinas
  2020-05-18 12:21     ` Zhenyu Ye
  0 siblings, 1 reply; 12+ messages in thread
From: Catalin Marinas @ 2020-05-14 15:28 UTC (permalink / raw)
  To: Zhenyu Ye
  Cc: will, suzuki.poulose, maz, steven.price, guohanjun, olof,
	linux-arm-kernel, linux-kernel, linux-arch, linux-mm, arm,
	xiexiangyou, prime.zeng, zhangshaokun, kuhn.chenqun

Hi Zhenyu,

On Tue, Apr 14, 2020 at 07:28:35PM +0800, Zhenyu Ye wrote:
> diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
> index b76df828e6b7..3a1816770bd1 100644
> --- a/arch/arm64/include/asm/tlb.h
> +++ b/arch/arm64/include/asm/tlb.h
> @@ -38,7 +38,12 @@ static inline void tlb_flush(struct mmu_gather *tlb)
>  		return;
>  	}
>  
> -	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
> +	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
> +		__flush_tlb_range_directly(&vma, tlb->start, tlb->end,
> +					   stride, last_level);
> +	else
> +		__flush_tlb_range(&vma, tlb->start, tlb->end,
> +				  stride, last_level);

I think you could move such a check into __flush_tlb_range() and avoid
calling cpus_have_const_cap() in two places. More on this below.

> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index bc3949064725..a482188ea563 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -59,6 +59,44 @@
>  		__ta;						\
>  	})
>  
> +/*
> + * This macro creates a properly formatted VA operand for the TLBI RANGE.
> + * The value bit assignments are:
> + *
> + * +----------+------+-------+-------+-------+----------------------+
> + * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
> + * +----------+------+-------+-------+-------+----------------------+
> + * |63      48|47  46|45   44|43   39|38   37|36                   0|
> + *
> + * The address range is determined by below formula:
> + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
> + *
> + */
> +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)	\
> +	({							\
> +		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
> +		__ta &= GENMASK_ULL(36, 0);			\
> +		__ta |= (unsigned long)(ttl) << 37;		\
> +		__ta |= (unsigned long)(num) << 39;		\
> +		__ta |= (unsigned long)(scale) << 44;		\
> +		__ta |= (unsigned long)(tg) << 46;		\
> +		__ta |= (unsigned long)(asid) << 48;		\
> +		__ta;						\
> +	})
> +
> +#define TLB_RANGE_MASK_SHIFT 5
> +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0)
> +
> +/*
> + * __TG defines translation granule of the system, which is defined by
> + * PAGE_SHIFT.  Used by TTL.
> + *  - 4KB	: 1
> + *  - 16KB	: 2
> + *  - 64KB	: 3
> + */
> +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)

I don't think we need __TLBI_VADDR_RANGE to take a tg argument since
it's always the same.

> +
> +
>  /*
>   *	TLB Invalidation
>   *	================
> @@ -171,12 +209,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>  	dsb(ish);
>  }
>  
> +/* The maximum range size of one TLBI-RANGE instruction */
> +#define MAX_TLBI_RANGE_SIZE	(1UL << 21)

Nitpick: call this MAX_TLBI_RANGE_PAGES as that's not an address range.

It may be useful to have a macro for the range here, something like:

#define __TLBI_PAGES(num, scale)	((num + 1) << (5 * scale + 1))

and define MAX_TLBI_RANGE_PAGES in terms of this macro as
__TLBI_PAGES(31, 3).
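
Concretely (a sketch using the names proposed above, not taken from the
posted patch), the existing limit would then read:

	/* num = 31, scale = 3: 32 * 2^16 = 2^21 pages, i.e. the same
	 * value as the current MAX_TLBI_RANGE_SIZE */
	#define MAX_TLBI_RANGE_PAGES	__TLBI_PAGES(31, 3)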

> +
> +/*
> + * This interface uses the *rvale1is* instruction to flush TLBs
> + * in [start, end) directly.
> + * This instruction is supported from ARM v8.4.
> + */
> +static inline void __flush_tlb_range_directly(struct vm_area_struct *vma,
> +				unsigned long start, unsigned long end,
> +				unsigned long stride, bool last_level)
> +{
> +	int num = 0;
> +	int scale = 0;
> +	unsigned long asid = ASID(vma->vm_mm);
> +	unsigned long addr = 0;
> +	unsigned long range_size;
> +
> +	start = round_down(start, stride);
> +	end = round_up(end, stride);
> +	range_size = (end - start) >> PAGE_SHIFT;
> +
> +	if (range_size > MAX_TLBI_RANGE_SIZE) {
> +		flush_tlb_mm(vma->vm_mm);
> +		return;
> +	}
> +
> +	dsb(ishst);
> +
> +	/*
> +	 * The minimum size of TLB RANGE is 2 PAGE;
> +	 * Use normal TLB instruction to handle odd PAGEs

Nitpick: no need to capitalise PAGE.

> +	 */
> +	if (range_size % 2 == 1) {
> +		addr = __TLBI_VADDR(start, asid);
> +		if (last_level) {
> +			__tlbi(vale1is, addr);
> +			__tlbi_user(vale1is, addr);
> +		} else {
> +			__tlbi(vae1is, addr);
> +			__tlbi_user(vae1is, addr);
> +		}
> +		start += 1 << PAGE_SHIFT;
> +		range_size -= 1;
> +	}
> +
> +	range_size >>= 1;
> +	while (range_size > 0) {
> +		num = (range_size & TLB_RANGE_MASK) - 1;
> +		if (num >= 0) {
> +			addr = __TLBI_VADDR_RANGE(start, asid, __TG,
> +						  scale, num, 0);
> +			if (last_level) {
> +				__tlbi(rvale1is, addr);
> +				__tlbi_user(rvale1is, addr);
> +			} else {
> +				__tlbi(rvae1is, addr);
> +				__tlbi_user(rvae1is, addr);
> +			}
> +			start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;

You could use the __TLBI_PAGES macro I proposed above.

> +		}
> +		scale++;
> +		range_size >>= TLB_RANGE_MASK_SHIFT;
> +	}

So, you start from scale 0 and increment it until you reach the maximum.
I think (haven't done the maths on paper) you could also start from the
top with something like scale = ilog2(range_size) / 5. Not sure it's
significantly better though, maybe avoiding the loop 3 times if your
range is 2MB (which happens with huge pages).
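
As a quick standalone illustration (plain userspace C, not kernel code;
simulate() is just a name for this sketch) of how the posted
ascending-scale loop splits a page count, including the idle scale-0
pass for a 2MB range of 4KB pages:

#include <stdio.h>

/* One range op covers (num + 1) * 2^(5*scale + 1) pages. */
static void simulate(unsigned long pages)
{
	int scale = 0;

	printf("%lu pages:\n", pages);
	if (pages & 1) {
		/* the odd page is handled by a classic (non-range) TLBI */
		printf("  classic TLBI: 1 page\n");
		pages--;
	}
	pages >>= 1;
	while (pages > 0) {
		int num = (int)(pages & 0x1f) - 1;

		if (num >= 0)
			printf("  range op: scale=%d num=%d -> %lu pages\n",
			       scale, num,
			       (unsigned long)(num + 1) << (5 * scale + 1));
		else
			printf("  scale=%d: idle pass\n", scale);
		scale++;
		pages >>= 5;
	}
}

int main(void)
{
	simulate(7);	/* small range: one classic TLBI plus a range op */
	simulate(512);	/* 2MB of 4KB pages: scale 0 is an idle pass */
	return 0;
}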

Anyway, I think it would be more efficient if we combine the
__flush_tlb_range() and the _directly one into the same function with a
single loop for both. For example, if the stride is 2MB already, we can
handle this with a single classic TLBI without all the calculations for
the range operation. The hardware may also handle this better since the
software already told it there can be only one entry in that 2MB range.
So each loop iteration could figure which operation to use based on
cpucaps, TLBI range ops, stride and reduce range_size accordingly.

-- 
Catalin



* Re: [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
  2020-05-11 12:25     ` Zhenyu Ye
@ 2020-05-18  4:22       ` Anshuman Khandual
  2020-05-18 12:29         ` Zhenyu Ye
  0 siblings, 1 reply; 12+ messages in thread
From: Anshuman Khandual @ 2020-05-18  4:22 UTC (permalink / raw)
  To: Zhenyu Ye, Mark Rutland
  Cc: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof, linux-arch, linux-kernel, xiexiangyou,
	zhangshaokun, linux-mm, arm, prime.zeng, kuhn.chenqun,
	linux-arm-kernel



On 05/11/2020 05:55 PM, Zhenyu Ye wrote:
> On 2020/5/5 18:14, Mark Rutland wrote:
>> On Tue, Apr 14, 2020 at 07:28:34PM +0800, Zhenyu Ye wrote:
>>> ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
>>> range of input addresses. This patch detects this feature.
>>>
>>> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
>>> ---
>>>  arch/arm64/include/asm/cpucaps.h |  3 ++-
>>>  arch/arm64/include/asm/sysreg.h  |  4 ++++
>>>  arch/arm64/kernel/cpufeature.c   | 11 +++++++++++
>>>  3 files changed, 17 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
>>> index 8eb5a088ae65..950095a72617 100644
>>> --- a/arch/arm64/include/asm/cpucaps.h
>>> +++ b/arch/arm64/include/asm/cpucaps.h
>>> @@ -61,7 +61,8 @@
>>>  #define ARM64_HAS_AMU_EXTN			51
>>>  #define ARM64_HAS_ADDRESS_AUTH			52
>>>  #define ARM64_HAS_GENERIC_AUTH			53
>>> +#define ARM64_HAS_TLBI_RANGE			54
>>>  
>>> -#define ARM64_NCAPS				54
>>> +#define ARM64_NCAPS				55
>>>  
>>>  #endif /* __ASM_CPUCAPS_H */
>>> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
>>> index ebc622432831..ac1b98650234 100644
>>> --- a/arch/arm64/include/asm/sysreg.h
>>> +++ b/arch/arm64/include/asm/sysreg.h
>>> @@ -592,6 +592,7 @@
>>>  
>>>  /* id_aa64isar0 */
>>>  #define ID_AA64ISAR0_RNDR_SHIFT		60
>>> +#define ID_AA64ISAR0_TLBI_RANGE_SHIFT	56
>>>  #define ID_AA64ISAR0_TS_SHIFT		52
>>>  #define ID_AA64ISAR0_FHM_SHIFT		48
>>>  #define ID_AA64ISAR0_DP_SHIFT		44
>>> @@ -605,6 +606,9 @@
>>>  #define ID_AA64ISAR0_SHA1_SHIFT		8
>>>  #define ID_AA64ISAR0_AES_SHIFT		4
>>>  
>>> +#define ID_AA64ISAR0_TLBI_RANGE_NI	0x0
>>> +#define ID_AA64ISAR0_TLBI_RANGE		0x2
>>> +
>>>  /* id_aa64isar1 */
>>>  #define ID_AA64ISAR1_I8MM_SHIFT		52
>>>  #define ID_AA64ISAR1_DGH_SHIFT		48
>>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>>> index 9fac745aa7bb..31bcfd0722b5 100644
>>> --- a/arch/arm64/kernel/cpufeature.c
>>> +++ b/arch/arm64/kernel/cpufeature.c
>>> @@ -124,6 +124,7 @@ static bool __system_matches_cap(unsigned int n);
>>>   */
>>>  static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
>>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0),
>>> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLBI_RANGE_SHIFT, 4, 0),

Hello Zhenyu,

This is already being added through another patch [1] in a series [2] which primarily
has cpufeature changes. I will soon update the series making this feature FTR_HIDDEN.

[1] https://patchwork.kernel.org/patch/11523881/
[2] https://patchwork.kernel.org/project/linux-arm-kernel/list/?series=281211

I am planning to respin the series (V4) based on the arm64 tree (for-next/cpufeature).
So could you please rebase this patch (probably dropping the cpufeature-related changes)
on the upcoming V4, so that all the changes will be based on the arm64 tree
(for-next/cpufeature).

- Anshuman



* Re: [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
  2020-05-14 15:28   ` Catalin Marinas
@ 2020-05-18 12:21     ` Zhenyu Ye
  2020-05-20 17:08       ` Catalin Marinas
  2020-06-09 13:26       ` Zhenyu Ye
  0 siblings, 2 replies; 12+ messages in thread
From: Zhenyu Ye @ 2020-05-18 12:21 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: will, suzuki.poulose, maz, steven.price, guohanjun, olof,
	linux-arm-kernel, linux-kernel, linux-arch, linux-mm, arm,
	xiexiangyou, prime.zeng, zhangshaokun, kuhn.chenqun

Hi Catalin,

Thanks for your review.

On 2020/5/14 23:28, Catalin Marinas wrote:
> Hi Zhenyu,
> 
> On Tue, Apr 14, 2020 at 07:28:35PM +0800, Zhenyu Ye wrote:
>> diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
>> index b76df828e6b7..3a1816770bd1 100644
>> --- a/arch/arm64/include/asm/tlb.h
>> +++ b/arch/arm64/include/asm/tlb.h
>> @@ -38,7 +38,12 @@ static inline void tlb_flush(struct mmu_gather *tlb)
>>  		return;
>>  	}
>>  
>> -	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
>> +	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
>> +		__flush_tlb_range_directly(&vma, tlb->start, tlb->end,
>> +					   stride, last_level);
>> +	else
>> +		__flush_tlb_range(&vma, tlb->start, tlb->end,
>> +				  stride, last_level);
> 
> I think you could move such a check into __flush_tlb_range() and avoid
> calling cpus_have_const_cap() in two places. More on this below.
> 

Then we would have to mix __flush_tlb_range() and the _directly one
together.  I'm worried this will make the code very complicated.  See
the end for details.

>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index bc3949064725..a482188ea563 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -59,6 +59,44 @@
>>  		__ta;						\
>>  	})
>>  
>> +/*
>> + * This macro creates a properly formatted VA operand for the TLBI RANGE.
>> + * The value bit assignments are:
>> + *
>> + * +----------+------+-------+-------+-------+----------------------+
>> + * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
>> + * +----------+------+-------+-------+-------+----------------------+
>> + * |63      48|47  46|45   44|43   39|38   37|36                   0|
>> + *
>> + * The address range is determined by below formula:
>> + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
>> + *
>> + */
>> +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)	\
>> +	({							\
>> +		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
>> +		__ta &= GENMASK_ULL(36, 0);			\
>> +		__ta |= (unsigned long)(ttl) << 37;		\
>> +		__ta |= (unsigned long)(num) << 39;		\
>> +		__ta |= (unsigned long)(scale) << 44;		\
>> +		__ta |= (unsigned long)(tg) << 46;		\
>> +		__ta |= (unsigned long)(asid) << 48;		\
>> +		__ta;						\
>> +	})
>> +
>> +#define TLB_RANGE_MASK_SHIFT 5
>> +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT - 1, 0)
>> +
>> +/*
>> + * __TG defines translation granule of the system, which is defined by
>> + * PAGE_SHIFT.  Used by TTL.
>> + *  - 4KB	: 1
>> + *  - 16KB	: 2
>> + *  - 64KB	: 3
>> + */
>> +#define __TG	((PAGE_SHIFT - 12) / 2 + 1)
> 
> I don't think we need __TLBI_VADDR_RANGE to take a tg argument since
> it's always the same.
> 

OK.

>> +
>> +
>>  /*
>>   *	TLB Invalidation
>>   *	================
>> @@ -171,12 +209,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>>  	dsb(ish);
>>  }
>>  
>> +/* The maximum range size of one TLBI-RANGE instruction */
>> +#define MAX_TLBI_RANGE_SIZE	(1UL << 21)
> 
> Nitpick: call this MAX_TLBI_RANGE_PAGES as that's not an address range.
> 
> It may be useful to have a macro for the range here, something like:
> 
> #define __TLBI_PAGES(num, scale)	((num + 1) << (5 * scale + 1))
> 
> and define MAX_TLBI_RANGE_PAGES in terms of this macro as
> __TLBI_PAGES(31, 3).
> 

OK, thanks for your great suggestion.

>> +
>> +/*
>> + * This interface uses the *rvale1is* instruction to flush TLBs
>> + * in [start, end) directly.
>> + * This instruction is supported from ARM v8.4.
>> + */
>> +static inline void __flush_tlb_range_directly(struct vm_area_struct *vma,
>> +				unsigned long start, unsigned long end,
>> +				unsigned long stride, bool last_level)
>> +{
>> +	int num = 0;
>> +	int scale = 0;
>> +	unsigned long asid = ASID(vma->vm_mm);
>> +	unsigned long addr = 0;
>> +	unsigned long range_size;
>> +
>> +	start = round_down(start, stride);
>> +	end = round_up(end, stride);
>> +	range_size = (end - start) >> PAGE_SHIFT;
>> +
>> +	if (range_size > MAX_TLBI_RANGE_SIZE) {
>> +		flush_tlb_mm(vma->vm_mm);
>> +		return;
>> +	}
>> +
>> +	dsb(ishst);
>> +
>> +	/*
>> +	 * The minimum size of TLB RANGE is 2 PAGE;
>> +	 * Use normal TLB instruction to handle odd PAGEs
> 
> Nitpick: no need to capitalise PAGE.
> 

OK.

>> +	 */
>> +	if (range_size % 2 == 1) {
>> +		addr = __TLBI_VADDR(start, asid);
>> +		if (last_level) {
>> +			__tlbi(vale1is, addr);
>> +			__tlbi_user(vale1is, addr);
>> +		} else {
>> +			__tlbi(vae1is, addr);
>> +			__tlbi_user(vae1is, addr);
>> +		}
>> +		start += 1 << PAGE_SHIFT;
>> +		range_size -= 1;
>> +	}
>> +
>> +	range_size >>= 1;
>> +	while (range_size > 0) {
>> +		num = (range_size & TLB_RANGE_MASK) - 1;
>> +		if (num >= 0) {
>> +			addr = __TLBI_VADDR_RANGE(start, asid, __TG,
>> +						  scale, num, 0);
>> +			if (last_level) {
>> +				__tlbi(rvale1is, addr);
>> +				__tlbi_user(rvale1is, addr);
>> +			} else {
>> +				__tlbi(rvae1is, addr);
>> +				__tlbi_user(rvae1is, addr);
>> +			}
>> +			start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;
> 
> You could use the __TLBI_PAGES macro I proposed above.
> 

OK.

>> +		}
>> +		scale++;
>> +		range_size >>= TLB_RANGE_MASK_SHIFT;
>> +	}
> 
> So, you start from scale 0 and increment it until you reach the maximum.
> I think (haven't done the maths on paper) you could also start from the
> top with something like scale = ilog2(range_size) / 5. Not sure it's
> significantly better though, maybe avoiding the loop 3 times if your
> range is 2MB (which happens with huge pages).
> 

This optimization is only effective when the range is a multiple of 256KB
(when the page size is 4KB), and I'm worried about the performance
of ilog2().  I traced the __flush_tlb_range() last year and found that in
most cases the range is less than 256K (see details in [1]).

I will test the performance of your suggestion and then reply to you again
here.

> Anyway, I think it would be more efficient if we combine the
> __flush_tlb_range() and the _directly one into the same function with a
> single loop for both. For example, if the stride is 2MB already, we can
> handle this with a single classic TLBI without all the calculations for
> the range operation. The hardware may also handle this better since the
> software already told it there can be only one entry in that 2MB range.
> So each loop iteration could figure which operation to use based on
> cpucaps, TLBI range ops, stride and reduce range_size accordingly.
> 

To summarize your suggestion in one sentence: use the 'stride' to optimize
the performance of TLBI.  This can also be done while keeping the two
functions separate, and the stride should indeed be taken into account by
the TLBI RANGE path.

But if we figure out which operation to use based on cpucaps in each loop
iteration, then cpus_have_const_cap() will be called frequently, which
may affect the performance of TLBI.  In my opinion, we should do as few
checks as possible in the loop, so checking the cpucaps outside the
loop may be a better choice.


[1] https://lkml.org/lkml/2019/11/11/593

Thanks,
Zhenyu




* Re: [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature
  2020-05-18  4:22       ` Anshuman Khandual
@ 2020-05-18 12:29         ` Zhenyu Ye
  0 siblings, 0 replies; 12+ messages in thread
From: Zhenyu Ye @ 2020-05-18 12:29 UTC (permalink / raw)
  To: Anshuman Khandual, Mark Rutland
  Cc: will, catalin.marinas, suzuki.poulose, maz, steven.price,
	guohanjun, olof, linux-arch, linux-kernel, xiexiangyou,
	zhangshaokun, linux-mm, arm, prime.zeng, kuhn.chenqun,
	linux-arm-kernel

Hi Anshuman,

On 2020/5/18 12:22, Anshuman Khandual wrote:
>>>>  static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
>>>>  	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_RNDR_SHIFT, 4, 0),
>>>> +	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_TLBI_RANGE_SHIFT, 4, 0),
> 
> Hello Zhenyu,
> 
> This is already being added through another patch [1] in a series [2] which primarily
> has cpufeature changes. I will soon update the series making this feature FTR_HIDDEN.
> 
> [1] https://patchwork.kernel.org/patch/11523881/
> [2] https://patchwork.kernel.org/project/linux-arm-kernel/list/?series=281211
> 
> I am planning to respin the series (V4) based on arm64 tree (for-next/cpufeature). So
> could you please rebase this patch (probably dropping cpufeature related changes) on
> upcoming V4, so that all the changes will be based on arm64 tree (for-next/cpufeature).
> 
> - Anshuman
> 

OK, I will rebase my patch on your V4.

Zhenyu




* Re: [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
  2020-05-18 12:21     ` Zhenyu Ye
@ 2020-05-20 17:08       ` Catalin Marinas
  2020-06-01 14:57         ` Zhenyu Ye
  2020-06-09 13:26       ` Zhenyu Ye
  1 sibling, 1 reply; 12+ messages in thread
From: Catalin Marinas @ 2020-05-20 17:08 UTC (permalink / raw)
  To: Zhenyu Ye
  Cc: linux-arch, suzuki.poulose, maz, linux-kernel, xiexiangyou,
	steven.price, zhangshaokun, linux-mm, arm, prime.zeng, guohanjun,
	olof, kuhn.chenqun, will, linux-arm-kernel

On Mon, May 18, 2020 at 08:21:02PM +0800, Zhenyu Ye wrote:
> On 2020/5/14 23:28, Catalin Marinas wrote:
> > On Tue, Apr 14, 2020 at 07:28:35PM +0800, Zhenyu Ye wrote:
> >> +		}
> >> +		scale++;
> >> +		range_size >>= TLB_RANGE_MASK_SHIFT;
> >> +	}
> > 
> > So, you start from scale 0 and increment it until you reach the maximum.
> > I think (haven't done the maths on paper) you could also start from the
> > top with something like scale = ilog2(range_size) / 5. Not sure it's
> > significantly better though, maybe avoiding the loop 3 times if your
> > range is 2MB (which happens with huge pages).
> 
> This optimization is only effective when the range is a multiple of 256KB
> (when the page size is 4KB), and I'm worried about the performance
> of ilog2().  I traced the __flush_tlb_range() last year and found that in
> most cases the range is less than 256K (see details in [1]).

THP or hugetlbfs would exercise bigger strides but I guess it depends on
the use-case. ilog2() should be reduced to a few instructions on arm64
AFAICT (haven't tried but it should use the CLZ instruction).

> > Anyway, I think it would be more efficient if we combine the
> > __flush_tlb_range() and the _directly one into the same function with a
> > single loop for both. For example, if the stride is 2MB already, we can
> > handle this with a single classic TLBI without all the calculations for
> > the range operation. The hardware may also handle this better since the
> > software already told it there can be only one entry in that 2MB range.
> > So each loop iteration could figure which operation to use based on
> > cpucaps, TLBI range ops, stride and reduce range_size accordingly.
> 
> Summarize your suggestion in one sentence: use 'stride' to optimize the
> preformance of TLBI.  This can also be done by dividing into two functions,
> and this should indeed be taken into account in the TLBI RANGE feature.
> 
> But if we figure which operation to use based on cpucaps in each loop
> iteration, then cpus_have_const_cap() will be called frequently, which
> may affect performance of TLBI.  In my opinion, we should do as few
> judgments as possible in the loop, so judge the cpucaps outside the
> loop maybe a good choice.

cpus_have_const_cap() is a static label, so should be patched with a
branch or nop. My point was that in the classic __flush_tlb_range()
loop, instead of an addr += stride we could have something more dynamic
depending on whether the CPU supports range TLBI ops or not. But we
would indeed have more (static) branches in the loop, so possibly some
performance degradation.

If the code looks ok, I'd favour this and we can look at the
optimisation later. But I can't really tell how the code would look
without attempting to merge the two.

Anyway, a first step would be to add the range and stride to the
decision (i.e. (end-start)/stride > 1) before jumping to the range
operations. You can avoid the additional checks in the new TLBI
functions since we know we have at least two (huge)pages.
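
Roughly, such a first step might look like the sketch below (the wrapper
name is made up and this is only an assumption of how it could fit
together, not a posted patch):

static inline void flush_tlb_range_dispatch(struct vm_area_struct *vma,
				unsigned long start, unsigned long end,
				unsigned long stride, bool last_level)
{
	/* only take the range path when there is more than one entry */
	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
	    (end - start) / stride > 1)
		__flush_tlb_range_directly(vma, start, end, stride,
					   last_level);
	else
		__flush_tlb_range(vma, start, end, stride, last_level);
}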

-- 
Catalin



* Re: [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
  2020-05-20 17:08       ` Catalin Marinas
@ 2020-06-01 14:57         ` Zhenyu Ye
  0 siblings, 0 replies; 12+ messages in thread
From: Zhenyu Ye @ 2020-06-01 14:57 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: linux-arch, suzuki.poulose, maz, linux-kernel, xiexiangyou,
	steven.price, zhangshaokun, linux-mm, arm, prime.zeng, guohanjun,
	olof, kuhn.chenqun, will, linux-arm-kernel

Hi Catalin,

I have sent the v4 of this series [1] and combined the two functions into
a single loop.  See the code for details.

[1] https://lore.kernel.org/linux-arm-kernel/20200601144713.2222-1-yezhenyu2@huawei.com/

On 2020/5/21 1:08, Catalin Marinas wrote:
>> This optimization is only effective when the range is a multiple of 256KB
>> (when the page size is 4KB), and I'm worried about the performance
>> of ilog2().  I traced the __flush_tlb_range() last year and found that in
>> most cases the range is less than 256K (see details in [1]).
> 
> THP or hugetlbfs would exercise bigger strides but I guess it depends on
> the use-case. ilog2() should be reduced to a few instructions on arm64
> AFAICT (haven't tried but it should use the CLZ instruction).
> 

Not bigger than 256K, but the range must be an integer multiple of 256KB,
so I still start from scale 0.

Thanks,
Zhenyu




* Re: [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64
  2020-05-18 12:21     ` Zhenyu Ye
  2020-05-20 17:08       ` Catalin Marinas
@ 2020-06-09 13:26       ` Zhenyu Ye
  1 sibling, 0 replies; 12+ messages in thread
From: Zhenyu Ye @ 2020-06-09 13:26 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: will, suzuki.poulose, maz, steven.price, guohanjun, olof,
	linux-arm-kernel, linux-kernel, linux-arch, linux-mm, arm,
	xiexiangyou, prime.zeng, zhangshaokun, kuhn.chenqun

[-- Attachment #1: Type: text/plain, Size: 3038 bytes --]

Hi Catalin,

On 2020/5/18 20:21, Zhenyu Ye wrote:
> I will test the performance of your suggestion and then reply to you again
> here.
> 

I have sent the v4 of this series [1], and compared the performance of
these two different implementations.  The test code is in the attachment
(it calls __flush_tlb_range() directly).

First, I tested the v4 on a machine whose CPUs do not support TLB range.
Fortunately, the newly added check in the loop has very little effect on
performance.  When the page count is 256 (256 loop iterations), the impact
is less than 0.5%:

	[page num]	[before change]		[v4 change]
	1		1457			1491
	2		1911			1957
	3		2382			2377
	4		2827			2852
	5		3282			3349
	6		3763			3781
	7		4295			4252
	8		4716			4716
	9		5186			5218
	10		5618			5648
	16		8427			8454
	32		15938			15951
	64		30890			30977
	128		60802			60863
	256		120826			121395
	512		1508			1555

Then I tested them on an FPGA machine whose CPUs support the TLB range
feature (this machine is not the same as above).  Below is the test
data when the stride = PTE:

	[page num]	[before change]	[v3 change]	[v4 change]
	1		16051		15094		13524
	2		11366		11270		11146
	3		11582		11536		12171
	4		11694		11199		11101
	5		12138		11506		12267
	6		12290		11214		11105
	7		12400		11448		12002
	8		12837		11225		11097
	9		14791		11529		12140
	10		15461		11218		11087
	16		18233		11192		11094
	32		26983		11224		11079
	64		43840		11237		11092
	128		77754		11247		11098
	256		145514		11223		11089
	512		280932		11197		11111

We can see that v3 and v4 are very similar in this scenario, and both
of them improve performance significantly compared to the current
implementation.  When the page count is 256, the performance is
improved by more than 10 times.  The TLBI RANGE instruction also
costs less time than the classic TLBI in all scenarios on this machine,
even when the page count is small (but this may be different on
different machines).

Everything performs well until now, but I added a new check on the
stride in the v4:

	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE) &&
	    stride == PAGE_SIZE)
		use tlbi range here...

So when the stride != PTE, the classic TLBI instructions are used to
flush the TLBs one by one, and the performance becomes worse than
in v3:

	[page num]	[before change]	[v3 change]	[v4 change]
	1		14047		11332		11611
	2		11568		11255		11701
	3		11664		11231		11759
	4		12097		11204		12173
	5		12229		11236		12374
	6		12399		11203		12497
	7		12802		11266		12914
	8		14764		17098		14907
	9		15370		17106		15551
	10		16130		17103		16137
	16		19029		17175		19194
	32		27300		17097		27604
	64		44172		17075		44609
	128		77878		17176		78548
	256		145185		12022		146063
	512		279822		12029		279922

And as we can see, "handling the 2MB with a single classic TLBI"
costs the same time as a single TLBI RANGE instruction.  So should
I remove the check on the stride and decide which instruction to use
based only on cpucaps in the loop?  But if the check is removed, the
logic will be the same as in v3 (both of them only check cpucaps).

Waiting for your suggestions...

Thanks,
Zhenyu

[-- Attachment #2: test-tlb-range-perf.c --]
[-- Type: text/plain, Size: 2931 bytes --]

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <asm/tlb.h>
#include <linux/time.h>
#include <asm/current.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/mm.h>

#define TESTTIMES 10000

void testRangePerf(void);

static int __init test_init(void)
{
    printk("BEGIN TEST\n");

    testRangePerf();

    printk("END TEST\n");
    return 0;
}


static void __exit test_exit(void)
{
    return;
}

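/*
 * Times __flush_tlb_range() on the first VMA of the current task, first
 * with a PAGE_SIZE stride and then with a PMD_SIZE stride, for increasing
 * numbers of entries (1..10, then 16, 32, ..., 512), averaging each case
 * over TESTTIMES runs.
 */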
void testRangePerf(void)
{
    int i, j;
    struct timespec64 start, end;
    struct task_struct *ts;
    struct vm_area_struct *vma;

    printk("BEGIN testRangePerf\n");

    ts = current;
    vma = ts->mm->mmap;

    printk("vma->start: %lx, vma->end: %lx, ttl = 0, PAGE_SIZE = 0x%lx\n", vma->vm_start, vma->vm_end, PAGE_SIZE);
    for (i = 1; i <= 10; i++) {
        ktime_get_ts64(&start);
        for (j = 0; j < TESTTIMES; j++) {
            __flush_tlb_range(vma, vma->vm_start, vma->vm_start + PAGE_SIZE * i, PAGE_SIZE, false);
        }
        ktime_get_ts64(&end);
        printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n", i,
               ((end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - start.tv_nsec) / TESTTIMES);
        msleep(100);
    }

    for (i = 16; i <= 512; i+=i) {
        ktime_get_ts64(&start);
        for (j = 0; j < TESTTIMES; j++) {
            __flush_tlb_range(vma, vma->vm_start, vma->vm_start + PAGE_SIZE * i, PAGE_SIZE, false);
        }
        ktime_get_ts64(&end);
        printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n", i,
               ((end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - start.tv_nsec) / TESTTIMES);
        msleep(100);
    }

    printk("vma->start: %lx, vma->end: %lx, ttl = 0, PAGE_SIZE = 0x%lx\n", vma->vm_start, vma->vm_end, PMD_SIZE);
    for (i = 1; i <= 10; i++) {
        ktime_get_ts64(&start);
        for (j = 0; j < TESTTIMES; j++) {
            __flush_tlb_range(vma, vma->vm_start, vma->vm_start + PMD_SIZE * i, PMD_SIZE, false);
        }
        ktime_get_ts64(&end);
        printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n", i,
               ((end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - start.tv_nsec) / TESTTIMES);
        msleep(100);
    }

    for (i = 16; i <= 512; i+=i) {
        ktime_get_ts64(&start);
        for (j = 0; j < TESTTIMES; j++) {
            __flush_tlb_range(vma, vma->vm_start, vma->vm_start + PMD_SIZE * i, PMD_SIZE, false);
        }
        ktime_get_ts64(&end);
        printk("test __flush_tlb_range with %04d pages, used time: %12lld ns\n", i,
               ((end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - start.tv_nsec) / TESTTIMES);
        msleep(100);
    }
}

module_init(test_init)
module_exit(test_exit)

MODULE_LICENSE("Dual BSD/GPL"); 
MODULE_AUTHOR("Eillon");
MODULE_DESCRIPTION("do TTL test");
MODULE_VERSION("1.0");


Thread overview: 12+ messages:
2020-04-14 11:28 [RFC PATCH v3 0/2] arm64: tlb: add support for TLBI RANGE instructions Zhenyu Ye
2020-04-14 11:28 ` [RFC PATCH v3 1/2] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature Zhenyu Ye
2020-05-05 10:14   ` Mark Rutland
2020-05-11 12:25     ` Zhenyu Ye
2020-05-18  4:22       ` Anshuman Khandual
2020-05-18 12:29         ` Zhenyu Ye
2020-04-14 11:28 ` [RFC PATCH v3 2/2] arm64: tlb: Use the TLBI RANGE feature in arm64 Zhenyu Ye
2020-05-14 15:28   ` Catalin Marinas
2020-05-18 12:21     ` Zhenyu Ye
2020-05-20 17:08       ` Catalin Marinas
2020-06-01 14:57         ` Zhenyu Ye
2020-06-09 13:26       ` Zhenyu Ye
