* [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
@ 2019-11-11 13:23 Zhenyu Ye
  2019-11-11 13:27 ` Will Deacon
  2019-11-11 16:32 ` Olof Johansson
  0 siblings, 2 replies; 11+ messages in thread
From: Zhenyu Ye @ 2019-11-11 13:23 UTC (permalink / raw)
  To: catalin.marinas, will, maz, suzuki.poulose, mark.rutland
  Cc: tangnianyao, xiexiangyou, linux-kernel, yezhenyu2, arm

ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
range of input addresses. This patch adds support for this feature.
This is the second version of the patch.

I traced __flush_tlb_range() for one minute and collected the
statistics below:

	PAGENUM		COUNT
	1		34944
	2		5683
	3		1343
	4		7857
	5		838
	9		339
	16		933
	19		427
	20		5821
	23		279
	41		338
	141		279
	512		428
	1668		120
	2038		100

These data are based on kernel 5.4.0, where PAGENUM = end - start and
COUNT is the number of calls to __flush_tlb_range() during that minute.
Only entries with COUNT >= 100 are shown. The kernel was booted normally,
with transparent hugepage enabled. As we can see, although most TLBI
ranges were one page long, the number of long ranges cannot be ignored.

The TLBI range feature can bring a large performance improvement over
the current implementation. For example, flushing a 512-page range needs
only one instruction, as opposed to 512 instructions with the current
implementation.
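
As a quick sanity check against the range encoding introduced below
(the covered range is [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)),
a 512-page flush does fit into a single range operation:

	(NUM + 1) * 2^(5*SCALE + 1) = 8 * 2^6 = 512 pages   (SCALE = 1, NUM = 7)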

And for a new hardware feature, supporting it is better than not.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
---
ChangeLog v1 -> v2:
- Change the main implementation of this feature.
- Add some comments.

---
 arch/arm64/include/asm/cpucaps.h  |  3 +-
 arch/arm64/include/asm/sysreg.h   |  4 ++
 arch/arm64/include/asm/tlbflush.h | 99 ++++++++++++++++++++++++++++++-
 arch/arm64/kernel/cpufeature.c    | 10 ++++
 4 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index ac1dbca3d0cd..5b5230060e5b 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -54,7 +54,8 @@
 #define ARM64_WORKAROUND_1463225		44
 #define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM	45
 #define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM	46
+#define ARM64_HAS_TLBI_RANGE			47

-#define ARM64_NCAPS				47
+#define ARM64_NCAPS				48

 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 6e919fafb43d..a6abbf2b067d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -539,6 +539,7 @@
 			 ENDIAN_SET_EL1 | SCTLR_EL1_UCI  | SCTLR_EL1_RES1)

 /* id_aa64isar0 */
+#define ID_AA64ISAR0_TLB_SHIFT		56
 #define ID_AA64ISAR0_TS_SHIFT		52
 #define ID_AA64ISAR0_FHM_SHIFT		48
 #define ID_AA64ISAR0_DP_SHIFT		44
@@ -552,6 +553,9 @@
 #define ID_AA64ISAR0_SHA1_SHIFT		8
 #define ID_AA64ISAR0_AES_SHIFT		4

+#define ID_AA64ISAR0_TLB_NI		0x0
+#define ID_AA64ISAR0_TLB_RANGE		0x2
+
 /* id_aa64isar1 */
 #define ID_AA64ISAR1_SB_SHIFT		36
 #define ID_AA64ISAR1_FRINTTS_SHIFT	32
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bc3949064725..f49bed7ecb68 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -59,6 +59,33 @@
 		__ta;						\
 	})

+/*
+ * This macro creates a properly formatted VA operand for the TLBI RANGE.
+ * The value bit assignments are:
+ *
+ * +----------+------+-------+-------+-------+----------------------+
+ * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
+ * +-----------------+-------+-------+-------+----------------------+
+ * |63      48|47  46|45   44|43   39|38   37|36                   0|
+ *
+ * The address range is determined by below formula:
+ * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
+ *
+ */
+#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)	\
+	({							\
+		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
+		__ta |= (unsigned long)(ttl) << 37;		\
+		__ta |= (unsigned long)(num) << 39;		\
+		__ta |= (unsigned long)(scale) << 44;		\
+		__ta |= (unsigned long)(tg) << 46;		\
+		__ta |= (unsigned long)(asid) << 48;		\
+		__ta;						\
+	})
+
+#define TLB_RANGE_MASK_SHIFT 5
+#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT, 0)
+
 /*
  *	TLB Invalidation
  *	================
@@ -177,9 +204,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
  */
 #define MAX_TLBI_OPS	PTRS_PER_PTE

-static inline void __flush_tlb_range(struct vm_area_struct *vma,
-				     unsigned long start, unsigned long end,
-				     unsigned long stride, bool last_level)
+static inline void __flush_tlb_range_old(struct vm_area_struct *vma,
+					 unsigned long start, unsigned long end,
+					 unsigned long stride, bool last_level)
 {
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
@@ -211,6 +238,72 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	dsb(ish);
 }

+static inline void __flush_tlb_range_new(struct vm_area_struct *vma,
+					 unsigned long start, unsigned long end,
+					 unsigned long stride, bool last_level)
+{
+	int num = 0;
+	int scale = 0;
+	int ttl = 0;
+	int tg = (PAGE_SHIFT - 12) / 2 + 1;
+	unsigned long asid = ASID(vma->vm_mm);
+	unsigned long addr = 0;
+	unsigned long offset = (end - start) >> PAGE_SHIFT;
+
+	if (offset > (1UL << 21)) {
+		flush_tlb_mm(vma->vm_mm);
+		return;
+	}
+
+	dsb(ishst);
+
+	/*
+	 * The minimum size of TLB RANGE is 2 PAGE;
+	 * Use normal TLB instruction to handle odd PAGEs
+	 */
+	if (offset % 2 == 1) {
+		addr = __TLBI_VADDR(start, asid);
+		if (last_level) {
+			__tlbi(vale1is, addr);
+			__tlbi_user(vale1is, addr);
+		} else {
+			__tlbi(vae1is, addr);
+			__tlbi_user(vae1is, addr);
+		}
+		start += 1 << PAGE_SHIFT;
+		offset -= 1;
+	}
+
+	while (offset > 0) {
+		num = (offset & TLB_RANGE_MASK) - 1;
+		if (num >= 0) {
+			addr = __TLBI_VADDR_RANGE(start, asid, tg,
+						  scale, num, ttl);
+			if (last_level) {
+				__tlbi(rvale1is, addr);
+				__tlbi_user(rvale1is, addr);
+			} else {
+				__tlbi(rvae1is, addr);
+				__tlbi_user(rvae1is, addr);
+			}
+			start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;
+		}
+		scale++;
+		offset >>= TLB_RANGE_MASK_SHIFT;
+	}
+	dsb(ish);
+}
+
+static inline void __flush_tlb_range(struct vm_area_struct *vma,
+				     unsigned long start, unsigned long end,
+				     unsigned long stride, bool last_level)
+{
+	if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
+		__flush_tlb_range_new(vma, start, end, stride, last_level);
+	else
+		__flush_tlb_range_old(vma, start, end, stride, last_level);
+}
+
 static inline void flush_tlb_range(struct vm_area_struct *vma,
 				   unsigned long start, unsigned long end)
 {
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 80f459ad0190..bdefd8a34729 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1566,6 +1566,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.min_field_value = 1,
 	},
 #endif
+	{
+		.desc = "TLB range maintenance instruction",
+		.capability = ARM64_HAS_TLBI_RANGE,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		.sys_reg = SYS_ID_AA64ISAR0_EL1,
+		.field_pos = ID_AA64ISAR0_TLB_SHIFT,
+		.sign = FTR_UNSIGNED,
+		.min_field_value = ID_AA64ISAR0_TLB_RANGE,
+	},
 	{},
 };

-- 
2.19.1




* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-11 13:23 [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions Zhenyu Ye
@ 2019-11-11 13:27 ` Will Deacon
  2019-11-11 13:47   ` Zhenyu Ye
  2019-11-11 16:32 ` Olof Johansson
  1 sibling, 1 reply; 11+ messages in thread
From: Will Deacon @ 2019-11-11 13:27 UTC (permalink / raw)
  To: Zhenyu Ye
  Cc: catalin.marinas, maz, suzuki.poulose, mark.rutland, tangnianyao,
	xiexiangyou, linux-kernel, arm

On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
> ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
> range of input addresses. This patch adds support for this feature.
> This is the second version of the patch.
> 
> I traced the __flush_tlb_range() for a minute and get some statistical
> data as below:
> 
> 	PAGENUM		COUNT
> 	1		34944
> 	2		5683
> 	3		1343
> 	4		7857
> 	5		838
> 	9		339
> 	16		933
> 	19		427
> 	20		5821
> 	23		279
> 	41		338
> 	141		279
> 	512		428
> 	1668		120
> 	2038		100
> 
> Those data are based on kernel-5.4.0, where PAGENUM = end - start, COUNT
> shows number of calls to the __flush_tlb_range() in a minute. There only
> shows the data which COUNT >= 100. The kernel is started normally, and
> transparent hugepage is opened. As we can see, though most user TLBI
> ranges were 1 pages long, the num of long-range can not be ignored.
> 
> The new feature of TLB range can improve lots of performance compared to
> the current implementation. As an example, flush 512 ranges needs only 1
> instruction as opposed to 512 instructions using current implementation.
> 
> And for a new hardware feature, support is better than not.
> 
> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
> ---
> ChangeLog v1 -> v2:
> - Change the main implementation of this feature.
> - Add some comments.

How does this address my concerns here:

https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/

?

Will


* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-11 13:27 ` Will Deacon
@ 2019-11-11 13:47   ` Zhenyu Ye
  2019-11-11 14:04     ` Marc Zyngier
  0 siblings, 1 reply; 11+ messages in thread
From: Zhenyu Ye @ 2019-11-11 13:47 UTC (permalink / raw)
  To: Will Deacon
  Cc: catalin.marinas, maz, suzuki.poulose, mark.rutland, tangnianyao,
	xiexiangyou, linux-kernel, arm



On 2019/11/11 21:27, Will Deacon wrote:
> On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
>> ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
>> range of input addresses. This patch adds support for this feature.
>> This is the second version of the patch.
>>
>> I traced the __flush_tlb_range() for a minute and get some statistical
>> data as below:
>>
>> 	PAGENUM		COUNT
>> 	1		34944
>> 	2		5683
>> 	3		1343
>> 	4		7857
>> 	5		838
>> 	9		339
>> 	16		933
>> 	19		427
>> 	20		5821
>> 	23		279
>> 	41		338
>> 	141		279
>> 	512		428
>> 	1668		120
>> 	2038		100
>>
>> Those data are based on kernel-5.4.0, where PAGENUM = end - start, COUNT
>> shows number of calls to the __flush_tlb_range() in a minute. There only
>> shows the data which COUNT >= 100. The kernel is started normally, and
>> transparent hugepage is opened. As we can see, though most user TLBI
>> ranges were 1 pages long, the num of long-range can not be ignored.
>>
>> The new feature of TLB range can improve lots of performance compared to
>> the current implementation. As an example, flush 512 ranges needs only 1
>> instruction as opposed to 512 instructions using current implementation.
>>
>> And for a new hardware feature, support is better than not.
>>
>> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
>> ---
>> ChangeLog v1 -> v2:
>> - Change the main implementation of this feature.
>> - Add some comments.
> 
> How does this address my concerns here:
> 
> https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/
> 
> ?
> 
> Will
> 
> .
> 

I think your concern is more about the hardware level, and we can do nothing
about that in software. The interconnect/DVM implementation is not exposed to
the software layer (and does not need to be), and should probably be
constrained at the hardware level.

Zhenyu



* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range  instructions
  2019-11-11 13:47   ` Zhenyu Ye
@ 2019-11-11 14:04     ` Marc Zyngier
  2019-11-11 14:23       ` Zhenyu Ye
  2019-11-19  1:13       ` Hanjun Guo
  0 siblings, 2 replies; 11+ messages in thread
From: Marc Zyngier @ 2019-11-11 14:04 UTC (permalink / raw)
  To: Zhenyu Ye
  Cc: Will Deacon, catalin.marinas, suzuki.poulose, mark.rutland,
	tangnianyao, xiexiangyou, linux-kernel, arm

On 2019-11-11 14:56, Zhenyu Ye wrote:
> On 2019/11/11 21:27, Will Deacon wrote:
>> On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
>>> ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
>>> range of input addresses. This patch adds support for this feature.
>>> This is the second version of the patch.
>>>
>>> I traced the __flush_tlb_range() for a minute and get some 
>>> statistical
>>> data as below:
>>>
>>> 	PAGENUM		COUNT
>>> 	1		34944
>>> 	2		5683
>>> 	3		1343
>>> 	4		7857
>>> 	5		838
>>> 	9		339
>>> 	16		933
>>> 	19		427
>>> 	20		5821
>>> 	23		279
>>> 	41		338
>>> 	141		279
>>> 	512		428
>>> 	1668		120
>>> 	2038		100
>>>
>>> Those data are based on kernel-5.4.0, where PAGENUM = end - start, 
>>> COUNT
>>> shows number of calls to the __flush_tlb_range() in a minute. There 
>>> only
>>> shows the data which COUNT >= 100. The kernel is started normally, 
>>> and
>>> transparent hugepage is opened. As we can see, though most user 
>>> TLBI
>>> ranges were 1 pages long, the num of long-range can not be ignored.
>>>
>>> The new feature of TLB range can improve lots of performance 
>>> compared to
>>> the current implementation. As an example, flush 512 ranges needs 
>>> only 1
>>> instruction as opposed to 512 instructions using current 
>>> implementation.
>>>
>>> And for a new hardware feature, support is better than not.
>>>
>>> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
>>> ---
>>> ChangeLog v1 -> v2:
>>> - Change the main implementation of this feature.
>>> - Add some comments.
>>
>> How does this address my concerns here:
>>
>> 
>> https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/
>>
>> ?
>>
>> Will
>>
>> .
>>
>
> I think your concern is more about the hardware level, and we can do
> nothing about
> this at all. The interconnect/DVM implementation is not exposed to
> software layer
> (and no need), and may should be constrained at hardware level.

You're missing the point here: the instruction may be implemented
and perfectly working at the CPU level, and yet not carried over
the interconnect. In this situation, other CPUs may not observe
the DVM messages instructing them of such invalidation, and you'll end
up with memory corruption.

So, in the absence of an architectural guarantee that range 
invalidation
is supported and observed by all the DVM agents in the system, there 
must
be a firmware description for it on which the kernel can rely.

         M.
-- 
Jazz is not dead. It just smells funny...


* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-11 14:04     ` Marc Zyngier
@ 2019-11-11 14:23       ` Zhenyu Ye
  2019-11-19  1:13       ` Hanjun Guo
  1 sibling, 0 replies; 11+ messages in thread
From: Zhenyu Ye @ 2019-11-11 14:23 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Will Deacon, catalin.marinas, suzuki.poulose, mark.rutland,
	tangnianyao, xiexiangyou, linux-kernel, arm



On 2019/11/11 22:04, Marc Zyngier wrote:
> On 2019-11-11 14:56, Zhenyu Ye wrote:
>> On 2019/11/11 21:27, Will Deacon wrote:
>>> On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
>>>> ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
>>>> range of input addresses. This patch adds support for this feature.
>>>> This is the second version of the patch.
>>>>
>>>> I traced the __flush_tlb_range() for a minute and get some statistical
>>>> data as below:
>>>>
>>>>     PAGENUM        COUNT
>>>>     1        34944
>>>>     2        5683
>>>>     3        1343
>>>>     4        7857
>>>>     5        838
>>>>     9        339
>>>>     16        933
>>>>     19        427
>>>>     20        5821
>>>>     23        279
>>>>     41        338
>>>>     141        279
>>>>     512        428
>>>>     1668        120
>>>>     2038        100
>>>>
>>>> Those data are based on kernel-5.4.0, where PAGENUM = end - start, COUNT
>>>> shows number of calls to the __flush_tlb_range() in a minute. There only
>>>> shows the data which COUNT >= 100. The kernel is started normally, and
>>>> transparent hugepage is opened. As we can see, though most user TLBI
>>>> ranges were 1 pages long, the num of long-range can not be ignored.
>>>>
>>>> The new feature of TLB range can improve lots of performance compared to
>>>> the current implementation. As an example, flush 512 ranges needs only 1
>>>> instruction as opposed to 512 instructions using current implementation.
>>>>
>>>> And for a new hardware feature, support is better than not.
>>>>
>>>> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
>>>> ---
>>>> ChangeLog v1 -> v2:
>>>> - Change the main implementation of this feature.
>>>> - Add some comments.
>>>
>>> How does this address my concerns here:
>>>
>>>
>>> https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/
>>>
>>> ?
>>>
>>> Will
>>>
>>> .
>>>
>>
>> I think your concern is more about the hardware level, and we can do
>> nothing about
>> this at all. The interconnect/DVM implementation is not exposed to
>> software layer
>> (and no need), and may should be constrained at hardware level.
> 
> You're missing the point here: the instruction may be implemented
> and perfectly working at the CPU level, and yet not carried over
> the interconnect. In this situation, other CPUs may not observe
> the DVM messages instructing them of such invalidation, and you'll end
> up with memory corruption.
> 
> So, in the absence of an architectural guarantee that range invalidation
> is supported and observed by all the DVM agents in the system, there must
> be a firmware description for it on which the kernel can rely.
> 
>         M.

OK, that sounds like an architectural defect. Some hardware-level improvements
may need to be made here, which is an area I am not familiar with.

However, we can discuss other aspects of the patch. In any case, this should
be supported by the kernel in the future, shouldn't it?




* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-11 13:23 [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions Zhenyu Ye
  2019-11-11 13:27 ` Will Deacon
@ 2019-11-11 16:32 ` Olof Johansson
  2019-11-12  2:19   ` Zhenyu Ye
  1 sibling, 1 reply; 11+ messages in thread
From: Olof Johansson @ 2019-11-11 16:32 UTC (permalink / raw)
  To: Zhenyu Ye
  Cc: Catalin Marinas, Will Deacon, Marc Zyngier, Suzuki K. Poulose,
	Mark Rutland, tangnianyao, xiexiangyou,
	Linux Kernel Mailing List, ARM-SoC Maintainers

On Mon, Nov 11, 2019 at 5:24 AM Zhenyu Ye <yezhenyu2@huawei.com> wrote:
>
> ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
> range of input addresses. This patch adds support for this feature.
> This is the second version of the patch.
>
> I traced the __flush_tlb_range() for a minute and get some statistical
> data as below:
>
>         PAGENUM         COUNT
>         1               34944
>         2               5683
>         3               1343
>         4               7857
>         5               838
>         9               339
>         16              933
>         19              427
>         20              5821
>         23              279
>         41              338
>         141             279
>         512             428
>         1668            120
>         2038            100
>
> Those data are based on kernel-5.4.0, where PAGENUM = end - start, COUNT
> shows number of calls to the __flush_tlb_range() in a minute. There only
> shows the data which COUNT >= 100. The kernel is started normally, and
> transparent hugepage is opened. As we can see, though most user TLBI
> ranges were 1 pages long, the num of long-range can not be ignored.
>
> The new feature of TLB range can improve lots of performance compared to
> the current implementation. As an example, flush 512 ranges needs only 1
> instruction as opposed to 512 instructions using current implementation.

But there's no indication whether this performs better or not in reality.

A perf report indicating, for example, cycles spent in TLBI on the two
versions would be a lot more valuable.

> And for a new hardware feature, support is better than not.

This is blatantly untrue. Complexity is added, and if there's no
evidence of benefit of said complexity, it is not something we want to
add.

> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
> ---
> ChangeLog v1 -> v2:
> - Change the main implementation of this feature.
> - Add some comments.
>
> ---
>  arch/arm64/include/asm/cpucaps.h  |  3 +-
>  arch/arm64/include/asm/sysreg.h   |  4 ++
>  arch/arm64/include/asm/tlbflush.h | 99 ++++++++++++++++++++++++++++++-
>  arch/arm64/kernel/cpufeature.c    | 10 ++++
>  4 files changed, 112 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
> index ac1dbca3d0cd..5b5230060e5b 100644
> --- a/arch/arm64/include/asm/cpucaps.h
> +++ b/arch/arm64/include/asm/cpucaps.h
> @@ -54,7 +54,8 @@
>  #define ARM64_WORKAROUND_1463225               44
>  #define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM    45
>  #define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM   46
> +#define ARM64_HAS_TLBI_RANGE                   47
>
> -#define ARM64_NCAPS                            47
> +#define ARM64_NCAPS                            48
>
>  #endif /* __ASM_CPUCAPS_H */
> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
> index 6e919fafb43d..a6abbf2b067d 100644
> --- a/arch/arm64/include/asm/sysreg.h
> +++ b/arch/arm64/include/asm/sysreg.h
> @@ -539,6 +539,7 @@
>                          ENDIAN_SET_EL1 | SCTLR_EL1_UCI  | SCTLR_EL1_RES1)
>
>  /* id_aa64isar0 */
> +#define ID_AA64ISAR0_TLB_SHIFT         56
>  #define ID_AA64ISAR0_TS_SHIFT          52
>  #define ID_AA64ISAR0_FHM_SHIFT         48
>  #define ID_AA64ISAR0_DP_SHIFT          44
> @@ -552,6 +553,9 @@
>  #define ID_AA64ISAR0_SHA1_SHIFT                8
>  #define ID_AA64ISAR0_AES_SHIFT         4
>
> +#define ID_AA64ISAR0_TLB_NI            0x0
> +#define ID_AA64ISAR0_TLB_RANGE         0x2
> +
>  /* id_aa64isar1 */
>  #define ID_AA64ISAR1_SB_SHIFT          36
>  #define ID_AA64ISAR1_FRINTTS_SHIFT     32
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index bc3949064725..f49bed7ecb68 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -59,6 +59,33 @@
>                 __ta;                                           \
>         })
>
> +/*
> + * This macro creates a properly formatted VA operand for the TLBI RANGE.
> + * The value bit assignments are:
> + *
> + * +----------+------+-------+-------+-------+----------------------+
> + * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
> + * +-----------------+-------+-------+-------+----------------------+
> + * |63      48|47  46|45   44|43   39|38   37|36                   0|
> + *
> + * The address range is determined by below formula:
> + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
> + *
> + */
> +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)    \
> +       ({                                                      \
> +               unsigned long __ta = (addr) >> PAGE_SHIFT;      \
> +               __ta |= (unsigned long)(ttl) << 37;             \
> +               __ta |= (unsigned long)(num) << 39;             \
> +               __ta |= (unsigned long)(scale) << 44;           \
> +               __ta |= (unsigned long)(tg) << 46;              \
> +               __ta |= (unsigned long)(asid) << 48;            \
> +               __ta;                                           \
> +       })
> +
> +#define TLB_RANGE_MASK_SHIFT 5
> +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT, 0)
> +
>  /*
>   *     TLB Invalidation
>   *     ================
> @@ -177,9 +204,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>   */
>  #define MAX_TLBI_OPS   PTRS_PER_PTE
>
> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
> -                                    unsigned long start, unsigned long end,
> -                                    unsigned long stride, bool last_level)
> +static inline void __flush_tlb_range_old(struct vm_area_struct *vma,
> +                                        unsigned long start, unsigned long end,
> +                                        unsigned long stride, bool last_level)

"old" and "new" are not very descriptive.

>  {
>         unsigned long asid = ASID(vma->vm_mm);
>         unsigned long addr;
> @@ -211,6 +238,72 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>         dsb(ish);
>  }
>
> +static inline void __flush_tlb_range_new(struct vm_area_struct *vma,
> +                                        unsigned long start, unsigned long end,
> +                                        unsigned long stride, bool last_level)
> +{
> +       int num = 0;
> +       int scale = 0;
> +       int ttl = 0;
> +       int tg = (PAGE_SHIFT - 12) / 2 + 1;

This is a constant, and shouldn't need to be a variable. You can push
it down to the addr generator macro.

> +       unsigned long asid = ASID(vma->vm_mm);
> +       unsigned long addr = 0;
> +       unsigned long offset = (end - start) >> PAGE_SHIFT;

"offset" confused me a lot here -- I think this variable really
describes number of pages to flush?

And, if so, you're probably off-by-one here: you need to round up
partial pages. As a matter of fact, you probably need to deal with
partial pages at both beginning and end.
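
I.e. something along these lines (just a sketch, using the kernel's
round_down()/round_up() helpers; "pages" is what the patch calls "offset"):

	start = round_down(start, PAGE_SIZE);
	end = round_up(end, PAGE_SIZE);
	pages = (end - start) >> PAGE_SHIFT;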

> +       if (offset > (1UL << 21)) {
> +               flush_tlb_mm(vma->vm_mm);
> +               return;
> +       }

There's a comment that this limitation on iterative flushes is
arbitrary, and selected to not trigger soft lockups. At the very
least, you need a similar comment here as to why this code is needed.

> +       dsb(ishst);
> +
> +       /*
> +        * The minimum size of TLB RANGE is 2 PAGE;
> +        * Use normal TLB instruction to handle odd PAGEs
> +        */
> +       if (offset % 2 == 1) {
> +               addr = __TLBI_VADDR(start, asid);
> +               if (last_level) {
> +                       __tlbi(vale1is, addr);
> +                       __tlbi_user(vale1is, addr);
> +               } else {
> +                       __tlbi(vae1is, addr);
> +                       __tlbi_user(vae1is, addr);
> +               }
> +               start += 1 << PAGE_SHIFT;
> +               offset -= 1;
> +       }
> +
> +       while (offset > 0) {
> +               num = (offset & TLB_RANGE_MASK) - 1;
> +               if (num >= 0) {
> +                       addr = __TLBI_VADDR_RANGE(start, asid, tg,
> +                                                 scale, num, ttl);
> +                       if (last_level) {
> +                               __tlbi(rvale1is, addr);
> +                               __tlbi_user(rvale1is, addr);
> +                       } else {
> +                               __tlbi(rvae1is, addr);
> +                               __tlbi_user(rvae1is, addr);
> +                       }
> +                       start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;
> +               }
> +               scale++;

This is an odd way of doing the loop, by looping over the base and
linearly increasing the exponent.

Wouldn't it be easier to start with as high 'num' as possible as long
as the range ("offset" in your code) is larger than 2^5, and then do a
few iterations at the end for the smaller ranges?

> +               offset >>= TLB_RANGE_MASK_SHIFT;
> +       }
> +       dsb(ish);
> +}

The inner pieces of this loop, the special case at the beginning, and
the old implementation are all the same.

The main difference between now and before are:

1) How much you step forward on each iteration
2) How you calculate the address argument

Would it be better to just refactor the old code? You can calculate
the ranges the same way but just loop over them for non-8.4-TLBI
platforms.

No matter what, we really want some benchmarks and numbers to motivate
these changes. TLB operations tend to get on critical paths where
single cycles matter.

> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
> +                                    unsigned long start, unsigned long end,
> +                                    unsigned long stride, bool last_level)
> +{
> +       if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
> +               __flush_tlb_range_new(vma, start, end, stride, last_level);
> +       else
> +               __flush_tlb_range_old(vma, start, end, stride, last_level);
> +}
> +
>  static inline void flush_tlb_range(struct vm_area_struct *vma,
>                                    unsigned long start, unsigned long end)
>  {
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 80f459ad0190..bdefd8a34729 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1566,6 +1566,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>                 .min_field_value = 1,
>         },
>  #endif
> +       {
> +               .desc = "TLB range maintenance instruction",
> +               .capability = ARM64_HAS_TLBI_RANGE,
> +               .type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +               .matches = has_cpuid_feature,
> +               .sys_reg = SYS_ID_AA64ISAR0_EL1,
> +               .field_pos = ID_AA64ISAR0_TLB_SHIFT,
> +               .sign = FTR_UNSIGNED,
> +               .min_field_value = ID_AA64ISAR0_TLB_RANGE,
> +       },
>         {},
>  };
>
> --
> 2.19.1
>
>


* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-11 16:32 ` Olof Johansson
@ 2019-11-12  2:19   ` Zhenyu Ye
  0 siblings, 0 replies; 11+ messages in thread
From: Zhenyu Ye @ 2019-11-12  2:19 UTC (permalink / raw)
  To: Olof Johansson
  Cc: Catalin Marinas, Will Deacon, Marc Zyngier, Suzuki K. Poulose,
	Mark Rutland, tangnianyao, xiexiangyou,
	Linux Kernel Mailing List, ARM-SoC Maintainers



On 2019/11/12 0:32, Olof Johansson wrote:
> On Mon, Nov 11, 2019 at 5:24 AM Zhenyu Ye <yezhenyu2@huawei.com> wrote:
>>
>> ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
>> range of input addresses. This patch adds support for this feature.
>> This is the second version of the patch.
>>
>> I traced the __flush_tlb_range() for a minute and get some statistical
>> data as below:
>>
>>         PAGENUM         COUNT
>>         1               34944
>>         2               5683
>>         3               1343
>>         4               7857
>>         5               838
>>         9               339
>>         16              933
>>         19              427
>>         20              5821
>>         23              279
>>         41              338
>>         141             279
>>         512             428
>>         1668            120
>>         2038            100
>>
>> Those data are based on kernel-5.4.0, where PAGENUM = end - start, COUNT
>> shows number of calls to the __flush_tlb_range() in a minute. There only
>> shows the data which COUNT >= 100. The kernel is started normally, and
>> transparent hugepage is opened. As we can see, though most user TLBI
>> ranges were 1 pages long, the num of long-range can not be ignored.
>>
>> The new feature of TLB range can improve lots of performance compared to
>> the current implementation. As an example, flush 512 ranges needs only 1
>> instruction as opposed to 512 instructions using current implementation.
> 
> But there's no indication whether this performs better or not in reality.
> 
> A perf report indicating, for example, cycles spent in TLBI on the two
> versions would be a lot more valuable.
> 

We don't have a performance test environment supporting TLBI range right
now, so there is only some theoretical analysis. Anyway, the data above show
that wide page ranges do occur in practice.

>> And for a new hardware feature, support is better than not.
> 
> This is blatantly untrue. Complexity is added, and if there's no
> evidence of benefit of said complexity, it is not something we want to
> add.
> 

You are right. I will do some benchmarks when conditions permit. Data is
the most convincing. And I will modify my code according to your advice
below.

When the benchmark data is ready, I will send a new version of this patch.


>> Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
>> ---
>> ChangeLog v1 -> v2:
>> - Change the main implementation of this feature.
>> - Add some comments.
>>
>> ---
>>  arch/arm64/include/asm/cpucaps.h  |  3 +-
>>  arch/arm64/include/asm/sysreg.h   |  4 ++
>>  arch/arm64/include/asm/tlbflush.h | 99 ++++++++++++++++++++++++++++++-
>>  arch/arm64/kernel/cpufeature.c    | 10 ++++
>>  4 files changed, 112 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
>> index ac1dbca3d0cd..5b5230060e5b 100644
>> --- a/arch/arm64/include/asm/cpucaps.h
>> +++ b/arch/arm64/include/asm/cpucaps.h
>> @@ -54,7 +54,8 @@
>>  #define ARM64_WORKAROUND_1463225               44
>>  #define ARM64_WORKAROUND_CAVIUM_TX2_219_TVM    45
>>  #define ARM64_WORKAROUND_CAVIUM_TX2_219_PRFM   46
>> +#define ARM64_HAS_TLBI_RANGE                   47
>>
>> -#define ARM64_NCAPS                            47
>> +#define ARM64_NCAPS                            48
>>
>>  #endif /* __ASM_CPUCAPS_H */
>> diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
>> index 6e919fafb43d..a6abbf2b067d 100644
>> --- a/arch/arm64/include/asm/sysreg.h
>> +++ b/arch/arm64/include/asm/sysreg.h
>> @@ -539,6 +539,7 @@
>>                          ENDIAN_SET_EL1 | SCTLR_EL1_UCI  | SCTLR_EL1_RES1)
>>
>>  /* id_aa64isar0 */
>> +#define ID_AA64ISAR0_TLB_SHIFT         56
>>  #define ID_AA64ISAR0_TS_SHIFT          52
>>  #define ID_AA64ISAR0_FHM_SHIFT         48
>>  #define ID_AA64ISAR0_DP_SHIFT          44
>> @@ -552,6 +553,9 @@
>>  #define ID_AA64ISAR0_SHA1_SHIFT                8
>>  #define ID_AA64ISAR0_AES_SHIFT         4
>>
>> +#define ID_AA64ISAR0_TLB_NI            0x0
>> +#define ID_AA64ISAR0_TLB_RANGE         0x2
>> +
>>  /* id_aa64isar1 */
>>  #define ID_AA64ISAR1_SB_SHIFT          36
>>  #define ID_AA64ISAR1_FRINTTS_SHIFT     32
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index bc3949064725..f49bed7ecb68 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -59,6 +59,33 @@
>>                 __ta;                                           \
>>         })
>>
>> +/*
>> + * This macro creates a properly formatted VA operand for the TLBI RANGE.
>> + * The value bit assignments are:
>> + *
>> + * +----------+------+-------+-------+-------+----------------------+
>> + * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
>> + * +-----------------+-------+-------+-------+----------------------+
>> + * |63      48|47  46|45   44|43   39|38   37|36                   0|
>> + *
>> + * The address range is determined by below formula:
>> + * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
>> + *
>> + */
>> +#define __TLBI_VADDR_RANGE(addr, asid, tg, scale, num, ttl)    \
>> +       ({                                                      \
>> +               unsigned long __ta = (addr) >> PAGE_SHIFT;      \
>> +               __ta |= (unsigned long)(ttl) << 37;             \
>> +               __ta |= (unsigned long)(num) << 39;             \
>> +               __ta |= (unsigned long)(scale) << 44;           \
>> +               __ta |= (unsigned long)(tg) << 46;              \
>> +               __ta |= (unsigned long)(asid) << 48;            \
>> +               __ta;                                           \
>> +       })
>> +
>> +#define TLB_RANGE_MASK_SHIFT 5
>> +#define TLB_RANGE_MASK GENMASK_ULL(TLB_RANGE_MASK_SHIFT, 0)
>> +
>>  /*
>>   *     TLB Invalidation
>>   *     ================
>> @@ -177,9 +204,9 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
>>   */
>>  #define MAX_TLBI_OPS   PTRS_PER_PTE
>>
>> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> -                                    unsigned long start, unsigned long end,
>> -                                    unsigned long stride, bool last_level)
>> +static inline void __flush_tlb_range_old(struct vm_area_struct *vma,
>> +                                        unsigned long start, unsigned long end,
>> +                                        unsigned long stride, bool last_level)
> 
> "old" and "new" are not very descriptive.
> 

I will change these.

>>  {
>>         unsigned long asid = ASID(vma->vm_mm);
>>         unsigned long addr;
>> @@ -211,6 +238,72 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>         dsb(ish);
>>  }
>>
>> +static inline void __flush_tlb_range_new(struct vm_area_struct *vma,
>> +                                        unsigned long start, unsigned long end,
>> +                                        unsigned long stride, bool last_level)
>> +{
>> +       int num = 0;
>> +       int scale = 0;
>> +       int ttl = 0;
>> +       int tg = (PAGE_SHIFT - 12) / 2 + 1;
> 
> This is a constant, and shouldn't need to be a variable. You can push
> it down to the addr generator macro.
> 

OK.

>> +       unsigned long asid = ASID(vma->vm_mm);
>> +       unsigned long addr = 0;
>> +       unsigned long offset = (end - start) >> PAGE_SHIFT;
> 
> "offset" confused me a lot here -- I think this variable really
> describes number of pages to flush?
> 

Yes. I will use round_down and round_up here.

> And, if so, you're probably off-by-one here: you need to round up
> partial pages. As a matter of fact, you probably need to deal with
> partial pages at both beginning and end.
> 
>> +       if (offset > (1UL << 21)) {
>> +               flush_tlb_mm(vma->vm_mm);
>> +               return;
>> +       }
> 
> There's a comment that this limitation on iterative flushes is
> arbitrary, and selected to not trigger soft lockups. At the very
> least, you need a similar comment here as to why this code is needed.
> 

This is the biggest range that a single TLBI range instruction can cover,
which is equal to (2^5 - 1 + 1) * 2^(3 * 5 + 1) = 2^21 pages. At most 4
range instructions are needed for anything below that, so this should not
trigger soft lockups. Maybe I should add a comment here.
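
For reference, the per-scale maxima implied by the encoding formula
(NUM + 1) * 2^(5*SCALE + 1), with NUM at most 31, are:

	SCALE 0:  32 * 2^1  =      64 pages
	SCALE 1:  32 * 2^6  =    2048 pages
	SCALE 2:  32 * 2^11 =   65536 pages
	SCALE 3:  32 * 2^16 = 2097152 pages (= 2^21)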

>> +       dsb(ishst);
>> +
>> +       /*
>> +        * The minimum size of TLB RANGE is 2 PAGE;
>> +        * Use normal TLB instruction to handle odd PAGEs
>> +        */
>> +       if (offset % 2 == 1) {
>> +               addr = __TLBI_VADDR(start, asid);
>> +               if (last_level) {
>> +                       __tlbi(vale1is, addr);
>> +                       __tlbi_user(vale1is, addr);
>> +               } else {
>> +                       __tlbi(vae1is, addr);
>> +                       __tlbi_user(vae1is, addr);
>> +               }
>> +               start += 1 << PAGE_SHIFT;
>> +               offset -= 1;
>> +       }
>> +
>> +       while (offset > 0) {
>> +               num = (offset & TLB_RANGE_MASK) - 1;
>> +               if (num >= 0) {
>> +                       addr = __TLBI_VADDR_RANGE(start, asid, tg,
>> +                                                 scale, num, ttl);
>> +                       if (last_level) {
>> +                               __tlbi(rvale1is, addr);
>> +                               __tlbi_user(rvale1is, addr);
>> +                       } else {
>> +                               __tlbi(rvae1is, addr);
>> +                               __tlbi_user(rvae1is, addr);
>> +                       }
>> +                       start += (num + 1) << (5 * scale + 1) << PAGE_SHIFT;
>> +               }
>> +               scale++;
> 
> This is an odd way of doing the loop, by looping over the base and
> linearly increasing the exponent.
> 
> Wouldn't it be easier to start with as high 'num' as possible as long
> as the range ("offset" in your code) is larger than 2^5, and then do a
> few iterations at the end for the smaller ranges?
> 

As the data above show, most of the ranges are small, so I decided to do
the flush from small scale to large. Going from large to small would need
some additional checks, which would add overhead on the software side. So
I think it's better to go from small to large.

>> +               offset >>= TLB_RANGE_MASK_SHIFT;
>> +       }
>> +       dsb(ish);
>> +}
> 
> The inner pieces of this loop, the special case at the beginning, and
> the old implementation are all the same.
> 
> The main difference between now and before are:
> 
> 1) How much you step forward on each iteration
> 2) How you calculate the address argument
> 
> Would it be better to just refactor the old code? You can calculate
> the ranges the same way but just loop over them for non-8.4-TLBI
> platforms.
> 

However, they only look similar and are actually different. There are some
small differences between the new and the old code, such as:

	__TLBI_VADDR(start, asid);
	__TLBI_VADDR_RANGE(start, asid, tg, scale, num, ttl);

	__tlbi(vale1is, addr);
	__tlbi(rvale1is, addr);

So I don't think it is worth refactoring the old code.

> No matter what, we really want some benchmarks and numbers to motivate
> these changes. TLB operations tend to get on critical paths where
> single cycles matter.
> 

I can understand your concern. When the benchmark data is ready,
I will send a new version of this patch.

>> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> +                                    unsigned long start, unsigned long end,
>> +                                    unsigned long stride, bool last_level)
>> +{
>> +       if (cpus_have_const_cap(ARM64_HAS_TLBI_RANGE))
>> +               __flush_tlb_range_new(vma, start, end, stride, last_level);
>> +       else
>> +               __flush_tlb_range_old(vma, start, end, stride, last_level);
>> +}
>> +
>>  static inline void flush_tlb_range(struct vm_area_struct *vma,
>>                                    unsigned long start, unsigned long end)
>>  {
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index 80f459ad0190..bdefd8a34729 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -1566,6 +1566,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
>>                 .min_field_value = 1,
>>         },
>>  #endif
>> +       {
>> +               .desc = "TLB range maintenance instruction",
>> +               .capability = ARM64_HAS_TLBI_RANGE,
>> +               .type = ARM64_CPUCAP_SYSTEM_FEATURE,
>> +               .matches = has_cpuid_feature,
>> +               .sys_reg = SYS_ID_AA64ISAR0_EL1,
>> +               .field_pos = ID_AA64ISAR0_TLB_SHIFT,
>> +               .sign = FTR_UNSIGNED,
>> +               .min_field_value = ID_AA64ISAR0_TLB_RANGE,
>> +       },
>>         {},
>>  };
>>
>> --
>> 2.19.1
>>
>>
> 
> .
> 



* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-11 14:04     ` Marc Zyngier
  2019-11-11 14:23       ` Zhenyu Ye
@ 2019-11-19  1:13       ` Hanjun Guo
  2019-11-19 10:03         ` Marc Zyngier
  1 sibling, 1 reply; 11+ messages in thread
From: Hanjun Guo @ 2019-11-19  1:13 UTC (permalink / raw)
  To: Marc Zyngier, Zhenyu Ye
  Cc: Will Deacon, catalin.marinas, suzuki.poulose, mark.rutland,
	tangnianyao, xiexiangyou, linux-kernel, arm, linux-arm-kernel,
	Linuxarm, Shaokun Zhang, wanghuiqiang

+Cc linux-arm-kernel mailing list and Shaokun.

Hi Marc,

On 2019/11/11 22:04, Marc Zyngier wrote:
> On 2019-11-11 14:56, Zhenyu Ye wrote:
>> On 2019/11/11 21:27, Will Deacon wrote:
>>> On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
[...]
>>>
>>> How does this address my concerns here:
>>>
>>>
>>> https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/
>>>
>>> ?
>>>
>>> Will
>>
>> I think your concern is more about the hardware level, and we can do
>> nothing about
>> this at all. The interconnect/DVM implementation is not exposed to
>> software layer
>> (and no need), and may should be constrained at hardware level.
> 
> You're missing the point here: the instruction may be implemented
> and perfectly working at the CPU level, and yet not carried over
> the interconnect. In this situation, other CPUs may not observe
> the DVM messages instructing them of such invalidation, and you'll end
> up with memory corruption.
> 
> So, in the absence of an architectural guarantee that range invalidation
> is supported and observed by all the DVM agents in the system, there must
> be a firmware description for it on which the kernel can rely.

I'm thinking of how to add a firmware description for it; how about this:

Add a system-level flag to indicate support for TLBI by range. That means
adding a binding name, for example "tlbi-by-range", at the system level in
the dts file, or a TLBI-by-range flag in the ACPI FADT table. Then we use
the per-CPU ID register together with the system-level flag:

if (cpus_have_const_cap(ARM64_HAS_TLBI_BY_RANGE) && system_level_tlbi_by_range)
	flush_tlb_by_range()
else
	flush_tlb_range()

And this seems to work for heterogeneous systems (where only some of the
CPUs support TLBI by range) as well; correct me if anything is wrong.
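
A minimal sketch of how the two checks could be combined in cpufeature.c
(the system_tlbi_by_range flag, and how it gets parsed from FDT/ACPI, are
assumptions here, not an existing binding):

	/* Hypothetical: set early from the FDT property / FADT flag. */
	static bool system_tlbi_by_range;

	static bool has_tlbi_by_range(const struct arm64_cpu_capabilities *entry,
				      int scope)
	{
		/*
		 * The CPU advertises the instructions *and* firmware says
		 * all DVM agents in the system understand them.
		 */
		return has_cpuid_feature(entry, scope) && system_tlbi_by_range;
	}

The capability entry would then use .matches = has_tlbi_by_range instead of
has_cpuid_feature.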

Thanks
Hanjun



* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range  instructions
  2019-11-19  1:13       ` Hanjun Guo
@ 2019-11-19 10:03         ` Marc Zyngier
  2019-11-20  1:29           ` Hanjun Guo
  2019-11-20  8:47           ` Will Deacon
  0 siblings, 2 replies; 11+ messages in thread
From: Marc Zyngier @ 2019-11-19 10:03 UTC (permalink / raw)
  To: Hanjun Guo
  Cc: Zhenyu Ye, Will Deacon, catalin.marinas, suzuki.poulose,
	mark.rutland, tangnianyao, xiexiangyou, linux-kernel, arm,
	linux-arm-kernel, Linuxarm, Shaokun Zhang, wanghuiqiang

Hi Hanjun,

On 2019-11-19 01:13, Hanjun Guo wrote:
> +Cc linux-arm-kernel mailing list and Shaokun.
>
> Hi Marc,
>
> On 2019/11/11 22:04, Marc Zyngier wrote:
>> On 2019-11-11 14:56, Zhenyu Ye wrote:
>>> On 2019/11/11 21:27, Will Deacon wrote:
>>>> On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
> [...]
>>>>
>>>> How does this address my concerns here:
>>>>
>>>>
>>>> 
>>>> https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/
>>>>
>>>> ?
>>>>
>>>> Will
>>>
>>> I think your concern is more about the hardware level, and we can 
>>> do
>>> nothing about
>>> this at all. The interconnect/DVM implementation is not exposed to
>>> software layer
>>> (and no need), and may should be constrained at hardware level.
>>
>> You're missing the point here: the instruction may be implemented
>> and perfectly working at the CPU level, and yet not carried over
>> the interconnect. In this situation, other CPUs may not observe
>> the DVM messages instructing them of such invalidation, and you'll 
>> end
>> up with memory corruption.
>>
>> So, in the absence of an architectural guarantee that range 
>> invalidation
>> is supported and observed by all the DVM agents in the system, there 
>> must
>> be a firmware description for it on which the kernel can rely.
>
> I'm thinking of how to add a firmware description for it, how about 
> this:
>
> Adding a system level flag to indicate the supporting of TIBi by 
> range,
> which means adding a binding name for example "tlbi-by-range" at 
> system
> level in the dts file, or a tlbi by range flag in ACPI FADT table, 
> then
> we use the ID register per-cpu and the system level flag as
>
> if (cpus_have_const_cap(ARM64_HAS_TLBI_BY_RANGE) &&
> system_level_tlbi_by_range)
> 	flush_tlb_by_range()
> else
> 	flush_tlb_range()
>
> And this seems work for heterogeneous system (olny parts of the CPU 
> support
> TLBi by range) as well, correct me if anything wrong.

It could work, but it needs to come with the strongest guarantees that
all the DVM agents in the system understand this type of invalidation,
specially as we move into the SVM territory. It may also need to cope
with non-compliant agents being hot-plugged, or at least discovered 
late.

I also wonder if the ARMv8.4-TTL extension (which I have patches for in
the nested virt series) requires the same kind of treatment (after all,
it has an implicit range based on the base granule size and level).

In any way, this requires careful specification, and I don't think
we can improvise this on the ML... ;-)

Thanks,

         M.
-- 
Jazz is not dead. It just smells funny...


* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-19 10:03         ` Marc Zyngier
@ 2019-11-20  1:29           ` Hanjun Guo
  2019-11-20  8:47           ` Will Deacon
  1 sibling, 0 replies; 11+ messages in thread
From: Hanjun Guo @ 2019-11-20  1:29 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Zhenyu Ye, Will Deacon, catalin.marinas, suzuki.poulose,
	mark.rutland, tangnianyao, xiexiangyou, linux-kernel, arm,
	linux-arm-kernel, Linuxarm, Shaokun Zhang, wanghuiqiang

Hi Marc,

On 2019/11/19 18:03, Marc Zyngier wrote:
> Hi Hanjun,
> 
> On 2019-11-19 01:13, Hanjun Guo wrote:
>> +Cc linux-arm-kernel mailing list and Shaokun.
>>
>> Hi Marc,
>>
>> On 2019/11/11 22:04, Marc Zyngier wrote:
>>> On 2019-11-11 14:56, Zhenyu Ye wrote:
>>>> On 2019/11/11 21:27, Will Deacon wrote:
>>>>> On Mon, Nov 11, 2019 at 09:23:55PM +0800, Zhenyu Ye wrote:
>> [...]
>>>>>
>>>>> How does this address my concerns here:
>>>>> https://lore.kernel.org/linux-arm-kernel/20191031131649.GB27196@willie-the-truck/
>>>>>
>>>>> ?
>>>>>
>>>>> Will
>>>>
>>>> I think your concern is more about the hardware level, and we can do
>>>> nothing about
>>>> this at all. The interconnect/DVM implementation is not exposed to
>>>> software layer
>>>> (and no need), and may should be constrained at hardware level.
>>>
>>> You're missing the point here: the instruction may be implemented
>>> and perfectly working at the CPU level, and yet not carried over
>>> the interconnect. In this situation, other CPUs may not observe
>>> the DVM messages instructing them of such invalidation, and you'll end
>>> up with memory corruption.
>>>
>>> So, in the absence of an architectural guarantee that range invalidation
>>> is supported and observed by all the DVM agents in the system, there must
>>> be a firmware description for it on which the kernel can rely.
>>
>> I'm thinking of how to add a firmware description for it, how about this:
>>
>> Adding a system level flag to indicate the supporting of TIBi by range,
>> which means adding a binding name for example "tlbi-by-range" at system
>> level in the dts file, or a tlbi by range flag in ACPI FADT table, then
>> we use the ID register per-cpu and the system level flag as
>>
>> if (cpus_have_const_cap(ARM64_HAS_TLBI_BY_RANGE) &&
>> system_level_tlbi_by_range)
>>     flush_tlb_by_range()
>> else
>>     flush_tlb_range()
>>
>> And this seems work for heterogeneous system (olny parts of the CPU support
>> TLBi by range) as well, correct me if anything wrong.
> 
> It could work, but it needs to come with the strongest guarantees that
> all the DVM agents in the system understand this type of invalidation,
> specially as we move into the SVM territory. It may also need to cope
> with non-compliant agents being hot-plugged, or at least discovered late.

Totally agreed; we are working on this at the system level, including the SMMU.

> 
> I also wonder if the ARMv8.4-TTL extension (which I have patches for in
> the nested virt series) requires the same kind of treatment (after all,
> it has an implicit range based on the base granule size and level).
> 
> In any way, this requires careful specification, and I don't think
> we can improvise this on the ML... ;-)

Sure :), the good news is that ARM has officially announced it will be
working with Huawei again.

So if I understand your point correctly, these are the steps we need to take:
 - The ARM spec needs to make TLBI by range crystal clear, with the
   guarantee written down in the spec;
 - Add a firmware description of system-level support for TLBI by range,
   for both FDT and ACPI;
 - Then upstream the code.

Thanks
Hanjun



* Re: [RFC PATCH v2] arm64: cpufeatures: add support for tlbi range instructions
  2019-11-19 10:03         ` Marc Zyngier
  2019-11-20  1:29           ` Hanjun Guo
@ 2019-11-20  8:47           ` Will Deacon
  1 sibling, 0 replies; 11+ messages in thread
From: Will Deacon @ 2019-11-20  8:47 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: Hanjun Guo, Zhenyu Ye, catalin.marinas, suzuki.poulose,
	mark.rutland, tangnianyao, xiexiangyou, linux-kernel, arm,
	linux-arm-kernel, Linuxarm, Shaokun Zhang, wanghuiqiang

On Tue, Nov 19, 2019 at 10:03:34AM +0000, Marc Zyngier wrote:
> On 2019-11-19 01:13, Hanjun Guo wrote:
> > I'm thinking of how to add a firmware description for it, how about
> > this:
> > 
> > Adding a system level flag to indicate the supporting of TIBi by range,
> > which means adding a binding name for example "tlbi-by-range" at system
> > level in the dts file, or a tlbi by range flag in ACPI FADT table, then
> > we use the ID register per-cpu and the system level flag as
> > 
> > if (cpus_have_const_cap(ARM64_HAS_TLBI_BY_RANGE) &&
> > system_level_tlbi_by_range)
> > 	flush_tlb_by_range()
> > else
> > 	flush_tlb_range()
> > 
> > And this seems work for heterogeneous system (olny parts of the CPU
> > support
> > TLBi by range) as well, correct me if anything wrong.
> 
> It could work, but it needs to come with the strongest guarantees that
> all the DVM agents in the system understand this type of invalidation,
> specially as we move into the SVM territory. It may also need to cope
> with non-compliant agents being hot-plugged, or at least discovered late.
> 
> I also wonder if the ARMv8.4-TTL extension (which I have patches for in
> the nested virt series) requires the same kind of treatment (after all,
> it has an implicit range based on the base granule size and level).

It would be good to get confirmation from Arm about this, since the TTL
extension doesn't have the dangerous 'Note' that the range ops do and it
wouldn't be difficult to ignore those bits in hardware where the system
doesn't support the hint for all agents (in comparison to upgrading range
ops to ALL, which may be unpalatable).

Will

