* [PATCH v3] kasan: speed up mte_set_mem_tag_range
@ 2021-05-17 23:55 Evgenii Stepanov
  2021-05-18 17:44 ` Catalin Marinas
  0 siblings, 1 reply; 5+ messages in thread
From: Evgenii Stepanov @ 2021-05-17 23:55 UTC (permalink / raw)
  To: Andrey Ryabinin, Alexander Potapenko, Andrey Konovalov,
	Dmitry Vyukov, Catalin Marinas, Will Deacon, Steven Price,
	Peter Collingbourne, Evgenii Stepanov, kasan-dev,
	linux-arm-kernel, linux-kernel

Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.

The first cacheline is always tagged using STG/STZG even if the address is
cacheline-aligned, as benchmarks show it is faster than a conditional
branch.
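
In rough C terms, the non-init path does the equivalent of the sketch
below (illustrative only, not the patch itself: it assumes the tag has
already been inserted into the pointer, that line_size is the DC GVA
block size derived from DCZID_EL0, and it reuses the __MTE_PREAMBLE /
MTE_GRANULE_SIZE definitions from the C helper this patch removes):

static void set_tag_range_sketch(u64 curr, u64 end, u64 line_size)
{
	/* The first DC GVA block is always finished with STG, even if
	 * curr is already block-aligned. */
	u64 head_end = (curr | (line_size - 1)) + 1;
	/* End of the region rounded down to a block boundary. */
	u64 body_end = end & ~(line_size - 1);

	if (end - curr >= 2 * line_size) {
		while (curr < head_end) {
			asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
				     : : "r" (curr) : "memory");
			curr += MTE_GRANULE_SIZE;
		}
		while (curr < body_end) {
			asm volatile(__MTE_PREAMBLE "dc gva, %0"
				     : : "r" (curr) : "memory");
			curr += line_size;
		}
	}
	while (curr < end) {
		asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
			     : : "r" (curr) : "memory");
		curr += MTE_GRANULE_SIZE;
	}
}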

Signed-off-by: Evgenii Stepanov <eugenis@google.com>
Co-developed-by: Peter Collingbourne <pcc@google.com>
Signed-off-by: Peter Collingbourne <pcc@google.com>
---
Changelog since v1:
- Added Co-developed-by.

Changelog since v2:
- Added Signed-off-by.

 arch/arm64/include/asm/mte-kasan.h | 40 +------------------
 arch/arm64/lib/Makefile            |  2 +
 arch/arm64/lib/mte-kasan.S         | 63 ++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 39 deletions(-)
 create mode 100644 arch/arm64/lib/mte-kasan.S

diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index ddd4d17cf9a0..e29a0e2ab35c 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
 	return mte_get_ptr_tag(addr);
 }
 
-/*
- * Assign allocation tags for a region of memory based on the pointer tag.
- * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
- */
-static inline void mte_set_mem_tag_range(void *addr, size_t size,
-						u8 tag, bool init)
-{
-	u64 curr, end;
-
-	if (!size)
-		return;
-
-	curr = (u64)__tag_set(addr, tag);
-	end = curr + size;
-
-	/*
-	 * 'asm volatile' is required to prevent the compiler to move
-	 * the statement outside of the loop.
-	 */
-	if (init) {
-		do {
-			asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
-				     :
-				     : "r" (curr)
-				     : "memory");
-			curr += MTE_GRANULE_SIZE;
-		} while (curr != end);
-	} else {
-		do {
-			asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
-				     :
-				     : "r" (curr)
-				     : "memory");
-			curr += MTE_GRANULE_SIZE;
-		} while (curr != end);
-	}
-}
-
+void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, bool init);
 void mte_enable_kernel_sync(void);
 void mte_enable_kernel_async(void);
 void mte_init_tags(u64 max_tag);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index d31e1169d9b8..c06ada79a437 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_ARM64_MTE) += mte.o
+
+obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
new file mode 100644
index 000000000000..9f6975e2af60
--- /dev/null
+++ b/arch/arm64/lib/mte-kasan.S
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Google Inc.
+ */
+#include <linux/const.h>
+#include <linux/linkage.h>
+
+#include <asm/mte-def.h>
+
+	.arch	armv8.5-a+memtag
+
+	.macro  __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
+	add	\tmp3, \start, \size
+	cmp	\size, \linesize, lsl #1
+	b.lt	.Lsmtr3_\@
+
+	sub	\tmp1, \linesize, #1
+	bic	\tmp2, \tmp3, \tmp1
+	orr	\tmp1, \start, \tmp1
+
+.Lsmtr1_\@:
+	\stg	\start, [\start], #MTE_GRANULE_SIZE
+	cmp	\start, \tmp1
+	b.lt	.Lsmtr1_\@
+
+.Lsmtr2_\@:
+	dc	\gva, \start
+	add	\start, \start, \linesize
+	cmp	\start, \tmp2
+	b.lt	.Lsmtr2_\@
+
+.Lsmtr3_\@:
+	cmp	\start, \tmp3
+	b.ge	.Lsmtr4_\@
+	\stg	\start, [\start], #MTE_GRANULE_SIZE
+	b	.Lsmtr3_\@
+.Lsmtr4_\@:
+	.endm
+
+/*
+ * Assign allocation tags for a region of memory based on the pointer tag.
+ * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
+ * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ *   x0 - start address
+ *   x1 - region size
+ *   x2 - tag
+ *   x3 - bool init
+ */
+SYM_FUNC_START(mte_set_mem_tag_range)
+	mrs	x4, dczid_el0
+	and	w4, w4, #0xf
+	mov	x5, #4
+	lsl	x4, x5, x4
+
+	bfi	x0, x2, #56, #8
+
+	cbz	x3, .Lnoinit
+	__set_mem_tag_range stzg, gzva, x0, x1, x4, x2, x3, x5
+	ret
+.Lnoinit:
+	__set_mem_tag_range stg, gva, x0, x1, x4, x2, x3, x5
+	ret
+SYM_FUNC_END(mte_set_mem_tag_range)
-- 
2.31.1.751.gd2f1c929bd-goog



* Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range
  2021-05-17 23:55 [PATCH v3] kasan: speed up mte_set_mem_tag_range Evgenii Stepanov
@ 2021-05-18 17:44 ` Catalin Marinas
  2021-05-18 18:11   ` Peter Collingbourne
  0 siblings, 1 reply; 5+ messages in thread
From: Catalin Marinas @ 2021-05-18 17:44 UTC (permalink / raw)
  To: Evgenii Stepanov
  Cc: Andrey Ryabinin, Alexander Potapenko, Andrey Konovalov,
	Dmitry Vyukov, Will Deacon, Steven Price, Peter Collingbourne,
	kasan-dev, linux-arm-kernel, linux-kernel

On Mon, May 17, 2021 at 04:55:46PM -0700, Evgenii Stepanov wrote:
> Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.
> 
> The first cacheline is always tagged using STG/STZG even if the address is
> cacheline-aligned, as benchmarks show it is faster than a conditional
> branch.
[...]
> diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
> index ddd4d17cf9a0..e29a0e2ab35c 100644
> --- a/arch/arm64/include/asm/mte-kasan.h
> +++ b/arch/arm64/include/asm/mte-kasan.h
> @@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
>  	return mte_get_ptr_tag(addr);
>  }
>  
> -/*
> - * Assign allocation tags for a region of memory based on the pointer tag.
> - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> - */
> -static inline void mte_set_mem_tag_range(void *addr, size_t size,
> -						u8 tag, bool init)

With commit 2cb34276427a ("arm64: kasan: simplify and inline MTE
functions") you wanted this inlined for performance. Does this not
matter much that it's now out of line?

> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index d31e1169d9b8..c06ada79a437 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
>  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
>  
>  obj-$(CONFIG_ARM64_MTE) += mte.o
> +
> +obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
> diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
> new file mode 100644
> index 000000000000..9f6975e2af60
> --- /dev/null
> +++ b/arch/arm64/lib/mte-kasan.S
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2021 Google Inc.
> + */
> +#include <linux/const.h>
> +#include <linux/linkage.h>
> +
> +#include <asm/mte-def.h>
> +
> +	.arch	armv8.5-a+memtag
> +
> +	.macro  __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
> +	add	\tmp3, \start, \size
> +	cmp	\size, \linesize, lsl #1
> +	b.lt	.Lsmtr3_\@

We could do with some comments here. Why the lsl #1? I think I get it
but it would be good to make this more readable.

It may be easier if you placed it in a file on its own (as it is now but
with a less generic file name) and use a few .req instead of the tmpX.
You can use the macro args only for the stg/gva.

> +
> +	sub	\tmp1, \linesize, #1
> +	bic	\tmp2, \tmp3, \tmp1
> +	orr	\tmp1, \start, \tmp1
> +
> +.Lsmtr1_\@:
> +	\stg	\start, [\start], #MTE_GRANULE_SIZE
> +	cmp	\start, \tmp1
> +	b.lt	.Lsmtr1_\@
> +
> +.Lsmtr2_\@:
> +	dc	\gva, \start
> +	add	\start, \start, \linesize
> +	cmp	\start, \tmp2
> +	b.lt	.Lsmtr2_\@
> +
> +.Lsmtr3_\@:
> +	cmp	\start, \tmp3
> +	b.ge	.Lsmtr4_\@
> +	\stg	\start, [\start], #MTE_GRANULE_SIZE
> +	b	.Lsmtr3_\@
> +.Lsmtr4_\@:
> +	.endm

If we want to get the best performance out of this, we should look at
the memset implementation and do something similar. In principle it's
not that far from a memzero, though depending on the microarchitecture
it may behave slightly differently.

Anyway, before that I wonder: if we wrote all this in C + inline asm
(three while loops or maybe two and some goto), what's the performance
difference? It has the advantage of being easier to maintain even if we
used some C macros to generate gva/gzva variants.

-- 
Catalin


* Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range
  2021-05-18 17:44 ` Catalin Marinas
@ 2021-05-18 18:11   ` Peter Collingbourne
  2021-05-19 18:12     ` Catalin Marinas
  0 siblings, 1 reply; 5+ messages in thread
From: Peter Collingbourne @ 2021-05-18 18:11 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Evgenii Stepanov, Andrey Ryabinin, Alexander Potapenko,
	Andrey Konovalov, Dmitry Vyukov, Will Deacon, Steven Price,
	kasan-dev, Linux ARM, Linux Kernel Mailing List

On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Mon, May 17, 2021 at 04:55:46PM -0700, Evgenii Stepanov wrote:
> > Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.
> >
> > The first cacheline is always tagged using STG/STZG even if the address is
> > cacheline-aligned, as benchmarks show it is faster than a conditional
> > branch.
> [...]
> > diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
> > index ddd4d17cf9a0..e29a0e2ab35c 100644
> > --- a/arch/arm64/include/asm/mte-kasan.h
> > +++ b/arch/arm64/include/asm/mte-kasan.h
> > @@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
> >       return mte_get_ptr_tag(addr);
> >  }
> >
> > -/*
> > - * Assign allocation tags for a region of memory based on the pointer tag.
> > - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> > - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> > - */
> > -static inline void mte_set_mem_tag_range(void *addr, size_t size,
> > -                                             u8 tag, bool init)
>
> With commit 2cb34276427a ("arm64: kasan: simplify and inline MTE
> functions") you wanted this inlined for performance. Does this not
> matter much that it's now out of line?
>
> > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> > index d31e1169d9b8..c06ada79a437 100644
> > --- a/arch/arm64/lib/Makefile
> > +++ b/arch/arm64/lib/Makefile
> > @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
> >  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> >
> >  obj-$(CONFIG_ARM64_MTE) += mte.o
> > +
> > +obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
> > diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
> > new file mode 100644
> > index 000000000000..9f6975e2af60
> > --- /dev/null
> > +++ b/arch/arm64/lib/mte-kasan.S
> > @@ -0,0 +1,63 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (C) 2021 Google Inc.
> > + */
> > +#include <linux/const.h>
> > +#include <linux/linkage.h>
> > +
> > +#include <asm/mte-def.h>
> > +
> > +     .arch   armv8.5-a+memtag
> > +
> > +     .macro  __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
> > +     add     \tmp3, \start, \size
> > +     cmp     \size, \linesize, lsl #1
> > +     b.lt    .Lsmtr3_\@
>
> We could do with some comments here. Why the lsl #1? I think I get it
> but it would be good to make this more readable.
>
> It may be easier if you placed it in a file on its own (as it is now but
> with a less generic file name) and use a few .req instead of the tmpX.
> You can use the macro args only for the stg/gva.

Yes, I think we could use more comments and .req here, like the
userspace version of this function that we use in Scudo:
https://cs.android.com/android/platform/superproject/+/master:external/scudo/standalone/memtag.h;l=150;drc=34c8857fef28eab205c22cbfb4bfda2f848e5a80

> > +
> > +     sub     \tmp1, \linesize, #1
> > +     bic     \tmp2, \tmp3, \tmp1
> > +     orr     \tmp1, \start, \tmp1
> > +
> > +.Lsmtr1_\@:
> > +     \stg    \start, [\start], #MTE_GRANULE_SIZE
> > +     cmp     \start, \tmp1
> > +     b.lt    .Lsmtr1_\@
> > +
> > +.Lsmtr2_\@:
> > +     dc      \gva, \start
> > +     add     \start, \start, \linesize
> > +     cmp     \start, \tmp2
> > +     b.lt    .Lsmtr2_\@
> > +
> > +.Lsmtr3_\@:
> > +     cmp     \start, \tmp3
> > +     b.ge    .Lsmtr4_\@
> > +     \stg    \start, [\start], #MTE_GRANULE_SIZE
> > +     b       .Lsmtr3_\@
> > +.Lsmtr4_\@:
> > +     .endm
>
> If we want to get the best performance out of this, we should look at
> the memset implementation and do something similar. In principle it's
> not that far from a memzero, though depending on the microarchitecture
> it may behave slightly differently.

For Scudo I compared our storeTags implementation linked above against
__mtag_tag_zero_region from the arm-optimized-routines repository
(which I think is basically an improved version of that memset
implementation rewritten to use STG and DC GZVA), and our
implementation performed better on the hardware that we have access
to.

> > Anyway, before that I wonder: if we wrote all this in C + inline asm
> (three while loops or maybe two and some goto), what's the performance
> difference? It has the advantage of being easier to maintain even if we
> used some C macros to generate gva/gzva variants.

I'm not sure I agree that it will be easier to maintain. Due to the
number of "unusual" instructions required here it seems more readable
to have the code in pure assembly than to require readers to switch
contexts between C and asm. If we did move it to inline asm then I
think it should basically be a large blob of asm like the Scudo code
that I linked.

Peter


* Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range
  2021-05-18 18:11   ` Peter Collingbourne
@ 2021-05-19 18:12     ` Catalin Marinas
  2021-05-19 19:52       ` Evgenii Stepanov
  0 siblings, 1 reply; 5+ messages in thread
From: Catalin Marinas @ 2021-05-19 18:12 UTC (permalink / raw)
  To: Peter Collingbourne
  Cc: Evgenii Stepanov, Andrey Ryabinin, Alexander Potapenko,
	Andrey Konovalov, Dmitry Vyukov, Will Deacon, Steven Price,
	kasan-dev, Linux ARM, Linux Kernel Mailing List

On Tue, May 18, 2021 at 11:11:52AM -0700, Peter Collingbourne wrote:
> On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > If we want to get the best performance out of this, we should look at
> > the memset implementation and do something similar. In principle it's
> > not that far from a memzero, though depending on the microarchitecture
> > it may behave slightly differently.
> 
> For Scudo I compared our storeTags implementation linked above against
> __mtag_tag_zero_region from the arm-optimized-routines repository
> (which I think is basically an improved version of that memset
> implementation rewritten to use STG and DC GZVA), and our
> implementation performed better on the hardware that we have access
> to.

That's the advantage of having hardware early ;).

> > Anyway, before that I wonder if we wrote all this in C + inline asm
> > (three while loops or maybe two and some goto), what's the performance
> > difference? It has the advantage of being easier to maintain even if we
> > used some C macros to generate gva/gzva variants.
> 
> I'm not sure I agree that it will be easier to maintain. Due to the
> number of "unusual" instructions required here it seems more readable
> to have the code in pure assembly than to require readers to switch
> contexts between C and asm. If we did move it to inline asm then I
> think it should basically be a large blob of asm like the Scudo code
> that I linked.

I was definitely not thinking of a big asm block; that's even less
readable than a separate .S file. It's more like adding dedicated macros
for single STG or DC GVA uses and using them in while loops.
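
Roughly something like this (just a sketch; the macro names are made up):

	#define mte_asm_stg(p)						\
		asm volatile(__MTE_PREAMBLE "stg %0, [%0]"		\
			     : : "r" (p) : "memory")

	#define mte_asm_dc_gva(p)					\
		asm volatile(__MTE_PREAMBLE "dc gva, %0"		\
			     : : "r" (p) : "memory")

	while (curr < line_end) {
		mte_asm_dc_gva(curr);
		curr += line_size;
	}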

Anyway, let's see a better commented .S implementation first. Given that
tagging is very sensitive to the performance of this function, we'd
probably benefit from a (few percent I suspect) perf improvement with
the hand-coded assembly.

-- 
Catalin


* Re: [PATCH v3] kasan: speed up mte_set_mem_tag_range
  2021-05-19 18:12     ` Catalin Marinas
@ 2021-05-19 19:52       ` Evgenii Stepanov
  0 siblings, 0 replies; 5+ messages in thread
From: Evgenii Stepanov @ 2021-05-19 19:52 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Peter Collingbourne, Andrey Ryabinin, Alexander Potapenko,
	Andrey Konovalov, Dmitry Vyukov, Will Deacon, Steven Price,
	kasan-dev, Linux ARM, Linux Kernel Mailing List

On Wed, May 19, 2021 at 11:13 AM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Tue, May 18, 2021 at 11:11:52AM -0700, Peter Collingbourne wrote:
> > On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
> > <catalin.marinas@arm.com> wrote:
> > > If we want to get the best performance out of this, we should look at
> > > the memset implementation and do something similar. In principle it's
> > > not that far from a memzero, though depending on the microarchitecture
> > > it may behave slightly differently.
> >
> > For Scudo I compared our storeTags implementation linked above against
> > __mtag_tag_zero_region from the arm-optimized-routines repository
> > (which I think is basically an improved version of that memset
> > implementation rewritten to use STG and DC GZVA), and our
> > implementation performed better on the hardware that we have access
> > to.
>
> That's the advantage of having hardware early ;).
>
> > > Anyway, before that I wonder: if we wrote all this in C + inline asm
> > > (three while loops or maybe two and some goto), what's the performance
> > > difference? It has the advantage of being easier to maintain even if we
> > > used some C macros to generate gva/gzva variants.
> >
> > I'm not sure I agree that it will be easier to maintain. Due to the
> > number of "unusual" instructions required here it seems more readable
> > to have the code in pure assembly than to require readers to switch
> > contexts between C and asm. If we did move it to inline asm then I
> > think it should basically be a large blob of asm like the Scudo code
> > that I linked.
>
> I was definitely not thinking of a big asm block; that's even less
> readable than a separate .S file. It's more like adding dedicated macros
> for single STG or DC GVA uses and using them in while loops.

I've got a C version with 4 single-instruction asm blocks, and it
looks pretty nice. The assembly is almost identical to the hand-written
variant, and performance is 3% better, presumably because of
the inlining. Also, the C version allows more potential optimizations,
like specialization on the value of "init" - which is not happening
right now because it is not constant in any of the callers.
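
Shape-wise it is roughly along these lines (a sketch, not the actual v4
code; tag_and_zero_range is a hypothetical caller, only there to show
where the specialization could kick in):

	static __always_inline void mte_tag_granule(u64 curr, bool init)
	{
		if (init)
			asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
				     : : "r" (curr) : "memory");
		else
			asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
				     : : "r" (curr) : "memory");
	}

	/* With a compile-time constant init the dead branch folds away. */
	static void tag_and_zero_range(u64 curr, u64 end)
	{
		do {
			mte_tag_granule(curr, true);
			curr += MTE_GRANULE_SIZE;
		} while (curr != end);
	}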

I'll upload a v4 shortly.

>
> Anyway, let's see a better commented .S implementation first. Given that
> tagging is very sensitive to the performance of this function, we'd
> probably benefit from a (few percent I suspect) perf improvement with
> the hand-coded assembly.
>
> --
> Catalin
