linux-riscv.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE
@ 2020-07-14 21:26 Emil Renner Berthing
  2020-07-15  3:20 ` Anshuman Khandual
  2020-07-22  4:46 ` Palmer Dabbelt
  0 siblings, 2 replies; 5+ messages in thread
From: Emil Renner Berthing @ 2020-07-14 21:26 UTC (permalink / raw)
  To: linux-riscv
  Cc: linux-kernel, Anshuman Khandual, Palmer Dabbelt,
	Emil Renner Berthing, Paul Walmsley

This allows the pgtable tests to be built.

Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
---

The tests seem to succeed both in Qemu and on the HiFive Unleashed

Both with and without the recent additions in
https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khandual@arm.com/

 Documentation/features/debug/debug-vm-pgtable/arch-support.txt | 2 +-
 arch/riscv/Kconfig                                             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
index c527d05c0459..c9a764c3c795 100644
--- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -23,7 +23,7 @@
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
-    |       riscv: | TODO |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3230c1d48562..b4e674b1e857 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -15,6 +15,7 @@ config RISCV
 	select ARCH_CLOCKSOURCE_INIT
 	select ARCH_HAS_BINFMT_FLAT
 	select ARCH_HAS_DEBUG_VIRTUAL if MMU
+	select ARCH_HAS_DEBUG_VM_PGTABLE
 	select ARCH_HAS_DEBUG_WX
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GIGANTIC_PAGE
-- 
2.27.0


_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE
  2020-07-14 21:26 [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE Emil Renner Berthing
@ 2020-07-15  3:20 ` Anshuman Khandual
  2020-07-21  4:04   ` Palmer Dabbelt
  2020-07-22  4:46 ` Palmer Dabbelt
  1 sibling, 1 reply; 5+ messages in thread
From: Anshuman Khandual @ 2020-07-15  3:20 UTC (permalink / raw)
  To: Emil Renner Berthing, linux-riscv
  Cc: Palmer Dabbelt, linux-kernel, Paul Walmsley



On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
> This allows the pgtable tests to be built.
> 
> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> ---
> 
> The tests seem to succeed both in Qemu and on the HiFive Unleashed
> 
> Both with and without the recent additions in
> https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khandual@arm.com/

That's great, thanks for testing.

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE
  2020-07-15  3:20 ` Anshuman Khandual
@ 2020-07-21  4:04   ` Palmer Dabbelt
  2020-07-21  6:52     ` Emil Renner Berthing
  0 siblings, 1 reply; 5+ messages in thread
From: Palmer Dabbelt @ 2020-07-21  4:04 UTC (permalink / raw)
  To: anshuman.khandual; +Cc: linux-kernel, linux-riscv, kernel, Paul Walmsley

On Tue, 14 Jul 2020 20:20:54 PDT (-0700), anshuman.khandual@arm.com wrote:
>
>
> On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
>> This allows the pgtable tests to be built.
>>
>> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
>> ---
>>
>> The tests seem to succeed both in Qemu and on the HiFive Unleashed
>>
>> Both with and without the recent additions in
>> https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khandual@arm.com/
>
> That's great, thanks for testing.

Actually, looking at this I'm not sure it actually helps us any.  This changes
the behavior of two functions.  Pulling out the relevant sections, I see:

unsigned int __sw_hweight32(unsigned int w)
{
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
        w -= (w >> 1) & 0x55555555;
        w =  (w & 0x33333333) + ((w >> 2) & 0x33333333);
        w =  (w + (w >> 4)) & 0x0f0f0f0f;
        return (w * 0x01010101) >> 24;
#else
        unsigned int res = w - ((w >> 1) & 0x55555555);
        res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
        res = (res + (res >> 4)) & 0x0F0F0F0F;
        res = res + (res >> 8);
        return (res + (res >> 16)) & 0x000000FF;
#endif
}

and

unsigned long memchr_inv(unsigned long value64)
{
#if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
        value64 *= 0x0101010101010101ULL;
#elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
        value64 *= 0x01010101;
        value64 |= value64 << 32;
#else
        value64 |= value64 << 8;
        value64 |= value64 << 16;
        value64 |= value64 << 32;
#endif
	return value64;
}

GCC optimizer the multiplication out of the first one:

__sw_hweight32:
	li	a4,1431654400
	srliw	a5,a0,1
	addi	a4,a4,1365
	and	a5,a5,a4
	subw	a0,a0,a5
	li	a5,858992640
	srliw	a4,a0,2
	addi	a5,a5,819
	and	a0,a5,a0
	and	a5,a5,a4
	addw	a5,a0,a5
	srliw	a0,a5,4
	addw	a0,a0,a5
	li	a5,252645376
	addi	a5,a5,-241
	and	a5,a5,a0
	srliw	a0,a5,8
	addw	a5,a0,a5
	srliw	a0,a5,16
	addw	a0,a0,a5
	andi	a0,a0,0xff
	ret

__sw_hweight32:
	li	a5,1431654400
	srliw	a4,a0,1
	addi	a5,a5,1365
	and	a5,a5,a4
	subw	a0,a0,a5
	li	a5,858992640
	srliw	a4,a0,2
	addi	a5,a5,819
	and	a0,a5,a0
	and	a5,a5,a4
	addw	a5,a0,a5
	srliw	a0,a5,4
	addw	a5,a0,a5
	li	a0,252645376
	addi	a0,a0,-241
	and	a5,a0,a5
	slliw	a0,a5,8
	addw	a0,a0,a5
	slliw	a5,a0,16
	addw	a0,a0,a5
	srliw	a0,a0,24
	ret

so that doesn't matter.  The second one is really a wash: 

memchr_inv:
	ld	a5,.LC0
	mul	a0,a0,a5
	ret
.rodata
.LC0:
	.dword	72340172838076673

vs

memchr_inv:
	slli	a5,a0,8
	or	a5,a5,a0
	slli	a0,a5,16
	or	a0,a0,a5
	slli	a5,a0,32
	or	a0,a5,a0
	ret

It's unlikely that load ends up relaxed, so it's going to be two instructions.
That means we have 4 cycles to forward the load and multiply, for a cache hit.
IIRC the multiplier on the existing hardware isn't that fast -- GCC lists it as
imul as 10 cycles, but I remember it being more like 5 so that might just be an
architecture-inaccurate tuning in the generic pipeline model.  This is out of
the inner loop, so it's probably not all that important anyway.  The result
isn't used for a while so on a bigger machine it's probably worth picking the
smaller code path, but it seems like a very small thing to optimize for either
way.

I'm actually a bit surprised about this.  Do you have benchmarks that indicate
ARCH_HAS_FAST_MULTIPLIER is actually faster?  Otherwise I guess I'm going to
reject this, as it's really more
ARCH_HAS_FAST_MULTIPLIER_AND_FAST_LARGE_CONSTANTS than just
ARCH_HAS_FAST_MULTIPLIER.

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE
  2020-07-21  4:04   ` Palmer Dabbelt
@ 2020-07-21  6:52     ` Emil Renner Berthing
  0 siblings, 0 replies; 5+ messages in thread
From: Emil Renner Berthing @ 2020-07-21  6:52 UTC (permalink / raw)
  To: Palmer Dabbelt
  Cc: linux-riscv, Paul Walmsley, maochenxi, Linux Kernel Mailing List,
	Anshuman Khandual

On Tue, 21 Jul 2020 at 06:04, Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Tue, 14 Jul 2020 20:20:54 PDT (-0700), anshuman.khandual@arm.com wrote:
> >
> >
> > On 07/15/2020 02:56 AM, Emil Renner Berthing wrote:
> >> This allows the pgtable tests to be built.
> >>
> >> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> >> ---
> >>
> >> The tests seem to succeed both in Qemu and on the HiFive Unleashed
> >>
> >> Both with and without the recent additions in
> >> https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khandual@arm.com/
> >
> > That's great, thanks for testing.
>
> Actually, looking at this I'm not sure it actually helps us any.  This changes
> the behavior of two functions.  Pulling out the relevant sections, I see:
>
> unsigned int __sw_hweight32(unsigned int w)
> {
> #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
>         w -= (w >> 1) & 0x55555555;
>         w =  (w & 0x33333333) + ((w >> 2) & 0x33333333);
>         w =  (w + (w >> 4)) & 0x0f0f0f0f;
>         return (w * 0x01010101) >> 24;
> #else
>         unsigned int res = w - ((w >> 1) & 0x55555555);
>         res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
>         res = (res + (res >> 4)) & 0x0F0F0F0F;
>         res = res + (res >> 8);
>         return (res + (res >> 16)) & 0x000000FF;
> #endif
> }
>
> and
>
> unsigned long memchr_inv(unsigned long value64)
> {
> #if defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER) && BITS_PER_LONG == 64
>         value64 *= 0x0101010101010101ULL;
> #elif defined(CONFIG_ARCH_HAS_FAST_MULTIPLIER)
>         value64 *= 0x01010101;
>         value64 |= value64 << 32;
> #else
>         value64 |= value64 << 8;
>         value64 |= value64 << 16;
>         value64 |= value64 << 32;
> #endif
>         return value64;
> }
>
> GCC optimizer the multiplication out of the first one:
>
> __sw_hweight32:
>         li      a4,1431654400
>         srliw   a5,a0,1
>         addi    a4,a4,1365
>         and     a5,a5,a4
>         subw    a0,a0,a5
>         li      a5,858992640
>         srliw   a4,a0,2
>         addi    a5,a5,819
>         and     a0,a5,a0
>         and     a5,a5,a4
>         addw    a5,a0,a5
>         srliw   a0,a5,4
>         addw    a0,a0,a5
>         li      a5,252645376
>         addi    a5,a5,-241
>         and     a5,a5,a0
>         srliw   a0,a5,8
>         addw    a5,a0,a5
>         srliw   a0,a5,16
>         addw    a0,a0,a5
>         andi    a0,a0,0xff
>         ret
>
> __sw_hweight32:
>         li      a5,1431654400
>         srliw   a4,a0,1
>         addi    a5,a5,1365
>         and     a5,a5,a4
>         subw    a0,a0,a5
>         li      a5,858992640
>         srliw   a4,a0,2
>         addi    a5,a5,819
>         and     a0,a5,a0
>         and     a5,a5,a4
>         addw    a5,a0,a5
>         srliw   a0,a5,4
>         addw    a5,a0,a5
>         li      a0,252645376
>         addi    a0,a0,-241
>         and     a5,a0,a5
>         slliw   a0,a5,8
>         addw    a0,a0,a5
>         slliw   a5,a0,16
>         addw    a0,a0,a5
>         srliw   a0,a0,24
>         ret
>
> so that doesn't matter.  The second one is really a wash:
>
> memchr_inv:
>         ld      a5,.LC0
>         mul     a0,a0,a5
>         ret
> .rodata
> .LC0:
>         .dword  72340172838076673
>
> vs
>
> memchr_inv:
>         slli    a5,a0,8
>         or      a5,a5,a0
>         slli    a0,a5,16
>         or      a0,a0,a5
>         slli    a5,a0,32
>         or      a0,a5,a0
>         ret
>
> It's unlikely that load ends up relaxed, so it's going to be two instructions.
> That means we have 4 cycles to forward the load and multiply, for a cache hit.
> IIRC the multiplier on the existing hardware isn't that fast -- GCC lists it as
> imul as 10 cycles, but I remember it being more like 5 so that might just be an
> architecture-inaccurate tuning in the generic pipeline model.  This is out of
> the inner loop, so it's probably not all that important anyway.  The result
> isn't used for a while so on a bigger machine it's probably worth picking the
> smaller code path, but it seems like a very small thing to optimize for either
> way.
>
> I'm actually a bit surprised about this.  Do you have benchmarks that indicate
> ARCH_HAS_FAST_MULTIPLIER is actually faster?  Otherwise I guess I'm going to
> reject this, as it's really more
> ARCH_HAS_FAST_MULTIPLIER_AND_FAST_LARGE_CONSTANTS than just
> ARCH_HAS_FAST_MULTIPLIER.

Hi Palmer,

I think you meant this reply for
https://lore.kernel.org/linux-riscv/c5d82526-233a-15d5-90df-ca0c25a53639@eswin.com/T/#t

/Emil

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE
  2020-07-14 21:26 [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE Emil Renner Berthing
  2020-07-15  3:20 ` Anshuman Khandual
@ 2020-07-22  4:46 ` Palmer Dabbelt
  1 sibling, 0 replies; 5+ messages in thread
From: Palmer Dabbelt @ 2020-07-22  4:46 UTC (permalink / raw)
  To: kernel
  Cc: linux-kernel, linux-riscv, anshuman.khandual, kernel, Paul Walmsley

On Tue, 14 Jul 2020 14:26:11 PDT (-0700), kernel@esmil.dk wrote:
> This allows the pgtable tests to be built.
>
> Signed-off-by: Emil Renner Berthing <kernel@esmil.dk>
> ---
>
> The tests seem to succeed both in Qemu and on the HiFive Unleashed
>
> Both with and without the recent additions in
> https://lore.kernel.org/linux-riscv/1594610587-4172-1-git-send-email-anshuman.khandual@arm.com/
>
>  Documentation/features/debug/debug-vm-pgtable/arch-support.txt | 2 +-
>  arch/riscv/Kconfig                                             | 1 +
>  2 files changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
> index c527d05c0459..c9a764c3c795 100644
> --- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
> +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
> @@ -23,7 +23,7 @@
>      |    openrisc: | TODO |
>      |      parisc: | TODO |
>      |     powerpc: |  ok  |
> -    |       riscv: | TODO |
> +    |       riscv: |  ok  |
>      |        s390: |  ok  |
>      |          sh: | TODO |
>      |       sparc: | TODO |
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 3230c1d48562..b4e674b1e857 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -15,6 +15,7 @@ config RISCV
>  	select ARCH_CLOCKSOURCE_INIT
>  	select ARCH_HAS_BINFMT_FLAT
>  	select ARCH_HAS_DEBUG_VIRTUAL if MMU
> +	select ARCH_HAS_DEBUG_VM_PGTABLE
>  	select ARCH_HAS_DEBUG_WX
>  	select ARCH_HAS_GCOV_PROFILE_ALL
>  	select ARCH_HAS_GIGANTIC_PAGE

This is on for-next.  Thanks!

_______________________________________________
linux-riscv mailing list
linux-riscv@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-riscv

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-07-22  4:46 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-14 21:26 [PATCH] riscv: Select ARCH_HAS_DEBUG_VM_PGTABLE Emil Renner Berthing
2020-07-15  3:20 ` Anshuman Khandual
2020-07-21  4:04   ` Palmer Dabbelt
2020-07-21  6:52     ` Emil Renner Berthing
2020-07-22  4:46 ` Palmer Dabbelt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).