* [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
@ 2015-09-02 19:38 Helge Deller
2015-09-02 19:46 ` John David Anglin
2015-09-02 21:32 ` James Bottomley
0 siblings, 2 replies; 7+ messages in thread
From: Helge Deller @ 2015-09-02 19:38 UTC (permalink / raw)
To: linux-parisc; +Cc: James Bottomley, John David Anglin
Align the locks for the Light weight syscall (LWS) which is used for
atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
boundaries. This should speed up LWS calls on PA20 systems.
Reported-by: John David Anglin <dave.anglin@bell.net>
Signed-off-by: Helge Deller <deller@gmx.de>
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3..80c2306 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -561,9 +561,9 @@ lws_compare_and_swap:
extru %r26, 27, 4, %r20
/* Find lock to use, the hash is either one of 0 to
- 15, multiplied by 16 (keep it 16-byte aligned)
+ 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
and add to the lock table offset. */
- shlw %r20, 4, %r20
+ shlw %r20, L1_CACHE_SHIFT, %r20
add %r20, %r28, %r20
# if ENABLE_LWS_DEBUG
@@ -751,9 +751,9 @@ cas2_lock_start:
extru %r26, 27, 4, %r20
/* Find lock to use, the hash is either one of 0 to
- 15, multiplied by 16 (keep it 16-byte aligned)
+ 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
and add to the lock table offset. */
- shlw %r20, 4, %r20
+ shlw %r20, L1_CACHE_SHIFT, %r20
add %r20, %r28, %r20
rsm PSW_SM_I, %r0 /* Disable interrupts */
@@ -931,11 +931,9 @@ END(sys_call_table64)
ENTRY(lws_lock_start)
/* lws locks */
.rept 16
- /* Keep locks aligned at 16-bytes */
+ /* Keep locks aligned to L1_CACHE_BYTES */
.word 1
- .word 0
- .word 0
- .word 0
+ .align L1_CACHE_BYTES
.endr
END(lws_lock_start)
.previous
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
2015-09-02 19:38 [PATCH] parisc: Align locks for LWS syscalls to L1 cache size Helge Deller
@ 2015-09-02 19:46 ` John David Anglin
2015-09-02 20:29 ` Helge Deller
2015-09-02 21:32 ` James Bottomley
1 sibling, 1 reply; 7+ messages in thread
From: John David Anglin @ 2015-09-02 19:46 UTC (permalink / raw)
To: Helge Deller, linux-parisc; +Cc: James Bottomley
The LWS locks are also used for futex operations. The shifts in
arch/parisc/include/asm/futex.h need a corresponding update.
Dave
On 2015-09-02 3:38 PM, Helge Deller wrote:
> Align the locks for the Light weight syscall (LWS) which is used for
> atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> boundaries. This should speed up LWS calls on PA20 systems.
>
> Reported-by: John David Anglin <dave.anglin@bell.net>
> Signed-off-by: Helge Deller <deller@gmx.de>
>
> diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
> index 7ef22e3..80c2306 100644
> --- a/arch/parisc/kernel/syscall.S
> +++ b/arch/parisc/kernel/syscall.S
> @@ -561,9 +561,9 @@ lws_compare_and_swap:
> extru %r26, 27, 4, %r20
>
> /* Find lock to use, the hash is either one of 0 to
> - 15, multiplied by 16 (keep it 16-byte aligned)
> + 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
> and add to the lock table offset. */
> - shlw %r20, 4, %r20
> + shlw %r20, L1_CACHE_SHIFT, %r20
> add %r20, %r28, %r20
>
> # if ENABLE_LWS_DEBUG
> @@ -751,9 +751,9 @@ cas2_lock_start:
> extru %r26, 27, 4, %r20
>
> /* Find lock to use, the hash is either one of 0 to
> - 15, multiplied by 16 (keep it 16-byte aligned)
> + 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
> and add to the lock table offset. */
> - shlw %r20, 4, %r20
> + shlw %r20, L1_CACHE_SHIFT, %r20
> add %r20, %r28, %r20
>
> rsm PSW_SM_I, %r0 /* Disable interrupts */
> @@ -931,11 +931,9 @@ END(sys_call_table64)
> ENTRY(lws_lock_start)
> /* lws locks */
> .rept 16
> - /* Keep locks aligned at 16-bytes */
> + /* Keep locks aligned to L1_CACHE_BYTES */
> .word 1
> - .word 0
> - .word 0
> - .word 0
> + .align L1_CACHE_BYTES
> .endr
> END(lws_lock_start)
> .previous
>
>
--
John David Anglin dave.anglin@bell.net
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
2015-09-02 19:46 ` John David Anglin
@ 2015-09-02 20:29 ` Helge Deller
2015-09-05 21:48 ` Helge Deller
0 siblings, 1 reply; 7+ messages in thread
From: Helge Deller @ 2015-09-02 20:29 UTC (permalink / raw)
To: John David Anglin, linux-parisc, James Bottomley
parisc: Align locks for LWS syscalls to L1 cache size (v2)
Align the locks for the Light-weight-syscall (LWS) which are used
for atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
boundaries. This should speed up LWS calls on PA20 systems.
Reported-by: John David Anglin <dave.anglin@bell.net>
Signed-off-by: Helge Deller <deller@gmx.de>
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3..80c2306 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -561,9 +561,9 @@ lws_compare_and_swap:
extru %r26, 27, 4, %r20
/* Find lock to use, the hash is either one of 0 to
- 15, multiplied by 16 (keep it 16-byte aligned)
+ 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
and add to the lock table offset. */
- shlw %r20, 4, %r20
+ shlw %r20, L1_CACHE_SHIFT, %r20
add %r20, %r28, %r20
# if ENABLE_LWS_DEBUG
@@ -751,9 +751,9 @@ cas2_lock_start:
extru %r26, 27, 4, %r20
/* Find lock to use, the hash is either one of 0 to
- 15, multiplied by 16 (keep it 16-byte aligned)
+ 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
and add to the lock table offset. */
- shlw %r20, 4, %r20
+ shlw %r20, L1_CACHE_SHIFT, %r20
add %r20, %r28, %r20
rsm PSW_SM_I, %r0 /* Disable interrupts */
@@ -931,11 +931,9 @@ END(sys_call_table64)
ENTRY(lws_lock_start)
/* lws locks */
.rept 16
- /* Keep locks aligned at 16-bytes */
+ /* Keep locks aligned to L1_CACHE_BYTES */
.word 1
- .word 0
- .word 0
- .word 0
+ .align L1_CACHE_BYTES
.endr
END(lws_lock_start)
.previous
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 49df148..47b075c 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -15,7 +15,7 @@ static inline void
_futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
{
extern u32 lws_lock_start[];
- long index = ((long)uaddr & 0xf0) >> 2;
+ long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
local_irq_save(*flags);
arch_spin_lock(s);
@@ -25,7 +25,7 @@ static inline void
_futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
{
extern u32 lws_lock_start[];
- long index = ((long)uaddr & 0xf0) >> 2;
+ long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
arch_spin_unlock(s);
local_irq_restore(*flags);
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
2015-09-02 19:38 [PATCH] parisc: Align locks for LWS syscalls to L1 cache size Helge Deller
2015-09-02 19:46 ` John David Anglin
@ 2015-09-02 21:32 ` James Bottomley
2015-09-02 22:18 ` Helge Deller
1 sibling, 1 reply; 7+ messages in thread
From: James Bottomley @ 2015-09-02 21:32 UTC (permalink / raw)
To: Helge Deller; +Cc: linux-parisc, John David Anglin
On Wed, 2015-09-02 at 21:38 +0200, Helge Deller wrote:
> Align the locks for the Light weight syscall (LWS) which is used for
> atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> boundaries. This should speed up LWS calls on PA20 systems.
Is there any evidence for this? The architectural requirement for ldcw
on which all this is based is pegged at 16 bytes. This implies that the
burst width on PA88/89 may indeed be 128 bytes, but the coherence width
for operations may still be 16 bytes. If that speculation is true,
there's no speed at all gained by aligning ldcw to 128 bytes and all you
do is waste space.
James
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
2015-09-02 21:32 ` James Bottomley
@ 2015-09-02 22:18 ` Helge Deller
0 siblings, 0 replies; 7+ messages in thread
From: Helge Deller @ 2015-09-02 22:18 UTC (permalink / raw)
To: James Bottomley; +Cc: linux-parisc, John David Anglin
On 02.09.2015 23:32, James Bottomley wrote:
> On Wed, 2015-09-02 at 21:38 +0200, Helge Deller wrote:
>> Align the locks for the Light weight syscall (LWS) which is used for
>> atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
>> boundaries. This should speed up LWS calls on PA20 systems.
>
> Is there any evidence for this? The architectural requirement for ldcw
> on which all this is based is pegged at 16 bytes. This implies that the
> burst width on PA88/89 may indeed be 128 bytes, but the coherence width
> for operations may still be 16 bytes. If that speculation is true,
> there's no speed at all gained by aligning ldcw to 128 bytes and all you
> do is waste space.
Sure, we'll have to measure timings here...
Helge
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
2015-09-02 20:29 ` Helge Deller
@ 2015-09-05 21:48 ` Helge Deller
2015-09-07 20:51 ` Helge Deller
0 siblings, 1 reply; 7+ messages in thread
From: Helge Deller @ 2015-09-05 21:48 UTC (permalink / raw)
To: John David Anglin, linux-parisc, James Bottomley
On 02.09.2015 22:29, Helge Deller wrote:
> parisc: Align locks for LWS syscalls to L1 cache size (v2)
>
> Align the locks for the Light-weight-syscall (LWS) which are used
> for atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> boundaries. This should speed up LWS calls on PA20 systems.
>
> Reported-by: John David Anglin <dave.anglin@bell.net>
> Signed-off-by: Helge Deller <deller@gmx.de>
Any objections to this patch ?
One idea below...
>
> diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
> index 7ef22e3..80c2306 100644
> --- a/arch/parisc/kernel/syscall.S
> +++ b/arch/parisc/kernel/syscall.S
> @@ -561,9 +561,9 @@ lws_compare_and_swap:
> extru %r26, 27, 4, %r20
>
> /* Find lock to use, the hash is either one of 0 to
> - 15, multiplied by 16 (keep it 16-byte aligned)
> + 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
> and add to the lock table offset. */
> - shlw %r20, 4, %r20
> + shlw %r20, L1_CACHE_SHIFT, %r20
> add %r20, %r28, %r20
>
> # if ENABLE_LWS_DEBUG
> @@ -751,9 +751,9 @@ cas2_lock_start:
> extru %r26, 27, 4, %r20
>
> /* Find lock to use, the hash is either one of 0 to
> - 15, multiplied by 16 (keep it 16-byte aligned)
> + 15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
> and add to the lock table offset. */
> - shlw %r20, 4, %r20
> + shlw %r20, L1_CACHE_SHIFT, %r20
> add %r20, %r28, %r20
>
> rsm PSW_SM_I, %r0 /* Disable interrupts */
> @@ -931,11 +931,9 @@ END(sys_call_table64)
> ENTRY(lws_lock_start)
> /* lws locks */
> .rept 16
> - /* Keep locks aligned at 16-bytes */
> + /* Keep locks aligned to L1_CACHE_BYTES */
> .word 1
> - .word 0
> - .word 0
> - .word 0
> + .align L1_CACHE_BYTES
> .endr
I think this alignment/increase of each array entry to size of L1_CACHE_BYTES
should be limited to the SMP case only... For UP 16 bytes would be ok.
Helge
> END(lws_lock_start)
> .previous
>
>
> diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
> index 49df148..47b075c 100644
> --- a/arch/parisc/include/asm/futex.h
> +++ b/arch/parisc/include/asm/futex.h
> @@ -15,7 +15,7 @@ static inline void
> _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
> {
> extern u32 lws_lock_start[];
> - long index = ((long)uaddr & 0xf0) >> 2;
> + long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
> arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
> local_irq_save(*flags);
> arch_spin_lock(s);
> @@ -25,7 +25,7 @@ static inline void
> _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
> {
> extern u32 lws_lock_start[];
> - long index = ((long)uaddr & 0xf0) >> 2;
> + long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
> arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
> arch_spin_unlock(s);
> local_irq_restore(*flags);
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
2015-09-05 21:48 ` Helge Deller
@ 2015-09-07 20:51 ` Helge Deller
0 siblings, 0 replies; 7+ messages in thread
From: Helge Deller @ 2015-09-07 20:51 UTC (permalink / raw)
To: linux-parisc, James Bottomley; +Cc: John David Anglin
* Helge Deller <deller@gmx.de>:
> On 02.09.2015 22:29, Helge Deller wrote:
> > parisc: Align locks for LWS syscalls to L1 cache size (v2)
> >
> > Align the locks for the Light-weight-syscall (LWS) which are used
> > for atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> > boundaries. This should speed up LWS calls on PA20 systems.
> >
> > Reported-by: John David Anglin <dave.anglin@bell.net>
> > Signed-off-by: Helge Deller <deller@gmx.de>
Updated patch (v2):
- using 64 LWS locks (instead of 16)
- LWS lock index is calculated by offset of u32 type, because futexes
operate on u32 types (before based on 16 bytes)
- LWS locks aligned to 16byte on UP and to L1 cache size on SMP (to
avoid different threads/processes locking each other on futexes at
different addresses)
Signed-off-by: Helge Deller <deller@gmx.de>
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h
index 47f11c7..bb3d952 100644
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -22,6 +22,21 @@
#define L1_CACHE_SHIFT 5
#endif
+
+/* Number of Light-weight-syscall (LWS) spinlocks */
+#define LWS_NUM_LOCK_BITS 6
+#define LWS_NUM_LOCKS (1 << LWS_NUM_LOCK_BITS)
+
+/* Number of bits for alignment of LWS locks.
+ * Needs to be at least 4 (=16 bytes) for safe operation of LDCW. For SMP
+ * align locks on L1 cache size. */
+#ifdef CONFIG_SMP
+# define LWS_LOCK_ALIGN_BITS L1_CACHE_SHIFT
+#else
+# define LWS_LOCK_ALIGN_BITS 4
+#endif
+
+
#ifndef __ASSEMBLY__
#define SMP_CACHE_BYTES L1_CACHE_BYTES
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 49df148..b79e469 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -7,16 +7,23 @@
#include <linux/uaccess.h>
#include <asm/atomic.h>
#include <asm/errno.h>
+#include <asm/cache.h>
-/* The following has to match the LWS code in syscall.S. We have
- sixteen four-word locks. */
+/* The following has to match the LWS code in syscall.S. */
+static inline arch_spinlock_t *
+_lws_spinlockptr(u32 __user *uaddr)
+{
+ extern u8 lws_lock_start[]; /* in arch/parisc/kernel/syscall.S */
+ /* futexes operates on int values */
+ long index = (((unsigned long)uaddr >> 2) & (LWS_NUM_LOCKS-1));
+ index <<= LWS_LOCK_ALIGN_BITS; /* multiply by alignment of the locks */
+ return (arch_spinlock_t *) &lws_lock_start[index];
+}
static inline void
_futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
{
- extern u32 lws_lock_start[];
- long index = ((long)uaddr & 0xf0) >> 2;
- arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
+ arch_spinlock_t *s = _lws_spinlockptr(uaddr);
local_irq_save(*flags);
arch_spin_lock(s);
}
@@ -24,9 +31,7 @@ _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
static inline void
_futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
{
- extern u32 lws_lock_start[];
- long index = ((long)uaddr & 0xf0) >> 2;
- arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
+ arch_spinlock_t *s = _lws_spinlockptr(uaddr);
arch_spin_unlock(s);
local_irq_restore(*flags);
}
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3..fb0dd94 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -557,13 +557,11 @@ lws_compare_and_swap:
ldil L%lws_lock_start, %r20
ldo R%lws_lock_start(%r20), %r28
- /* Extract four bits from r26 and hash lock (Bits 4-7) */
- extru %r26, 27, 4, %r20
+ /* Extract lws lock entry from r26 */
+ extru %r26, (31-2), LWS_NUM_LOCK_BITS, %r20
- /* Find lock to use, the hash is either one of 0 to
- 15, multiplied by 16 (keep it 16-byte aligned)
- and add to the lock table offset. */
- shlw %r20, 4, %r20
+ /* Find hash lock to use */
+ shlw %r20, LWS_LOCK_ALIGN_BITS, %r20
add %r20, %r28, %r20
# if ENABLE_LWS_DEBUG
@@ -747,13 +745,11 @@ cas2_lock_start:
ldil L%lws_lock_start, %r20
ldo R%lws_lock_start(%r20), %r28
- /* Extract four bits from r26 and hash lock (Bits 4-7) */
- extru %r26, 27, 4, %r20
+ /* Extract lws lock entry from r26 */
+ extru %r26, (31-2), LWS_NUM_LOCK_BITS, %r20
- /* Find lock to use, the hash is either one of 0 to
- 15, multiplied by 16 (keep it 16-byte aligned)
- and add to the lock table offset. */
- shlw %r20, 4, %r20
+ /* Find hash lock to use */
+ shlw %r20, LWS_LOCK_ALIGN_BITS, %r20
add %r20, %r28, %r20
rsm PSW_SM_I, %r0 /* Disable interrupts */
@@ -930,12 +926,10 @@ END(sys_call_table64)
.align L1_CACHE_BYTES
ENTRY(lws_lock_start)
/* lws locks */
- .rept 16
- /* Keep locks aligned at 16-bytes */
+ .rept LWS_NUM_LOCKS
+ /* Keep locks at least 16-byte aligned */
.word 1
- .word 0
- .word 0
- .word 0
+ .align (1 << LWS_LOCK_ALIGN_BITS)
.endr
END(lws_lock_start)
.previous
^ permalink raw reply related [flat|nested] 7+ messages in thread
end of thread, other threads:[~2015-09-07 20:51 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-02 19:38 [PATCH] parisc: Align locks for LWS syscalls to L1 cache size Helge Deller
2015-09-02 19:46 ` John David Anglin
2015-09-02 20:29 ` Helge Deller
2015-09-05 21:48 ` Helge Deller
2015-09-07 20:51 ` Helge Deller
2015-09-02 21:32 ` James Bottomley
2015-09-02 22:18 ` Helge Deller
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.