* [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
@ 2015-09-02 19:38 Helge Deller
  2015-09-02 19:46 ` John David Anglin
  2015-09-02 21:32 ` James Bottomley
  0 siblings, 2 replies; 7+ messages in thread
From: Helge Deller @ 2015-09-02 19:38 UTC (permalink / raw)
  To: linux-parisc; +Cc: James Bottomley, John David Anglin

Align the locks for the Light-weight syscall (LWS), which is used for
atomic userspace operations (e.g. gcc atomic builtins), on L1 cache
boundaries. This should speed up LWS calls on PA20 systems.

Reported-by: John David Anglin <dave.anglin@bell.net>
Signed-off-by: Helge Deller <deller@gmx.de>

diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3..80c2306 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -561,9 +561,9 @@ lws_compare_and_swap:
 	extru  %r26, 27, 4, %r20
 
 	/* Find lock to use, the hash is either one of 0 to
-	   15, multiplied by 16 (keep it 16-byte aligned)
+	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
 	   and add to the lock table offset. */
-	shlw	%r20, 4, %r20
+	shlw	%r20, L1_CACHE_SHIFT, %r20
 	add	%r20, %r28, %r20
 
 # if ENABLE_LWS_DEBUG
@@ -751,9 +751,9 @@ cas2_lock_start:
 	extru  %r26, 27, 4, %r20
 
 	/* Find lock to use, the hash is either one of 0 to
-	   15, multiplied by 16 (keep it 16-byte aligned)
+	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
 	   and add to the lock table offset. */
-	shlw	%r20, 4, %r20
+	shlw	%r20, L1_CACHE_SHIFT, %r20
 	add	%r20, %r28, %r20
 
 	rsm	PSW_SM_I, %r0			/* Disable interrupts */
@@ -931,11 +931,9 @@ END(sys_call_table64)
 ENTRY(lws_lock_start)
 	/* lws locks */
 	.rept 16
-	/* Keep locks aligned at 16-bytes */
+	/* Keep locks aligned to L1_CACHE_BYTES */
 	.word 1
-	.word 0 
-	.word 0
-	.word 0
+	.align	L1_CACHE_BYTES
 	.endr
 END(lws_lock_start)
 	.previous
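
For illustration, a rough C equivalent of the lock table layout after this
change (the names are made up; the real table is the .rept block in
syscall.S above):

/* Each lock word now occupies its own L1 cache line instead of sharing
 * a 16-byte slot next to its neighbours. */
#include <asm/cache.h>

struct lws_lock {
	unsigned int lock;	/* initialised to 1, the unlocked LDCW value */
} __attribute__((__aligned__(L1_CACHE_BYTES)));

static struct lws_lock lws_locks[16];	/* one entry per hash bucket */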


* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
  2015-09-02 19:38 [PATCH] parisc: Align locks for LWS syscalls to L1 cache size Helge Deller
@ 2015-09-02 19:46 ` John David Anglin
  2015-09-02 20:29   ` Helge Deller
  2015-09-02 21:32 ` James Bottomley
  1 sibling, 1 reply; 7+ messages in thread
From: John David Anglin @ 2015-09-02 19:46 UTC (permalink / raw)
  To: Helge Deller, linux-parisc; +Cc: James Bottomley

The LWS locks are also used for futex operations.  The shifts in
arch/parisc/include/asm/futex.h need a corresponding update.
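
For context, the index computation there currently hard-codes the 16-byte
(four u32) lock spacing:

	extern u32 lws_lock_start[];
	long index = ((long)uaddr & 0xf0) >> 2;	/* address bits 4-7, times 4 u32 slots */
	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];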

Dave

On 2015-09-02 3:38 PM, Helge Deller wrote:
> Align the locks for the Light weight syscall (LWS) which is used for
> atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> boundaries. This should speed up LWS calls on PA20 systems.
>
> Reported-by: John David Anglin <dave.anglin@bell.net>
> Signed-off-by: Helge Deller <deller@gmx.de>
>
> diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
> index 7ef22e3..80c2306 100644
> --- a/arch/parisc/kernel/syscall.S
> +++ b/arch/parisc/kernel/syscall.S
> @@ -561,9 +561,9 @@ lws_compare_and_swap:
>   	extru  %r26, 27, 4, %r20
>   
>   	/* Find lock to use, the hash is either one of 0 to
> -	   15, multiplied by 16 (keep it 16-byte aligned)
> +	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
>   	   and add to the lock table offset. */
> -	shlw	%r20, 4, %r20
> +	shlw	%r20, L1_CACHE_SHIFT, %r20
>   	add	%r20, %r28, %r20
>   
>   # if ENABLE_LWS_DEBUG
> @@ -751,9 +751,9 @@ cas2_lock_start:
>   	extru  %r26, 27, 4, %r20
>   
>   	/* Find lock to use, the hash is either one of 0 to
> -	   15, multiplied by 16 (keep it 16-byte aligned)
> +	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
>   	   and add to the lock table offset. */
> -	shlw	%r20, 4, %r20
> +	shlw	%r20, L1_CACHE_SHIFT, %r20
>   	add	%r20, %r28, %r20
>   
>   	rsm	PSW_SM_I, %r0			/* Disable interrupts */
> @@ -931,11 +931,9 @@ END(sys_call_table64)
>   ENTRY(lws_lock_start)
>   	/* lws locks */
>   	.rept 16
> -	/* Keep locks aligned at 16-bytes */
> +	/* Keep locks aligned to L1_CACHE_BYTES */
>   	.word 1
> -	.word 0
> -	.word 0
> -	.word 0
> +	.align	L1_CACHE_BYTES
>   	.endr
>   END(lws_lock_start)
>   	.previous
>
>


-- 
John David Anglin  dave.anglin@bell.net



* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
  2015-09-02 19:46 ` John David Anglin
@ 2015-09-02 20:29   ` Helge Deller
  2015-09-05 21:48     ` Helge Deller
  0 siblings, 1 reply; 7+ messages in thread
From: Helge Deller @ 2015-09-02 20:29 UTC (permalink / raw)
  To: John David Anglin, linux-parisc, James Bottomley

parisc: Align locks for LWS syscalls to L1 cache size (v2)

Align the locks for the Light-weight-syscall (LWS) which are used
for atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
boundaries. This should speed up LWS calls on PA20 systems.

Reported-by: John David Anglin <dave.anglin@bell.net>
Signed-off-by: Helge Deller <deller@gmx.de>

diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3..80c2306 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -561,9 +561,9 @@ lws_compare_and_swap:
 	extru  %r26, 27, 4, %r20
 
 	/* Find lock to use, the hash is either one of 0 to
-	   15, multiplied by 16 (keep it 16-byte aligned)
+	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
 	   and add to the lock table offset. */
-	shlw	%r20, 4, %r20
+	shlw	%r20, L1_CACHE_SHIFT, %r20
 	add	%r20, %r28, %r20
 
 # if ENABLE_LWS_DEBUG
@@ -751,9 +751,9 @@ cas2_lock_start:
 	extru  %r26, 27, 4, %r20
 
 	/* Find lock to use, the hash is either one of 0 to
-	   15, multiplied by 16 (keep it 16-byte aligned)
+	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
 	   and add to the lock table offset. */
-	shlw	%r20, 4, %r20
+	shlw	%r20, L1_CACHE_SHIFT, %r20
 	add	%r20, %r28, %r20
 
 	rsm	PSW_SM_I, %r0			/* Disable interrupts */
@@ -931,11 +931,9 @@ END(sys_call_table64)
 ENTRY(lws_lock_start)
 	/* lws locks */
 	.rept 16
-	/* Keep locks aligned at 16-bytes */
+	/* Keep locks aligned to L1_CACHE_BYTES */
 	.word 1
-	.word 0 
-	.word 0
-	.word 0
+	.align	L1_CACHE_BYTES
 	.endr
 END(lws_lock_start)
 	.previous


diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 49df148..47b075c 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -15,7 +15,7 @@ static inline void
 _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
 {
 	extern u32 lws_lock_start[];
-	long index = ((long)uaddr & 0xf0) >> 2;
+	long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
 	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
 	local_irq_save(*flags);
 	arch_spin_lock(s);
@@ -25,7 +25,7 @@ static inline void
 _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
 {
 	extern u32 lws_lock_start[];
-	long index = ((long)uaddr & 0xf0) >> 2;
+	long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
 	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
 	arch_spin_unlock(s);
 	local_irq_restore(*flags);


* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
  2015-09-02 19:38 [PATCH] parisc: Align locks for LWS syscalls to L1 cache size Helge Deller
  2015-09-02 19:46 ` John David Anglin
@ 2015-09-02 21:32 ` James Bottomley
  2015-09-02 22:18   ` Helge Deller
  1 sibling, 1 reply; 7+ messages in thread
From: James Bottomley @ 2015-09-02 21:32 UTC (permalink / raw)
  To: Helge Deller; +Cc: linux-parisc, John David Anglin

On Wed, 2015-09-02 at 21:38 +0200, Helge Deller wrote:
> Align the locks for the Light weight syscall (LWS) which is used for
> atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> boundaries. This should speed up LWS calls on PA20 systems.

Is there any evidence for this?  The architectural requirement for ldcw
on which all this is based is pegged at 16 bytes.  This implies that the
burst width on PA88/89 may indeed be 128 bytes, but the coherence width
for operations may still be 16 bytes.  If that speculation is true,
there's no speed at all gained by aligning ldcw to 128 bytes and all you
do is waste space.

James




* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
  2015-09-02 21:32 ` James Bottomley
@ 2015-09-02 22:18   ` Helge Deller
  0 siblings, 0 replies; 7+ messages in thread
From: Helge Deller @ 2015-09-02 22:18 UTC (permalink / raw)
  To: James Bottomley; +Cc: linux-parisc, John David Anglin

On 02.09.2015 23:32, James Bottomley wrote:
> On Wed, 2015-09-02 at 21:38 +0200, Helge Deller wrote:
>> Align the locks for the Light weight syscall (LWS) which is used for
>> atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
>> boundaries. This should speed up LWS calls on PA20 systems.
> 
> Is there any evidence for this?  The architectural requirement for ldcw
> on which all this is based is pegged at 16 bytes.  This implies that the
> burst width on PA88/89 may indeed be 128 bytes, but the coherence width
> for operations may still be 16 bytes.  If that speculation is true,
> there's no speed at all gained by aligning ldcw to 128 bytes and all you
> do is waste space.

Sure, we'll have to measure timings here...
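
Something along these lines should do as a quick userspace test, since the
gcc atomic builtins go through the LWS compare-and-swap path (a rough
sketch, not a polished benchmark -- thread count, spacing and iteration
count are arbitrary):

/* Build with: gcc -O2 -pthread cas-timing.c
 * Each thread hammers its own word; the words are spaced 16 bytes apart
 * so they hash to neighbouring LWS locks, which share a cache line before
 * the patch and sit on separate lines afterwards. */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS	4
#define ITERS		10000000L

static struct { unsigned int val; unsigned int pad[3]; } slot[NTHREADS];

static void *worker(void *arg)
{
	unsigned int *p = &slot[(long)arg].val;
	for (long i = 0; i < ITERS; i++)
		__sync_fetch_and_add(p, 1);	/* LWS CAS on parisc */
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &b);

	printf("%.3f s\n",
	       (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9);
	return 0;
}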

Helge



* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
  2015-09-02 20:29   ` Helge Deller
@ 2015-09-05 21:48     ` Helge Deller
  2015-09-07 20:51       ` Helge Deller
  0 siblings, 1 reply; 7+ messages in thread
From: Helge Deller @ 2015-09-05 21:48 UTC (permalink / raw)
  To: John David Anglin, linux-parisc, James Bottomley

On 02.09.2015 22:29, Helge Deller wrote:
> parisc: Align locks for LWS syscalls to L1 cache size (v2)
> 
> Align the locks for the Light-weight-syscall (LWS) which are used
> for atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> boundaries. This should speed up LWS calls on PA20 systems.
> 
> Reported-by: John David Anglin <dave.anglin@bell.net>
> Signed-off-by: Helge Deller <deller@gmx.de>


Any objections to this patch?
One idea below...


> 
> diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
> index 7ef22e3..80c2306 100644
> --- a/arch/parisc/kernel/syscall.S
> +++ b/arch/parisc/kernel/syscall.S
> @@ -561,9 +561,9 @@ lws_compare_and_swap:
>  	extru  %r26, 27, 4, %r20
>  
>  	/* Find lock to use, the hash is either one of 0 to
> -	   15, multiplied by 16 (keep it 16-byte aligned)
> +	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
>  	   and add to the lock table offset. */
> -	shlw	%r20, 4, %r20
> +	shlw	%r20, L1_CACHE_SHIFT, %r20
>  	add	%r20, %r28, %r20
>  
>  # if ENABLE_LWS_DEBUG
> @@ -751,9 +751,9 @@ cas2_lock_start:
>  	extru  %r26, 27, 4, %r20
>  
>  	/* Find lock to use, the hash is either one of 0 to
> -	   15, multiplied by 16 (keep it 16-byte aligned)
> +	   15, multiplied by L1_CACHE_BYTES (keep it L1 cache aligned)
>  	   and add to the lock table offset. */
> -	shlw	%r20, 4, %r20
> +	shlw	%r20, L1_CACHE_SHIFT, %r20
>  	add	%r20, %r28, %r20
>  
>  	rsm	PSW_SM_I, %r0			/* Disable interrupts */
> @@ -931,11 +931,9 @@ END(sys_call_table64)
>  ENTRY(lws_lock_start)
>  	/* lws locks */
>  	.rept 16
> -	/* Keep locks aligned at 16-bytes */
> +	/* Keep locks aligned to L1_CACHE_BYTES */
>  	.word 1
> -	.word 0 
> -	.word 0
> -	.word 0
> +	.align	L1_CACHE_BYTES
>  	.endr

I think this alignment/increase of each array entry to the size of L1_CACHE_BYTES
should be limited to the SMP case only... For UP, 16 bytes would be OK.
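
E.g. something along these lines in asm/cache.h (roughly what the updated
patch further down does):

#ifdef CONFIG_SMP
# define LWS_LOCK_ALIGN_BITS	L1_CACHE_SHIFT	/* one lock per L1 cache line */
#else
# define LWS_LOCK_ALIGN_BITS	4		/* 16 bytes, the LDCW minimum */
#endif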

Helge



>  END(lws_lock_start)
>  	.previous
> 
> 
> diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
> index 49df148..47b075c 100644
> --- a/arch/parisc/include/asm/futex.h
> +++ b/arch/parisc/include/asm/futex.h
> @@ -15,7 +15,7 @@ static inline void
>  _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
>  {
>  	extern u32 lws_lock_start[];
> -	long index = ((long)uaddr & 0xf0) >> 2;
> +	long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
>  	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
>  	local_irq_save(*flags);
>  	arch_spin_lock(s);
> @@ -25,7 +25,7 @@ static inline void
>  _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
>  {
>  	extern u32 lws_lock_start[];
> -	long index = ((long)uaddr & 0xf0) >> 2;
> +	long index = (((long)uaddr & 0xf0) >> 4) << (L1_CACHE_SHIFT-2);
>  	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
>  	arch_spin_unlock(s);
>  	local_irq_restore(*flags);



* Re: [PATCH] parisc: Align locks for LWS syscalls to L1 cache size
  2015-09-05 21:48     ` Helge Deller
@ 2015-09-07 20:51       ` Helge Deller
  0 siblings, 0 replies; 7+ messages in thread
From: Helge Deller @ 2015-09-07 20:51 UTC (permalink / raw)
  To: linux-parisc, James Bottomley; +Cc: John David Anglin

* Helge Deller <deller@gmx.de>:
> On 02.09.2015 22:29, Helge Deller wrote:
> > parisc: Align locks for LWS syscalls to L1 cache size (v2)
> > 
> > Align the locks for the Light-weight-syscall (LWS) which are used
> > for atomic userspace operations (e.g. gcc atomic builtins) on L1 cache
> > boundaries. This should speed up LWS calls on PA20 systems.
> > 
> > Reported-by: John David Anglin <dave.anglin@bell.net>
> > Signed-off-by: Helge Deller <deller@gmx.de>

Updated patch (v2):
- use 64 LWS locks (instead of 16)
- the LWS lock index is now calculated from the u32-sized offset of the
  address, because futexes operate on u32 values (before it was based on
  16-byte blocks)
- LWS locks are aligned to 16 bytes on UP and to the L1 cache size on SMP
  (to avoid different threads/processes contending on the same lock for
  futexes at different addresses); a worked example of the hashing is below
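
As a worked example of the new hashing (assuming an SMP kernel with
64-byte L1 cache lines, i.e. L1_CACHE_SHIFT = 6, and a made-up futex
address):

	/* futex at uaddr = 0x4008a4 (hypothetical):
	 *   hash        = (uaddr >> 2) & (LWS_NUM_LOCKS - 1)
	 *               = 0x100229 & 63 = 41
	 *   byte offset = hash << LWS_LOCK_ALIGN_BITS = 41 * 64 = 2624
	 * so the lock taken is (arch_spinlock_t *)&lws_lock_start[2624].
	 * Two word-aligned futexes now share a lock only if their addresses
	 * are equal modulo 256 (64 locks x 4-byte granularity). */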

Signed-off-by: Helge Deller <deller@gmx.de>


diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h
index 47f11c7..bb3d952 100644
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -22,6 +22,21 @@
 #define L1_CACHE_SHIFT 5
 #endif
 
+
+/* Number of Light-weight-syscall (LWS) spinlocks */
+#define LWS_NUM_LOCK_BITS	6
+#define LWS_NUM_LOCKS		(1 << LWS_NUM_LOCK_BITS)
+
+/* Number of bits for alignment of LWS locks.
+ * Needs to be at least 4 (=16 bytes) for safe operation of LDCW.  For SMP
+ * align locks on L1 cache size. */
+#ifdef CONFIG_SMP
+# define LWS_LOCK_ALIGN_BITS	L1_CACHE_SHIFT
+#else
+# define LWS_LOCK_ALIGN_BITS	4
+#endif
+
+
 #ifndef __ASSEMBLY__
 
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index 49df148..b79e469 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -7,16 +7,23 @@
 #include <linux/uaccess.h>
 #include <asm/atomic.h>
 #include <asm/errno.h>
+#include <asm/cache.h>
 
-/* The following has to match the LWS code in syscall.S.  We have
-   sixteen four-word locks. */
+/* The following has to match the LWS code in syscall.S. */
+static inline arch_spinlock_t *
+_lws_spinlockptr(u32 __user *uaddr)
+{
+	extern u8 lws_lock_start[]; /* in arch/parisc/kernel/syscall.S */
+	/* futexes operate on u32 values */
+	long index = (((unsigned long)uaddr >> 2) & (LWS_NUM_LOCKS-1));
+	index <<= LWS_LOCK_ALIGN_BITS;	/* multiply by alignment of the locks */
+	return (arch_spinlock_t *) &lws_lock_start[index];
+}
 
 static inline void
 _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
 {
-	extern u32 lws_lock_start[];
-	long index = ((long)uaddr & 0xf0) >> 2;
-	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
+	arch_spinlock_t *s = _lws_spinlockptr(uaddr);
 	local_irq_save(*flags);
 	arch_spin_lock(s);
 }
@@ -24,9 +31,7 @@ _futex_spin_lock_irqsave(u32 __user *uaddr, unsigned long int *flags)
 static inline void
 _futex_spin_unlock_irqrestore(u32 __user *uaddr, unsigned long int *flags)
 {
-	extern u32 lws_lock_start[];
-	long index = ((long)uaddr & 0xf0) >> 2;
-	arch_spinlock_t *s = (arch_spinlock_t *)&lws_lock_start[index];
+	arch_spinlock_t *s = _lws_spinlockptr(uaddr);
 	arch_spin_unlock(s);
 	local_irq_restore(*flags);
 }
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 7ef22e3..fb0dd94 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -557,13 +557,11 @@ lws_compare_and_swap:
 	ldil	L%lws_lock_start, %r20
 	ldo	R%lws_lock_start(%r20), %r28
 
-	/* Extract four bits from r26 and hash lock (Bits 4-7) */
-	extru  %r26, 27, 4, %r20
+	/* Extract lws lock entry from r26 */
+	extru  %r26, (31-2), LWS_NUM_LOCK_BITS, %r20
 
-	/* Find lock to use, the hash is either one of 0 to
-	   15, multiplied by 16 (keep it 16-byte aligned)
-	   and add to the lock table offset. */
-	shlw	%r20, 4, %r20
+	/* Find hash lock to use */
+	shlw	%r20, LWS_LOCK_ALIGN_BITS, %r20
 	add	%r20, %r28, %r20
 
 # if ENABLE_LWS_DEBUG
@@ -747,13 +745,11 @@ cas2_lock_start:
 	ldil	L%lws_lock_start, %r20
 	ldo	R%lws_lock_start(%r20), %r28
 
-	/* Extract four bits from r26 and hash lock (Bits 4-7) */
-	extru  %r26, 27, 4, %r20
+	/* Extract lws lock entry from r26 */
+	extru  %r26, (31-2), LWS_NUM_LOCK_BITS, %r20
 
-	/* Find lock to use, the hash is either one of 0 to
-	   15, multiplied by 16 (keep it 16-byte aligned)
-	   and add to the lock table offset. */
-	shlw	%r20, 4, %r20
+	/* Find hash lock to use */
+	shlw	%r20, LWS_LOCK_ALIGN_BITS, %r20
 	add	%r20, %r28, %r20
 
 	rsm	PSW_SM_I, %r0			/* Disable interrupts */
@@ -930,12 +926,10 @@ END(sys_call_table64)
 	.align	L1_CACHE_BYTES
 ENTRY(lws_lock_start)
 	/* lws locks */
-	.rept 16
-	/* Keep locks aligned at 16-bytes */
+	.rept LWS_NUM_LOCKS
+	/* Keep locks at least 16-byte aligned */
 	.word 1
-	.word 0 
-	.word 0
-	.word 0
+	.align (1 << LWS_LOCK_ALIGN_BITS)
 	.endr
 END(lws_lock_start)
 	.previous


