linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] rw_semaphores, optimisations try #3
@ 2001-04-23 20:35 D.W.Howells
  2001-04-23 21:34 ` Andrea Arcangeli
  2001-04-23 22:23 ` [PATCH] rw_semaphores, optimisations try #3 Linus Torvalds
  0 siblings, 2 replies; 19+ messages in thread
From: D.W.Howells @ 2001-04-23 20:35 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, dhowells, andrea, davem

[-- Attachment #1: Type: text/plain, Size: 882 bytes --]

This patch (made against linux-2.4.4-pre6) makes a number of changes to the
rwsem implementation:

 (1) Everything in try #2

plus

 (2) Changes proposed by Linus for the generic semaphore code.

 (3) Ideas from Andrea and how he implemented his semaphores.

Linus, you suggested that the generic list handling stuff would be faster (2 
unconditional stores) than mine (1 unconditional store and 1 conditional 
store and branch to jump round it). You are both right and wrong. The generic 
code does two stores per _process_ woken up (list_del) mine does the 1 or 2 
stores per _batch_ of processes woken up. So the generic way is better when 
the queue is an even mixture of readers or writers and my way is better when 
there are far greater numbers of waiting readers. However, that said, there 
is not much in it either way, so I've reverted it to the generic list stuff.

David

[-- Attachment #2: rw-semaphores optimisation, try #3 --]
[-- Type: text/plain, Size: 30651 bytes --]

diff -uNr linux-2.4.4-pre6/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c
--- linux-2.4.4-pre6/arch/i386/kernel/i386_ksyms.c	Sat Apr 21 21:24:25 2001
+++ linux/arch/i386/kernel/i386_ksyms.c	Sat Apr 21 22:52:50 2001
@@ -80,11 +80,6 @@
 EXPORT_SYMBOL_NOVERS(__down_failed_interruptible);
 EXPORT_SYMBOL_NOVERS(__down_failed_trylock);
 EXPORT_SYMBOL_NOVERS(__up_wakeup);
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-EXPORT_SYMBOL_NOVERS(__rwsem_down_write_failed);
-EXPORT_SYMBOL_NOVERS(__rwsem_down_read_failed);
-EXPORT_SYMBOL_NOVERS(__rwsem_wake);
-#endif
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
 /* Delay loops */
diff -uNr linux-2.4.4-pre6/arch/i386/lib/Makefile linux/arch/i386/lib/Makefile
--- linux-2.4.4-pre6/arch/i386/lib/Makefile	Sat Apr 21 21:24:25 2001
+++ linux/arch/i386/lib/Makefile	Sat Apr 21 22:52:50 2001
@@ -9,7 +9,7 @@
 
 obj-y = checksum.o old-checksum.o delay.o \
 	usercopy.o getuser.o putuser.o \
-	memcpy.o strstr.o rwsem.o
+	memcpy.o strstr.o
 
 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff -uNr linux-2.4.4-pre6/arch/i386/lib/rwsem.S linux/arch/i386/lib/rwsem.S
--- linux-2.4.4-pre6/arch/i386/lib/rwsem.S	Sat Apr 21 21:24:25 2001
+++ linux/arch/i386/lib/rwsem.S	Thu Jan  1 01:00:00 1970
@@ -1,36 +0,0 @@
-/* rwsem.S: R/W semaphores, register saving wrapper function stubs
- *
- * Written by David Howells (dhowells@redhat.com).
- * Derived from arch/i386/kernel/semaphore.c
- */
-
-.text
-.align 4
-.globl __rwsem_down_read_failed
-__rwsem_down_read_failed:
-	pushl	%edx
-	pushl	%ecx
-	call	rwsem_down_read_failed
-	popl	%ecx
-	popl	%edx
-	ret
-
-.align 4
-.globl __rwsem_down_write_failed
-__rwsem_down_write_failed:
-	pushl	%edx
-	pushl	%ecx
-	call	rwsem_down_write_failed
-	popl	%ecx
-	popl	%edx
-	ret
-
-.align 4
-.globl __rwsem_wake
-__rwsem_wake:
-	pushl	%edx
-	pushl	%ecx
-	call	rwsem_wake
-	popl	%ecx
-	popl	%edx
-	ret
diff -uNr linux-2.4.4-pre6/include/asm-i386/rwsem.h linux/include/asm-i386/rwsem.h
--- linux-2.4.4-pre6/include/asm-i386/rwsem.h	Sat Apr 21 21:24:32 2001
+++ linux/include/asm-i386/rwsem.h	Mon Apr 23 20:37:37 2001
@@ -17,11 +17,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 
-/* we use FASTCALL convention for the helpers */
-extern struct rw_semaphore *FASTCALL(__rwsem_down_read_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(__rwsem_down_write_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(__rwsem_wake(struct rw_semaphore *sem));
-
 struct rwsem_waiter;
 
 /*
@@ -41,11 +36,6 @@
 #if RWSEM_DEBUG
 	int			debug;
 #endif
-#if RWSEM_DEBUG_MAGIC
-	long			__magic;
-	atomic_t		readers;
-	atomic_t		writers;
-#endif
 };
 
 /*
@@ -56,15 +46,10 @@
 #else
 #define __RWSEM_DEBUG_INIT	/* */
 #endif
-#if RWSEM_DEBUG_MAGIC
-#define __RWSEM_DEBUG_MINIT(name)	, (int)&(name).__magic, ATOMIC_INIT(0), ATOMIC_INIT(0)
-#else
-#define __RWSEM_DEBUG_MINIT(name)	/* */
-#endif
 
 #define __RWSEM_INITIALIZER(name) \
 { RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, NULL, &(name).wait_front \
-	__RWSEM_DEBUG_INIT __RWSEM_DEBUG_MINIT(name) }
+	__RWSEM_DEBUG_INIT }
 
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
@@ -78,11 +63,6 @@
 #if RWSEM_DEBUG
 	sem->debug = 0;
 #endif
-#if RWSEM_DEBUG_MAGIC
-	sem->__magic = (long)&sem->__magic;
-	atomic_set(&sem->readers, 0);
-	atomic_set(&sem->writers, 0);
-#endif
 }
 
 /*
@@ -97,7 +77,11 @@
 		"1:\n\t"
 		".section .text.lock,\"ax\"\n"
 		"2:\n\t"
-		"  call      __rwsem_down_read_failed\n\t"
+		"  pushl     %%ecx\n\t"
+		"  pushl     %%edx\n\t"
+		"  call      rwsem_down_read_failed\n\t"
+		"  popl      %%edx\n\t"
+		"  popl      %%ecx\n\t"
 		"  jmp       1b\n"
 		".previous"
 		"# ending down_read\n\t"
@@ -116,17 +100,19 @@
 	tmp = RWSEM_ACTIVE_WRITE_BIAS;
 	__asm__ __volatile__(
 		"# beginning down_write\n\t"
-LOCK_PREFIX	"  xadd      %0,(%%eax)\n\t" /* subtract 0x00010001, returns the old value */
+LOCK_PREFIX	"  xadd      %0,(%%eax)\n\t" /* subtract 0x0000ffff, returns the old value */
 		"  testl     %0,%0\n\t" /* was the count 0 before? */
 		"  jnz       2f\n\t" /* jump if we weren't granted the lock */
 		"1:\n\t"
 		".section .text.lock,\"ax\"\n"
 		"2:\n\t"
-		"  call      __rwsem_down_write_failed\n\t"
+		"  pushl     %%ecx\n\t"
+		"  call      rwsem_down_write_failed\n\t"
+		"  popl      %%ecx\n\t"
 		"  jmp       1b\n"
 		".previous\n"
 		"# ending down_write"
-		: "+r"(tmp), "=m"(sem->count)
+		: "+d"(tmp), "=m"(sem->count)
 		: "a"(sem), "m"(sem->count)
 		: "memory");
 }
@@ -136,26 +122,23 @@
  */
 static inline void __up_read(struct rw_semaphore *sem)
 {
-	int tmp;
-
-	tmp = -RWSEM_ACTIVE_READ_BIAS;
 	__asm__ __volatile__(
 		"# beginning __up_read\n\t"
-LOCK_PREFIX	"  xadd      %0,(%%eax)\n\t" /* subtracts 1, returns the old value */
+LOCK_PREFIX	"  xadd      %%eax,(%%edx)\n\t" /* subtracts 1, returns the old value */
 		"  js        2f\n\t" /* jump if the lock is being waited upon */
 		"1:\n\t"
 		".section .text.lock,\"ax\"\n"
 		"2:\n\t"
-		"  decl      %0\n\t" /* xadd gave us the old count */
-		"  testl     %3,%0\n\t" /* do nothing if still outstanding active readers */
+		"  decl      %%eax\n\t" /* xadd gave us the old count */
+		"  testl     %3,%%eax\n\t" /* do nothing if still outstanding active readers */
 		"  jnz       1b\n\t"
-		"  call      __rwsem_wake\n\t"
+		"  call      rwsem_up_read_wake\n\t"
 		"  jmp       1b\n"
 		".previous\n"
 		"# ending __up_read\n"
-		: "+r"(tmp), "=m"(sem->count)
-		: "a"(sem), "i"(RWSEM_ACTIVE_MASK), "m"(sem->count)
-		: "memory");
+		: "=m"(sem->count)
+		: "d"(sem), "a"(-RWSEM_ACTIVE_READ_BIAS), "i"(RWSEM_ACTIVE_MASK), "m"(sem->count)
+		: "memory", "ecx");
 }
 
 /*
@@ -165,21 +148,32 @@
 {
 	__asm__ __volatile__(
 		"# beginning __up_write\n\t"
-LOCK_PREFIX	"  addl      %2,(%%eax)\n\t" /* adds 0x0000ffff */
-		"  js        2f\n\t" /* jump if the lock is being waited upon */
+LOCK_PREFIX	"  cmpxchgl  %%ecx,(%%edx)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */
+		"  jnz       2f\n\t" /* jump if the lock is being waited upon */
 		"1:\n\t"
 		".section .text.lock,\"ax\"\n"
 		"2:\n\t"
-		"  call      __rwsem_wake\n\t"
+		"  call      rwsem_up_write_wake\n\t"
 		"  jmp       1b\n"
 		".previous\n"
 		"# ending __up_write\n"
 		: "=m"(sem->count)
-		: "a"(sem), "i"(-RWSEM_ACTIVE_WRITE_BIAS), "m"(sem->count)
+		: "d"(sem), "a"(RWSEM_ACTIVE_WRITE_BIAS), "c"(0), "m"(sem->count)
 		: "memory");
 }
 
 /*
+ * implement atomic add functionality
+ */
+static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+{
+	__asm__ __volatile__(
+LOCK_PREFIX	"addl %1,%0"
+		:"=m"(sem->count)
+		:"ir"(delta), "m"(sem->count));
+}
+
+/*
  * implement exchange and add functionality
  */
 static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
@@ -187,9 +181,9 @@
 	int tmp = delta;
 
 	__asm__ __volatile__(
-		LOCK_PREFIX "xadd %0,(%1)"
-		: "+r"(tmp)
-		: "r"(sem)
+LOCK_PREFIX	"xadd %0,(%2)"
+		: "+r"(tmp), "=m"(sem->count)
+		: "r"(sem), "m"(sem->count)
 		: "memory");
 
 	return tmp+delta;
@@ -200,7 +194,31 @@
  */
 static inline __u16 rwsem_cmpxchgw(struct rw_semaphore *sem, __u16 old, __u16 new)
 {
-	return cmpxchg((__u16*)&sem->count,0,RWSEM_ACTIVE_BIAS);
+	__u16 tmp = old;
+
+	__asm__ __volatile__(
+LOCK_PREFIX	"cmpxchgw %w2,%3"
+		: "=a"(tmp), "=m"(sem->count)
+		: "r"(new), "m1"(sem->count), "a"(tmp)
+		: "memory");
+
+	return tmp;
+}
+
+/*
+ * implement compare and exchange functionality on the rw-semaphore count
+ */
+static inline signed long rwsem_cmpxchg(struct rw_semaphore *sem, signed long old, signed long new)
+{
+	signed long tmp = old;
+
+	__asm__ __volatile__(
+LOCK_PREFIX	"cmpxchgl %2,%3"
+		: "=a"(tmp), "=m"(sem->count)
+		: "r"(new), "m1"(sem->count), "a"(tmp)
+		: "memory");
+
+	return tmp;
 }
 
 #endif /* __KERNEL__ */
diff -uNr linux-2.4.4-pre6/include/asm-sparc64/rwsem.h linux/include/asm-sparc64/rwsem.h
--- linux-2.4.4-pre6/include/asm-sparc64/rwsem.h	Sat Apr 21 21:24:33 2001
+++ linux/include/asm-sparc64/rwsem.h	Sat Apr 21 23:12:22 2001
@@ -2,7 +2,7 @@
  * rwsem.h: R/W semaphores implemented using CAS
  *
  * Written by David S. Miller (davem@redhat.com), 2001.
- * Derived from asm-i386/rwsem-xadd.h
+ * Derived from asm-i386/rwsem.h
  */
 #ifndef _SPARC64_RWSEM_H
 #define _SPARC64_RWSEM_H
@@ -127,14 +127,15 @@
 		"save		%%sp, -160, %%sp\n\t"
 		"mov		%%g2, %%l2\n\t"
 		"mov		%%g3, %%l3\n\t"
+		" mov		%%g7, %%o0\n\t"
 		"call		%1\n\t"
-		" mov		%%g5, %%o0\n\t"
+		" mov		%%g5, %%o1\n\t"
 		"mov		%%l2, %%g2\n\t"
 		"ba,pt		%%xcc, 2b\n\t"
 		" restore	%%l3, %%g0, %%g3\n\t"
 		".previous\n\t"
 		"! ending __up_read"
-		: : "r" (sem), "i" (rwsem_wake),
+		: : "r" (sem), "i" (rwsem_up_read_wake),
 		    "i" (RWSEM_ACTIVE_MASK)
 		: "g1", "g5", "g7", "memory", "cc");
 }
@@ -145,31 +146,28 @@
 		"! beginning __up_write\n\t"
 		"sethi		%%hi(%2), %%g1\n\t"
 		"or		%%g1, %%lo(%2), %%g1\n"
-		"1:\tlduw	[%0], %%g5\n\t"
-		"sub		%%g5, %%g1, %%g7\n\t"
-		"cas		[%0], %%g5, %%g7\n\t"
-		"cmp		%%g5, %%g7\n\t"
-		"bne,pn		%%icc, 1b\n\t"
-		" sub		%%g7, %%g1, %%g7\n\t"
-		"cmp		%%g7, 0\n\t"
-		"bl,pn		%%icc, 3f\n\t"
+		"sub		%%g5, %%g5, %%g5\n\t"
+		"cas		[%0], %%g1, %%g5\n\t"
+		"cmp		%%g1, %%g5\n\t"
+		"bne,pn		%%icc, 1f\n\t"
 		" membar	#StoreStore\n"
 		"2:\n\t"
 		".subsection 2\n"
-		"3:\tmov	%0, %%g5\n\t"
+		"3:\tmov	%0, %%g1\n\t"
 		"save		%%sp, -160, %%sp\n\t"
 		"mov		%%g2, %%l2\n\t"
 		"mov		%%g3, %%l3\n\t"
+		"mov		%%g1, %%o0\n\t"
 		"call		%1\n\t"
-		" mov		%%g5, %%o0\n\t"
+		" mov		%%g5, %%o1\n\t"
 		"mov		%%l2, %%g2\n\t"
 		"ba,pt		%%xcc, 2b\n\t"
 		" restore	%%l3, %%g0, %%g3\n\t"
 		".previous\n\t"
 		"! ending __up_write"
-		: : "r" (sem), "i" (rwsem_wake),
+		: : "r" (sem), "i" (rwsem_up_write_wake),
 		    "i" (RWSEM_ACTIVE_WRITE_BIAS)
-		: "g1", "g5", "g7", "memory", "cc");
+		: "g1", "g5", "memory", "cc");
 }
 
 static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
@@ -191,6 +189,8 @@
 	return tmp + delta;
 }
 
+#define rwsem_atomic_add rwsem_atomic_update
+
 static inline __u16 rwsem_cmpxchgw(struct rw_semaphore *sem, __u16 __old, __u16 __new)
 {
 	u32 old = (sem->count & 0xffff0000) | (u32) __old;
@@ -212,6 +212,11 @@
 		goto again;
 
 	return prev & 0xffff;
+}
+
+static inline signed long rwsem_cmpxchg(struct rw_semaphore *sem, signed long old, signed long new)
+{
+	return cmpxchg(&sem->count,old,new);
 }
 
 #endif /* __KERNEL__ */
diff -uNr linux-2.4.4-pre6/include/linux/rwsem-spinlock.h linux/include/linux/rwsem-spinlock.h
--- linux-2.4.4-pre6/include/linux/rwsem-spinlock.h	Sat Apr 21 21:24:33 2001
+++ linux/include/linux/rwsem-spinlock.h	Mon Apr 23 21:06:42 2001
@@ -1,6 +1,8 @@
 /* rwsem-spinlock.h: fallback C implementation
  *
  * Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ * - Derived partially from ideas by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
  */
 
 #ifndef _LINUX_RWSEM_SPINLOCK_H
@@ -11,6 +13,7 @@
 #endif
 
 #include <linux/spinlock.h>
+#include <linux/list.h>
 
 #ifdef __KERNEL__
 
@@ -19,27 +22,19 @@
 struct rwsem_waiter;
 
 /*
- * the semaphore definition
+ * the rw-semaphore definition
+ * - if activity is 0 then there are no active readers or writers
+ * - if activity is +ve then that is the number of active readers
+ * - if activity is -1 then there is one active writer
+ * - if wait_list is not empty, then there are processes waiting for the semaphore
  */
 struct rw_semaphore {
-	signed long		count;
-#define RWSEM_UNLOCKED_VALUE		0x00000000
-#define RWSEM_ACTIVE_BIAS		0x00000001
-#define RWSEM_ACTIVE_MASK		0x0000ffff
-#define RWSEM_WAITING_BIAS		(-0x00010000)
-#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+	__s32			activity;
 	spinlock_t		wait_lock;
-	struct rwsem_waiter	*wait_front;
-	struct rwsem_waiter	**wait_back;
+	struct list_head	wait_list;
 #if RWSEM_DEBUG
 	int			debug;
 #endif
-#if RWSEM_DEBUG_MAGIC
-	long			__magic;
-	atomic_t		readers;
-	atomic_t		writers;
-#endif
 };
 
 /*
@@ -50,119 +45,18 @@
 #else
 #define __RWSEM_DEBUG_INIT	/* */
 #endif
-#if RWSEM_DEBUG_MAGIC
-#define __RWSEM_DEBUG_MINIT(name)	, (int)&(name).__magic, ATOMIC_INIT(0), ATOMIC_INIT(0)
-#else
-#define __RWSEM_DEBUG_MINIT(name)	/* */
-#endif
 
 #define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, NULL, &(name).wait_front \
-	__RWSEM_DEBUG_INIT __RWSEM_DEBUG_MINIT(name) }
+{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
 
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-static inline void init_rwsem(struct rw_semaphore *sem)
-{
-	sem->count = RWSEM_UNLOCKED_VALUE;
-	spin_lock_init(&sem->wait_lock);
-	sem->wait_front = NULL;
-	sem->wait_back = &sem->wait_front;
-#if RWSEM_DEBUG
-	sem->debug = 0;
-#endif
-#if RWSEM_DEBUG_MAGIC
-	sem->__magic = (long)&sem->__magic;
-	atomic_set(&sem->readers, 0);
-	atomic_set(&sem->writers, 0);
-#endif
-}
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
-	int count;
-	spin_lock(&sem->wait_lock);
-	sem->count += RWSEM_ACTIVE_READ_BIAS;
-	count = sem->count;
-	spin_unlock(&sem->wait_lock);
-	if (count<0)
-		rwsem_down_read_failed(sem);
-}
-
-/*
- * lock for writing
- */
-static inline void __down_write(struct rw_semaphore *sem)
-{
-	int count;
-	spin_lock(&sem->wait_lock);
-	count = sem->count;
-	sem->count += RWSEM_ACTIVE_WRITE_BIAS;
-	spin_unlock(&sem->wait_lock);
-	if (count)
-		rwsem_down_write_failed(sem);
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
-	int count;
-	spin_lock(&sem->wait_lock);
-	count = sem->count;
-	sem->count -= RWSEM_ACTIVE_READ_BIAS;
-	spin_unlock(&sem->wait_lock);
-	if (count<0 && !((count-RWSEM_ACTIVE_READ_BIAS)&RWSEM_ACTIVE_MASK))
-		rwsem_wake(sem);
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
-	int count;
-	spin_lock(&sem->wait_lock);
-	sem->count -= RWSEM_ACTIVE_WRITE_BIAS;
-	count = sem->count;
-	spin_unlock(&sem->wait_lock);
-	if (count<0)
-		rwsem_wake(sem);
-}
-
-/*
- * implement exchange and add functionality
- * - only called when spinlock is already held
- */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
-{
-	int count;
-
-	sem->count += delta;
-	count = sem->count;
-
-	return count;
-}
-
-/*
- * implement compare and exchange functionality on the rw-semaphore count LSW
- * - only called by __rwsem_do_wake(), so spinlock is already held when called
- */
-static inline __u16 rwsem_cmpxchgw(struct rw_semaphore *sem, __u16 old, __u16 new)
-{
-	__u16 prev;
-
-	prev = sem->count & RWSEM_ACTIVE_MASK;
-	if (prev==old)
-		sem->count = (sem->count & ~RWSEM_ACTIVE_MASK) | new;
-
-	return prev;
-}
+extern void FASTCALL(init_rwsem(struct rw_semaphore *sem));
+extern void FASTCALL(__down_read(struct rw_semaphore *sem));
+extern void FASTCALL(__down_write(struct rw_semaphore *sem));
+extern void FASTCALL(__up_read(struct rw_semaphore *sem));
+extern void FASTCALL(__up_write(struct rw_semaphore *sem));
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_SPINLOCK_H */
diff -uNr linux-2.4.4-pre6/include/linux/rwsem.h linux/include/linux/rwsem.h
--- linux-2.4.4-pre6/include/linux/rwsem.h	Sat Apr 21 21:24:33 2001
+++ linux/include/linux/rwsem.h	Mon Apr 23 20:37:37 2001
@@ -34,7 +34,6 @@
 #include <linux/linkage.h>
 
 #define RWSEM_DEBUG 0
-#define RWSEM_DEBUG_MAGIC 0
 
 #ifdef __KERNEL__
 
@@ -47,11 +46,12 @@
 /* defined contention handler functions for the generic case
  * - these are also used for the exchange-and-add based algorithm
  */
-#if defined(CONFIG_RWSEM_GENERIC_SPINLOCK) || defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
 /* we use FASTCALL convention for the helpers */
 extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
 extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *sem));
+extern void FASTCALL(rwsem_up_read_wake(signed long, struct rw_semaphore *));
+extern void FASTCALL(rwsem_up_write_wake(signed long, struct rw_semaphore *));
 #endif
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -74,20 +74,7 @@
 static inline void down_read(struct rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering down_read");
-
-#if RWSEM_DEBUG_MAGIC
-	if (sem->__magic != (long)&sem->__magic)
-		BUG();
-#endif
-
 	__down_read(sem);
-
-#if RWSEM_DEBUG_MAGIC
-	if (atomic_read(&sem->writers))
-		BUG();
-	atomic_inc(&sem->readers);
-#endif
-
 	rwsemtrace(sem,"Leaving down_read");
 }
 
@@ -97,22 +84,7 @@
 static inline void down_write(struct rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering down_write");
-
-#if RWSEM_DEBUG_MAGIC
-	if (sem->__magic != (long)&sem->__magic)
-		BUG();
-#endif
-
 	__down_write(sem);
-
-#if RWSEM_DEBUG_MAGIC
-	if (atomic_read(&sem->writers))
-		BUG();
-	if (atomic_read(&sem->readers))
-		BUG();
-	atomic_inc(&sem->writers);
-#endif
-
 	rwsemtrace(sem,"Leaving down_write");
 }
 
@@ -122,14 +94,7 @@
 static inline void up_read(struct rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering up_read");
-
-#if RWSEM_DEBUG_MAGIC
-	if (atomic_read(&sem->writers))
-		BUG();
-	atomic_dec(&sem->readers);
-#endif
 	__up_read(sem);
-
 	rwsemtrace(sem,"Leaving up_read");
 }
 
@@ -139,16 +104,7 @@
 static inline void up_write(struct rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering up_write");
-
-#if RWSEM_DEBUG_MAGIC
-	if (atomic_read(&sem->readers))
-		BUG();
-	if (atomic_read(&sem->writers) != 1)
-		BUG();
-	atomic_dec(&sem->writers);
-#endif
 	__up_write(sem);
-
 	rwsemtrace(sem,"Leaving up_write");
 }
 
diff -uNr linux-2.4.4-pre6/lib/Makefile linux/lib/Makefile
--- linux-2.4.4-pre6/lib/Makefile	Sat Apr 21 21:24:33 2001
+++ linux/lib/Makefile	Sun Apr 22 00:07:33 2001
@@ -8,14 +8,12 @@
 
 L_TARGET := lib.a
 
-export-objs := cmdline.o
+export-objs := cmdline.o rwsem-spinlock.o rwsem.o
 
 obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o
 
-ifneq ($(CONFIG_RWSEM_GENERIC_SPINLOCK)$(CONFIG_RWSEM_XCHGADD_ALGORITHM),nn)
-export-objs += rwsem.o
-obj-y += rwsem.o
-endif
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 
 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 
   obj-y += dec_and_lock.o
diff -uNr linux-2.4.4-pre6/lib/rwsem-spinlock.c linux/lib/rwsem-spinlock.c
--- linux-2.4.4-pre6/lib/rwsem-spinlock.c	Thu Jan  1 01:00:00 1970
+++ linux/lib/rwsem-spinlock.c	Mon Apr 23 21:06:46 2001
@@ -0,0 +1,239 @@
+/* rwsem-spinlock.c: R/W semaphores: contention handling functions for generic spinlock
+ *                                   implementation
+ *
+ * Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
+ */
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+struct rwsem_waiter {
+	struct list_head	list;
+	struct task_struct	*task;
+	unsigned int		flags;
+#define RWSEM_WAITING_FOR_READ	0x00000001
+#define RWSEM_WAITING_FOR_WRITE	0x00000002
+};
+
+#if RWSEM_DEBUG
+void rwsemtrace(struct rw_semaphore *sem, const char *str)
+{
+	if (sem->debug)
+		printk("[%d] %s({%d,%d})\n",
+		       current->pid,str,sem->activity,list_empty(&sem->wait_list)?0:1);
+}
+#endif
+
+/*
+ * initialise the semaphore
+ */
+void init_rwsem(struct rw_semaphore *sem)
+{
+	sem->activity = 0;
+	spin_lock_init(&sem->wait_lock);
+	INIT_LIST_HEAD(&sem->wait_list);
+#if RWSEM_DEBUG
+	sem->debug = 0;
+#endif
+}
+
+/*
+ * handle the lock being released whilst there are processes blocked on it that can now run
+ * - if we come here, then:
+ *   - the 'active count' _reached_ zero
+ *   - the 'waiting count' is non-zero
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having flags zeroised
+ */
+static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem)
+{
+	struct rwsem_waiter *waiter;
+	int woken;
+
+	rwsemtrace(sem,"Entering __rwsem_do_wake");
+
+	waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list);
+
+	/* try to grant a single write lock if there's a writer at the front of the queue
+	 * - we leave the 'waiting count' incremented to signify potential contention
+	 */
+	if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
+		sem->activity = -1;
+		list_del(&waiter->list);
+		waiter->flags = 0;
+		wake_up_process(waiter->task);
+		goto out;
+	}
+
+	/* grant an infinite number of read locks to the readers at the front of the queue */
+	woken = 0;
+	do {
+		list_del(&waiter->list);
+		waiter->flags = 0;
+		wake_up_process(waiter->task);
+		woken++;
+		if (list_empty(&sem->wait_list))
+			break;
+		waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list);
+	} while (waiter->flags&RWSEM_WAITING_FOR_READ);
+
+	sem->activity += woken;
+
+ out:
+	rwsemtrace(sem,"Leaving __rwsem_do_wake");
+	return sem;
+}
+
+/*
+ * wake a single writer
+ */
+static inline struct rw_semaphore *__rwsem_wake_one_writer(struct rw_semaphore *sem)
+{
+	struct rwsem_waiter *waiter;
+
+	sem->activity = -1;
+
+	waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list);
+	list_del(&waiter->list);
+
+	waiter->flags = 0;
+	wake_up_process(waiter->task);
+	return sem;
+}
+
+/*
+ * get a read lock on the semaphore
+ */
+void __down_read(struct rw_semaphore *sem)
+{
+	struct rwsem_waiter waiter;
+	struct task_struct *tsk;
+
+	rwsemtrace(sem,"Entering __down_read");
+
+	spin_lock(&sem->wait_lock);
+
+	if (sem->activity>=0 && list_empty(&sem->wait_list)) {
+		/* granted */
+		sem->activity++;
+		spin_unlock(&sem->wait_lock);
+		goto out;
+	}
+
+	tsk = current;
+	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
+
+	/* set up my own style of waitqueue */
+	waiter.task = tsk;
+	waiter.flags = RWSEM_WAITING_FOR_READ;
+
+	list_add_tail(&waiter.list,&sem->wait_list);
+
+	/* we don't need to touch the semaphore struct anymore */
+	spin_unlock(&sem->wait_lock);
+
+	/* wait to be given the lock */
+	for (;;) {
+		if (!waiter.flags)
+			break;
+		schedule();
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	}
+
+	tsk->state = TASK_RUNNING;
+
+ out:
+	rwsemtrace(sem,"Leaving __down_read");
+}
+
+/*
+ * get a write lock on the semaphore
+ * - note that we increment the waiting count anyway to indicate an exclusive lock
+ */
+void __down_write(struct rw_semaphore *sem)
+{
+	struct rwsem_waiter waiter;
+	struct task_struct *tsk;
+
+	rwsemtrace(sem,"Entering __down_write");
+
+	spin_lock(&sem->wait_lock);
+
+	if (sem->activity==0 && list_empty(&sem->wait_list)) {
+		/* granted */
+		sem->activity = -1;
+		spin_unlock(&sem->wait_lock);
+		goto out;
+	}
+
+	tsk = current;
+	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
+
+	/* set up my own style of waitqueue */
+	waiter.task = tsk;
+	waiter.flags = RWSEM_WAITING_FOR_WRITE;
+
+	list_add_tail(&waiter.list,&sem->wait_list);
+
+	/* we don't need to touch the semaphore struct anymore */
+	spin_unlock(&sem->wait_lock);
+
+	/* wait to be given the lock */
+	for (;;) {
+		if (!waiter.flags)
+			break;
+		schedule();
+		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+	}
+
+	tsk->state = TASK_RUNNING;
+
+ out:
+	rwsemtrace(sem,"Leaving __down_write");
+}
+
+/*
+ * release a read lock on the semaphore
+ */
+void __up_read(struct rw_semaphore *sem)
+{
+	rwsemtrace(sem,"Entering __up_read");
+
+	spin_lock(&sem->wait_lock);
+
+	if (--sem->activity==0 && !list_empty(&sem->wait_list))
+		sem = __rwsem_wake_one_writer(sem);
+
+	spin_unlock(&sem->wait_lock);
+
+	rwsemtrace(sem,"Leaving __up_read");
+}
+
+/*
+ * release a write lock on the semaphore
+ */
+void __up_write(struct rw_semaphore *sem)
+{
+	rwsemtrace(sem,"Entering __up_write");
+
+	spin_lock(&sem->wait_lock);
+
+	sem->activity = 0;
+	if (!list_empty(&sem->wait_list))
+		sem = __rwsem_do_wake(sem);
+
+	spin_unlock(&sem->wait_lock);
+
+	rwsemtrace(sem,"Leaving __up_write");
+}
+
+EXPORT_SYMBOL(init_rwsem);
+EXPORT_SYMBOL(__down_read);
+EXPORT_SYMBOL(__down_write);
+EXPORT_SYMBOL(__up_read);
+EXPORT_SYMBOL(__up_write);
+#if RWSEM_DEBUG
+EXPORT_SYMBOL(rwsemtrace);
+#endif
diff -uNr linux-2.4.4-pre6/lib/rwsem.c linux/lib/rwsem.c
--- linux-2.4.4-pre6/lib/rwsem.c	Sat Apr 21 21:24:33 2001
+++ linux/lib/rwsem.c	Sun Apr 22 14:29:53 2001
@@ -14,57 +14,36 @@
 #define RWSEM_WAITING_FOR_READ	0x00000001
 #define RWSEM_WAITING_FOR_WRITE	0x00000002
 };
-#define RWSEM_WAITER_MAGIC 0x52575345
-
-static struct rw_semaphore *FASTCALL(__rwsem_do_wake(struct rw_semaphore *sem));
 
 #if RWSEM_DEBUG
 void rwsemtrace(struct rw_semaphore *sem, const char *str)
 {
 	if (sem->debug)
-		printk("[%d] %s(count=%08lx)\n",current->pid,str,sem->count);
+		printk("[%d] %s({%08lx})\n",current->pid,str,sem->count);
 }
 #endif
 
 /*
  * handle the lock being released whilst there are processes blocked on it that can now run
+ * - the caller can specify an adjustment that will need to be made to the semaphore count to
+ *   help reduce the number of atomic operations invoked
  * - if we come here, then:
- *   - the 'active part' of the count (&0x0000ffff) reached zero (but may no longer be zero)
+ *   - the 'active part' of the count (&0x0000ffff) reached zero but has been re-incremented
  *   - the 'waiting part' of the count (&0xffff0000) is negative (and will still be so)
- *   - the spinlock must be held before entry
- *   - woken process blocks are discarded from the list after having flags zeroised
+ * - the spinlock must be held by the caller
+ * - woken process blocks are discarded from the list after having flags zeroised
  */
-static struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem)
+static inline struct rw_semaphore *__rwsem_do_wake(int adjustment, struct rw_semaphore *sem)
 {
 	struct rwsem_waiter *waiter, *next;
 	int woken, loop;
 
 	rwsemtrace(sem,"Entering __rwsem_do_wake");
 
-	/* try to grab an 'activity' marker
-	 * - need to make sure two copies of rwsem_wake() don't do this for two separate processes
-	 *   simultaneously
-	 * - be horribly naughty, and only deal with the LSW of the atomic counter
-	 */
-	if (rwsem_cmpxchgw(sem,0,RWSEM_ACTIVE_BIAS)!=0) {
-		rwsemtrace(sem,"__rwsem_do_wake: abort wakeup due to renewed activity");
-		goto out;
-	}
-
-	/* check the wait queue is populated */
 	waiter = sem->wait_front;
 
-	if (__builtin_expect(!waiter,0)) {
-		printk("__rwsem_do_wake(): wait_list unexpectedly empty\n");
-		BUG();
-		goto out;
-	}
-
-	if (__builtin_expect(!waiter->flags,0)) {
-		printk("__rwsem_do_wake(): wait_list front apparently not waiting\n");
-		BUG();
-		goto out;
-	}
+	if (!waiter)
+	  goto list_unexpectedly_empty;
 
 	next = NULL;
 
@@ -73,6 +52,8 @@
 	 *   incremented by 0x00010000
 	 */
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
+		if (adjustment)
+			rwsem_atomic_add(adjustment,sem);
 		next = waiter->next;
 		waiter->flags = 0;
 		wake_up_process(waiter->task);
@@ -92,7 +73,8 @@
 	loop = woken;
 	woken *= RWSEM_ACTIVE_BIAS-RWSEM_WAITING_BIAS;
 	woken -= RWSEM_ACTIVE_BIAS;
-	rwsem_atomic_update(woken,sem);
+	woken += adjustment;
+	rwsem_atomic_add(woken,sem);
 
 	waiter = sem->wait_front;
 	for (; loop>0; loop--) {
@@ -109,6 +91,12 @@
  out:
 	rwsemtrace(sem,"Leaving __rwsem_do_wake");
 	return sem;
+
+ list_unexpectedly_empty:
+	printk("__rwsem_do_wake(): wait_list unexpectedly empty\n");
+	printk("[%d] %p = { %08lx })\n",current->pid,sem,sem->count);
+	BUG();
+	goto out;
 }
 
 /*
@@ -123,7 +111,7 @@
 	signed long count;
 
 	rwsemtrace(sem,"Entering rwsem_down_read_failed");
-	
+
 	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
@@ -141,9 +129,11 @@
 
 	/* if there are no longer active locks, wake the front queued process(es) up
 	 * - it might even be this process, since the waker takes a more active part
+	 * - should only enter __rwsem_do_wake() only on a transition 0->1 in the LSW
 	 */
 	if (!(count & RWSEM_ACTIVE_MASK))
-		__rwsem_do_wake(sem);
+		if (rwsem_cmpxchgw(sem,0,RWSEM_ACTIVE_BIAS)==0)
+			__rwsem_do_wake(0,sem);
 
 	spin_unlock(&sem->wait_lock);
 
@@ -189,9 +179,11 @@
 
 	/* if there are no longer active locks, wake the front queued process(es) up
 	 * - it might even be this process, since the waker takes a more active part
+	 * - should only enter __rwsem_do_wake() only on a transition 0->1 in the LSW
 	 */
 	if (!(count & RWSEM_ACTIVE_MASK))
-		__rwsem_do_wake(sem);
+		if (rwsem_cmpxchgw(sem,0,RWSEM_ACTIVE_BIAS)==0)
+			__rwsem_do_wake(0,sem);
 
 	spin_unlock(&sem->wait_lock);
 
@@ -210,25 +202,64 @@
 }
 
 /*
- * spinlock grabbing wrapper for __rwsem_do_wake()
+ * handle up_read() finding a waiter on the semaphore
+ * - up_read has decremented the active part of the count if we come here
  */
-struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
+void rwsem_up_read_wake(signed long count, struct rw_semaphore *sem)
 {
-	rwsemtrace(sem,"Entering rwsem_wake");
+	rwsemtrace(sem,"Entering rwsem_up_read_wake");
 
 	spin_lock(&sem->wait_lock);
 
-	sem = __rwsem_do_wake(sem);
+	/* need to wake up a waiter unless the semaphore has gone active again
+	 * - should only enter __rwsem_do_wake() only on a transition 0->1 in the LSW
+	 */
+	if (rwsem_cmpxchgw(sem,0,RWSEM_ACTIVE_BIAS)==0)
+		sem = __rwsem_do_wake(0,sem);
 
 	spin_unlock(&sem->wait_lock);
 
-	rwsemtrace(sem,"Leaving rwsem_wake");
-	return sem;
+	rwsemtrace(sem,"Leaving rwsem_up_read_wake");
+}
+
+/*
+ * handle up_write() finding a waiter on the semaphore
+ * - up_write has not modified the count if we come here
+ */
+void rwsem_up_write_wake(signed long count, struct rw_semaphore *sem)
+{
+	signed long new;
+
+	rwsemtrace(sem,"Entering rwsem_up_write_wake");
+
+	spin_lock(&sem->wait_lock);
+
+ try_again:
+	/* if the active part of the count is 1, we should perform a wake-up, else we should
+	 * decrement the count and return
+	 */
+	if ((count&RWSEM_ACTIVE_MASK)==RWSEM_ACTIVE_BIAS) {
+		sem = __rwsem_do_wake(-RWSEM_WAITING_BIAS,sem);
+	}
+	else {
+		/* tricky - we mustn't return the active part of the count to 0 */
+		new = count - RWSEM_ACTIVE_WRITE_BIAS;
+		new = rwsem_cmpxchg(sem,count,new);
+		if (count!=new) {
+			count = new;
+			goto try_again;
+		}
+	}
+
+	spin_unlock(&sem->wait_lock);
+
+	rwsemtrace(sem,"Leaving rwsem_up_write_wake");
 }
 
 EXPORT_SYMBOL(rwsem_down_read_failed);
 EXPORT_SYMBOL(rwsem_down_write_failed);
-EXPORT_SYMBOL(rwsem_wake);
+EXPORT_SYMBOL(rwsem_up_read_wake);
+EXPORT_SYMBOL(rwsem_up_write_wake);
 #if RWSEM_DEBUG
 EXPORT_SYMBOL(rwsemtrace);
 #endif

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] rw_semaphores, optimisations try #3
  2001-04-23 20:35 [PATCH] rw_semaphores, optimisations try #3 D.W.Howells
@ 2001-04-23 21:34 ` Andrea Arcangeli
  2001-04-24  4:56   ` rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3] Andrea Arcangeli
  2001-04-23 22:23 ` [PATCH] rw_semaphores, optimisations try #3 Linus Torvalds
  1 sibling, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-23 21:34 UTC (permalink / raw)
  To: D . W . Howells; +Cc: torvalds, linux-kernel, dhowells, davem

[-- Attachment #1: Type: text/plain, Size: 1623 bytes --]

On Mon, Apr 23, 2001 at 09:35:34PM +0100, D . W . Howells wrote:
> This patch (made against linux-2.4.4-pre6) makes a number of changes to the
> rwsem implementation:
> 
>  (1) Everything in try #2
> 
> plus
> 
>  (2) Changes proposed by Linus for the generic semaphore code.
> 
>  (3) Ideas from Andrea and how he implemented his semaphores.

I benchmarked try3 on top of pre6 and I get this:

----------------------------------------------
RWSEM_GENERIC_SPINLOCK y in rwsem-2.4.4-pre6 + your latest #try3

rw

reads taken: 5842496
writes taken: 3016649
reads taken: 5823381
writes taken: 3006773

r1

reads taken: 13309316
reads taken: 13311722

r2

reads taken: 5010534
reads taken: 5023185

ro

reads taken: 3850228
reads taken: 3845954

w1

writes taken: 13012701
writes taken: 13021716

wo

writes taken: 1825789
writes taken: 1802560

----------------------------------------------
RWSEM_XCHGADD y in rwsem-2.4.4-pre6 + your latest #try3

rw

reads taken: 5789542
writes taken: 2989478
reads taken: 5801777
writes taken: 2995669

r1

reads taken: 16922653
reads taken: 16946132

r2

reads taken: 5650211
reads taken: 5647272

ro

reads taken: 4956250
reads taken: 4959828

w1

writes taken: 15431139
writes taken: 15439790

wo

writes taken: 813756
writes taken: 816005

graph updated attached. so in short my fast path is still quite faster (r1/w1),
slow path is comparable now (I still win in all tests but wo which is probably
the less interesting one in real life [write contention]). I still have room to
improve the wo test [write contention] by spending more icache btw but it
probably doesn't worth.

Andrea

[-- Attachment #2: rwsem2.png --]
[-- Type: image/png, Size: 7414 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] rw_semaphores, optimisations try #3
  2001-04-23 20:35 [PATCH] rw_semaphores, optimisations try #3 D.W.Howells
  2001-04-23 21:34 ` Andrea Arcangeli
@ 2001-04-23 22:23 ` Linus Torvalds
  2001-04-24 10:05   ` David Howells
  1 sibling, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2001-04-23 22:23 UTC (permalink / raw)
  To: D.W.Howells; +Cc: linux-kernel, dhowells, andrea, davem



On Mon, 23 Apr 2001, D.W.Howells wrote:
>
> Linus, you suggested that the generic list handling stuff would be faster (2
> unconditional stores) than mine (1 unconditional store and 1 conditional
> store and branch to jump round it). You are both right and wrong. The generic
> code does two stores per _process_ woken up (list_del) mine does the 1 or 2
> stores per _batch_ of processes woken up. So the generic way is better when
> the queue is an even mixture of readers or writers and my way is better when
> there are far greater numbers of waiting readers. However, that said, there
> is not much in it either way, so I've reverted it to the generic list stuff.

Note that the generic list structure already has support for "batching".
It only does it for multiple adds right now (see the "list_splice"
merging code), but there is nothing to stop people from doing it for
multiple deletions too. The code is something like

	static inline void list_remove_between(x,y)
	{
		n->next = y;
		y->prev = x;
	}

and notice how it's still just two unconditional stores for _any_ number
of deleted entries.

Anyway, I've already applied your #2, how about a patch relative to that?

		Linus


^ permalink raw reply	[flat|nested] 19+ messages in thread

* rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-23 21:34 ` Andrea Arcangeli
@ 2001-04-24  4:56   ` Andrea Arcangeli
  2001-04-24  8:56     ` David Howells
  0 siblings, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24  4:56 UTC (permalink / raw)
  To: D . W . Howells; +Cc: torvalds, linux-kernel, dhowells, davem

[-- Attachment #1: Type: text/plain, Size: 6546 bytes --]

On Mon, Apr 23, 2001 at 11:34:35PM +0200, Andrea Arcangeli wrote:
> On Mon, Apr 23, 2001 at 09:35:34PM +0100, D . W . Howells wrote:
> > This patch (made against linux-2.4.4-pre6) makes a number of changes to the
> > rwsem implementation:
> > 
> >  (1) Everything in try #2
> > 
> > plus
> > 
> >  (2) Changes proposed by Linus for the generic semaphore code.
> > 
> >  (3) Ideas from Andrea and how he implemented his semaphores.
> 
> I benchmarked try3 on top of pre6 and I get this:
> 
> ----------------------------------------------
> RWSEM_GENERIC_SPINLOCK y in rwsem-2.4.4-pre6 + your latest #try3
> 
> rw
> 
> reads taken: 5842496
> writes taken: 3016649
> reads taken: 5823381
> writes taken: 3006773
> 
> r1
> 
> reads taken: 13309316
> reads taken: 13311722
> 
> r2
> 
> reads taken: 5010534
> reads taken: 5023185
> 
> ro
> 
> reads taken: 3850228
> reads taken: 3845954
> 
> w1
> 
> writes taken: 13012701
> writes taken: 13021716
> 
> wo
> 
> writes taken: 1825789
> writes taken: 1802560
> 
> ----------------------------------------------
> RWSEM_XCHGADD y in rwsem-2.4.4-pre6 + your latest #try3
> 
> rw
> 
> reads taken: 5789542
> writes taken: 2989478
> reads taken: 5801777
> writes taken: 2995669
> 
> r1
> 
> reads taken: 16922653
> reads taken: 16946132
> 
> r2
> 
> reads taken: 5650211
> reads taken: 5647272
> 
> ro
> 
> reads taken: 4956250
> reads taken: 4959828
> 
> w1
> 
> writes taken: 15431139
> writes taken: 15439790
> 
> wo
> 
> writes taken: 813756
> writes taken: 816005
> 
> graph updated attached. so in short my fast path is still quite faster (r1/w1),
> slow path is comparable now (I still win in all tests but wo which is probably
> the less interesting one in real life [write contention]). I still have room to
> improve the wo test [write contention] by spending more icache btw but it
> probably doesn't worth.

Ok I finished now my asm optimized rwsemaphores and I improved a little my
spinlock based one but without touching the icache usage.

Here it is the code against pre6 vanilla:

	ftp://ftp.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.4/2.4.4pre6/rwsem-8

here the same but against David's try#2:

	ftp://ftp.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.4/2.4.4pre6/rwsem-8-against-dh-try2

and here again the same but against David's latest try#3:

	ftp://ftp.us.kernel.org/pub/linux/kernel/people/andrea/patches/v2.4/2.4.4pre6/rwsem-8-against-dh-try3

The main advantage of my rewrite are:

-	my x86 version is visibly faster than yours in the write fast path and it
	also saves icache because it's smaller (read fast path is reproducibly a bit faster too)

-	my spinlock version is visibly faster in both read and write fast path and still
	has smaller icache footprint (of course my version is completly out of line too
	and I'm comparing apples to apples)

-	at least to me the slow path of my code seems to be much simpler
	than yours (and I can actually understand it ;), for example I don't feel
	any need of any atomic exchange anywhere, I admit now comparing
	my code to yours I still don't know why you need it

-	the common code automatically extends itself to support 2^32 concurrent sleepers
	on 64bit archs

-	there is no code duplication at all in supporting xchgadd common code logic
	for other archs (and I prepared a skeleton to fill for the alpha)

Only disavantage is that sparc64 will have to fixup some bit again.

Here the numbers of the above patches on top of vanilla 2.4.4-pre6:

----------------------------------------------
RWSEM_XCHGADD y in rwsem-2.4.4-pre6 + my latest rwsem

rw

reads taken: 5736978
writes taken: 2962325
reads taken: 5799163
writes taken: 2994404

r1

reads taken: 17044842
reads taken: 17053405

r2

reads taken: 5603085
reads taken: 5601647

ro

reads taken: 4831655
reads taken: 4833518

w1

writes taken: 16064773
writes taken: 16037018

wo

writes taken: 860791
writes taken: 864103

----------------------------------------------
RWSEM_SPINLOCK y in rwsem-2.4.4-pre6 + my latest rwsem

rw

reads taken: 6061713
writes taken: 3129801
reads taken: 6099046
writes taken: 3148951

r1

reads taken: 14251500
reads taken: 14265389

r2

reads taken: 4972932
reads taken: 4936267

ro

reads taken: 4253814
reads taken: 4253432

w1

writes taken: 13652385
writes taken: 13632914

wo

writes taken: 1751857
writes taken: 1753608

I draw a graph with the above numbers compared to the previous quoted numbers I
generated a few hours ago with your latest try#3 (attached). (W1 and R1 are
respectively the benchmark of the write fast path and read fast path, only one
writer and only one reader and they're of course the most interesting ones)

So I'd suggest Linus to apply one of my above -8 patches for pre7. (I hope I
won't need any secondary try)

this is a diffstat of the rwsem-8 patch:

 arch/alpha/config.in              |    1 
 arch/alpha/kernel/alpha_ksyms.c   |    4 
 arch/arm/config.in                |    2 
 arch/cris/config.in               |    1 
 arch/i386/config.in               |    4 
 arch/ia64/config.in               |    1 
 arch/m68k/config.in               |    1 
 arch/mips/config.in               |    1 
 arch/mips64/config.in             |    1 
 arch/parisc/config.in             |    1 
 arch/ppc/config.in                |    1 
 arch/ppc/kernel/ppc_ksyms.c       |    2 
 arch/s390/config.in               |    1 
 arch/s390x/config.in              |    1 
 arch/sh/config.in                 |    1 
 arch/sparc/config.in              |    1 
 arch/sparc64/config.in            |    4 
 include/asm-alpha/compiler.h      |    9 -
 include/asm-alpha/rwsem_xchgadd.h |   27 +++
 include/asm-alpha/semaphore.h     |    2 
 include/asm-i386/rwsem.h          |  225 --------------------------------
 include/asm-i386/rwsem_xchgadd.h  |   93 +++++++++++++
 include/asm-sparc64/rwsem.h       |   35 ++---
 include/linux/compiler.h          |   13 +
 include/linux/rwsem-spinlock.h    |   57 --------
 include/linux/rwsem.h             |  106 ---------------
 include/linux/rwsem_spinlock.h    |   61 ++++++++
 include/linux/rwsem_xchgadd.h     |   96 +++++++++++++
 include/linux/sched.h             |    2 
 lib/Makefile                      |    6 
 lib/rwsem-spinlock.c              |  245 -----------------------------------
 lib/rwsem.c                       |  265 --------------------------------------
 lib/rwsem_spinlock.c              |  124 +++++++++++++++++
 lib/rwsem_xchgadd.c               |   88 ++++++++++++
 35 files changed, 531 insertions(+), 951 deletions(-)

Andrea

[-- Attachment #2: rwsem-8.png --]
[-- Type: image/png, Size: 6697 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24  4:56   ` rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3] Andrea Arcangeli
@ 2001-04-24  8:56     ` David Howells
  2001-04-24  9:49       ` Andrea Arcangeli
  2001-04-24 10:17       ` Andrea Arcangeli
  0 siblings, 2 replies; 19+ messages in thread
From: David Howells @ 2001-04-24  8:56 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: torvalds, linux-kernel, davem

> Ok I finished now my asm optimized rwsemaphores and I improved a little my
> spinlock based one but without touching the icache usage.

And I can break it. There's a very good reason the I changed __up_write() to
use CMPXCHG instead of SUBL. I found a sequence of operations that locked up
on this.

Unfortunately, I can't remember what it was. I do have it written down at
home, I think, so I see about sending it to you later.

>
> The main advantage of my rewrite are:
>
> -	my x86 version is visibly faster than yours in the write fast path and
> 	it also saves icache because it's smaller

That's because your fast write path is wrong. You can get away without
clobbering EDX and ECX, which I can't.

> (read fast path is reproducibly a bit faster too)

| +static inline void __down_read(struct rw_semaphore *sem)
...
| +			     : "+m" (sem->count), "+a" (sem)

>From what I've been told, you're lucky here... you avoid a pipeline stall
between whatever loads sem into EAX and the INCL that increments what it
points to because the "+a" constraint says that EAX will be changed. This
means that the compiler saves EAX before entering the inline asm, thus
delaying the INCL and thus avoiding the stall. However, you generally seem to
gain this at the expense of clobbering another register, say EDX.

So, for a function that just calls down_read twice, I get:
  20:   8b 44 24 04             mov    0x4(%esp,1),%eax
  24:   f0 ff 00                lock incl (%eax)
  27:   0f 88 22 00 00 00       js     4f <dr2+0x2f>
  2d:   f0 ff 00                lock incl (%eax)
  30:   0f 88 30 00 00 00       js     66 <dr2+0x46>
  36:   c3                      ret

And you get:
   0:   83 ec 08                sub    $0x8,%esp
   3:   8b 54 24 0c             mov    0xc(%esp,1),%edx
   7:   89 d0                   mov    %edx,%eax
   9:   f0 ff 02                lock incl (%edx)
   c:   0f 88 fc ff ff ff       js     e <dr+0xe>
  12:   89 d0                   mov    %edx,%eax
  14:   f0 ff 02                lock incl (%edx)
  17:   0f 88 0f 00 00 00       js     2c <dr2+0xc>
  1d:   58                      pop    %eax
  1e:   5a                      pop    %edx
  1f:   c3                      ret

Note also your asm constraints cause the compiler to eat an extra 8 bytes of
stack and then to pop it into registers to get rid of it. This is a gcc bug,
and will only hurt if the up_read and down_read are done in separate
subroutines.

| +static inline void __up_read(struct rw_semaphore *sem)
...
| +			     "jnz 1b\n\t"
| +			     "pushl %%ecx\n\t"
| +			     "call rwsem_wake\n\t"

Putting a PUSHL or two there hurt performance when I tried it, because, I'm
told, it introduces a pipeline stall.

> -	the common code automatically extends itself to support 2^32
>	concurrent sleepers on 64bit archs

You shouldn't do this in the XADD case, since you are explicitly using 32-bit
registers and instructions.

Actually, both of our generic cases allow 2^31 sleepers on a 32-bit arch, and
by changing mine to a long I can make it so we both support up to 2^63 on a
64-bit arch. However, I suspect that is overkill...

> -	there is no code duplication at all in supporting xchgadd common code
>       logic for other archs (and I prepared a skeleton to fill for the alpha)

Why not make it shareable? It doesn't have to be mandatory...

> So I'd suggest Linus to apply one of my above -8 patches for pre7. (I hope I
> won't need any secondary try)

I recommend against this at the moment (there's a bug in __up_write in his X86
optimised code).

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24  8:56     ` David Howells
@ 2001-04-24  9:49       ` Andrea Arcangeli
  2001-04-24 10:25         ` David Howells
  2001-04-24 10:17       ` Andrea Arcangeli
  1 sibling, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24  9:49 UTC (permalink / raw)
  To: David Howells; +Cc: torvalds, linux-kernel, davem

On Tue, Apr 24, 2001 at 09:56:11AM +0100, David Howells wrote:
> > Ok I finished now my asm optimized rwsemaphores and I improved a little my
> > spinlock based one but without touching the icache usage.
> 
> And I can break it. There's a very good reason the I changed __up_write() to
> use CMPXCHG instead of SUBL. I found a sequence of operations that locked up
> on this.

I'd love to hear this sequence. Certainly regression testing never generated
this sequence yet but yes that doesn't mean anything. Note that your slow path
is very different than mine.

My rule is really really simple:

-	the last one that moves from 1 to 0 the lower half of the word
	has to do 1 wakeup

I don't care who does that or where it does that.

That's all. No other real rules. Of course then both wakeup and down_failed
have to do their retire logic with the spinlock acquired to make sure to
serialize but that doesn't change the real rule. I don't feel the need
of any xchg to enforce additional serialization.

Anyways if you can provide a sequence that breaks my simple algorithm I will
love to know that, of course then it will be possible also to write a kernel
module to exploit it and reproduce the hang in real life. It is possible that I
missed something however right now the logic seems simple enough to be right (I
mean mine, yours plays with cmpxchg in a way that I still cannot understand).

> Unfortunately, I can't remember what it was. I do have it written down at
> home, I think, so I see about sending it to you later.

Ok thanks!

> > The main advantage of my rewrite are:
> >
> > -	my x86 version is visibly faster than yours in the write fast path and
> > 	it also saves icache because it's smaller
> 
> That's because your fast write path is wrong. You can get away without
> clobbering EDX and ECX, which I can't.
> 
> > (read fast path is reproducibly a bit faster too)
> 
> | +static inline void __down_read(struct rw_semaphore *sem)
> ...
> | +			     : "+m" (sem->count), "+a" (sem)
> 
> >From what I've been told, you're lucky here... you avoid a pipeline stall
> between whatever loads sem into EAX and the INCL that increments what it
> points to because the "+a" constraint says that EAX will be changed. This

Infact eax will be changed because it will be clobbered by the slow path, so I
have to. Infact you are not using the +a like I do there and you don't save EAX
explicitly on the stack I think that's "your" bug.

> means that the compiler saves EAX before entering the inline asm, thus
> delaying the INCL and thus avoiding the stall. However, you generally seem to
> gain this at the expense of clobbering another register, say EDX.

Again it's not a performance issue, the "+a" (sem) is a correctness issue
because the slow path will clobber it.

About the reason I'm faster than you in the down_write fast path is that I can
do the subl instead of the cmpxchg, you say this is my big fault, I think my
algorithm allows me to do that, but maybe I'm wrong.

> So, for a function that just calls down_read twice, I get:
>   20:   8b 44 24 04             mov    0x4(%esp,1),%eax
>   24:   f0 ff 00                lock incl (%eax)
>   27:   0f 88 22 00 00 00       js     4f <dr2+0x2f>
>   2d:   f0 ff 00                lock incl (%eax)
>   30:   0f 88 30 00 00 00       js     66 <dr2+0x46>
>   36:   c3                      ret
> 
> And you get:
>    0:   83 ec 08                sub    $0x8,%esp
>    3:   8b 54 24 0c             mov    0xc(%esp,1),%edx
>    7:   89 d0                   mov    %edx,%eax
>    9:   f0 ff 02                lock incl (%edx)
>    c:   0f 88 fc ff ff ff       js     e <dr+0xe>
>   12:   89 d0                   mov    %edx,%eax
>   14:   f0 ff 02                lock incl (%edx)
>   17:   0f 88 0f 00 00 00       js     2c <dr2+0xc>
>   1d:   58                      pop    %eax
>   1e:   5a                      pop    %edx
>   1f:   c3                      ret
> 
> Note also your asm constraints cause the compiler to eat an extra 8 bytes of
> stack and then to pop it into registers to get rid of it. This is a gcc bug,
> and will only hurt if the up_read and down_read are done in separate
> subroutines.
> 
> | +static inline void __up_read(struct rw_semaphore *sem)
> ...
> | +			     "jnz 1b\n\t"
> | +			     "pushl %%ecx\n\t"
> | +			     "call rwsem_wake\n\t"
> 
> Putting a PUSHL or two there hurt performance when I tried it, because, I'm
> told, it introduces a pipeline stall.

Unfortunatly I "have" to put the pushl there because I don't want to save %ecx
in the fast path (if I declare ecx clobbered it's even worse no?).

> > -	the common code automatically extends itself to support 2^32
> >	concurrent sleepers on 64bit archs
> 
> You shouldn't do this in the XADD case, since you are explicitly using 32-bit
> registers and instructions.

I said on 64bit archs. Of course on x86-64 there is xaddq and the rex
registers.

> Actually, both of our generic cases allow 2^31 sleepers on a 32-bit arch, and

Both my generic and asm code only allows 2^16 sleepers on 32bit archs, then I
don't know what happens, if it works it wasn't intentional ;).

> by changing mine to a long I can make it so we both support up to 2^63 on a
> 64-bit arch. However, I suspect that is overkill...
>
> > -	there is no code duplication at all in supporting xchgadd common code
> >       logic for other archs (and I prepared a skeleton to fill for the alpha)
> 
> Why not make it shareable? It doesn't have to be mandatory...

It isn't mandatory, if you don't want the xchgadd infrastructure then you don't
set even CONFIG_RWSEM_XCHGADD, no?

> I recommend against this at the moment (there's a bug in __up_write in his X86
> optimised code).

Certainly if there's a bug I agree ;). It will be really fun for me to get a
kernel module that deadlocks my algorithm. thanks for the auditing effort!

Andrea

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] rw_semaphores, optimisations try #3
  2001-04-23 22:23 ` [PATCH] rw_semaphores, optimisations try #3 Linus Torvalds
@ 2001-04-24 10:05   ` David Howells
  2001-04-24 15:40     ` Linus Torvalds
  0 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2001-04-24 10:05 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

Linus Torvalds <torvalds@transmeta.com> wrote:
> Note that the generic list structure already has support for "batching".
> It only does it for multiple adds right now (see the "list_splice"
> merging code), but there is nothing to stop people from doing it for
> multiple deletions too. The code is something like
> 
> 	static inline void list_remove_between(x,y)
> 	{
> 		n->next = y;
> 		y->prev = x;
> 	}
> 
> and notice how it's still just two unconditional stores for _any_ number
> of deleted entries.

Yes but the "struct rwsem_waiter" batch would have to be entirely deleted from
the list before any of them are woken, otherwise the waking processes may
destroy their "rwsem_waiter" blocks before they are dequeued (this destruction
is not guarded by a spinlock).

This would then reintroduce a second loop to find out which was the last block
we would be waking.

> Anyway, I've already applied your #2, how about a patch relative to that?

Attached.

David
=================================
diff -uNr linux-rwsem-opt2/include/linux/rwsem-spinlock.h linux/include/linux/rwsem-spinlock.h
--- linux-rwsem-opt2/include/linux/rwsem-spinlock.h	Tue Apr 24 10:51:58 2001
+++ linux/include/linux/rwsem-spinlock.h	Tue Apr 24 08:40:20 2001
@@ -1,6 +1,8 @@
 /* rwsem-spinlock.h: fallback C implementation
  *
  * Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ * - Derived partially from ideas by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
  */
 
 #ifndef _LINUX_RWSEM_SPINLOCK_H
@@ -11,6 +13,7 @@
 #endif
 
 #include <linux/spinlock.h>
+#include <linux/list.h>
 
 #ifdef __KERNEL__
 
@@ -19,14 +22,16 @@
 struct rwsem_waiter;
 
 /*
- * the semaphore definition
+ * the rw-semaphore definition
+ * - if activity is 0 then there are no active readers or writers
+ * - if activity is +ve then that is the number of active readers
+ * - if activity is -1 then there is one active writer
+ * - if wait_list is not empty, then there are processes waiting for the semaphore
  */
 struct rw_semaphore {
-	__u32			active;
-	__u32			waiting;
+	__s32			activity;
 	spinlock_t		wait_lock;
-	struct rwsem_waiter	*wait_front;
-	struct rwsem_waiter	**wait_back;
+	struct list_head	wait_list;
 #if RWSEM_DEBUG
 	int			debug;
 #endif
@@ -42,7 +47,7 @@
 #endif
 
 #define __RWSEM_INITIALIZER(name) \
-{ 0, 0, SPIN_LOCK_UNLOCKED, NULL, &(name).wait_front __RWSEM_DEBUG_INIT }
+{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
 
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff -uNr linux-rwsem-opt2/lib/rwsem-spinlock.c linux/lib/rwsem-spinlock.c
--- linux-rwsem-opt2/lib/rwsem-spinlock.c	Tue Apr 24 10:51:58 2001
+++ linux/lib/rwsem-spinlock.c	Tue Apr 24 08:40:20 2001
@@ -2,13 +2,15 @@
  *                                   implementation
  *
  * Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ * - Derived also from comments by Linus
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/module.h>
 
 struct rwsem_waiter {
-	struct rwsem_waiter	*next;
+	struct list_head	list;
 	struct task_struct	*task;
 	unsigned int		flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
@@ -19,7 +21,8 @@
 void rwsemtrace(struct rw_semaphore *sem, const char *str)
 {
 	if (sem->debug)
-		printk("[%d] %s({%d,%d})\n",current->pid,str,sem->active,sem->waiting);
+		printk("[%d] %s({%d,%d})\n",
+		       current->pid,str,sem->activity,list_empty(&sem->wait_list)?0:1);
 }
 #endif
 
@@ -28,11 +31,9 @@
  */
 void init_rwsem(struct rw_semaphore *sem)
 {
-	sem->active = 0;
-	sem->waiting = 0;
+	sem->activity = 0;
 	spin_lock_init(&sem->wait_lock);
-	sem->wait_front = NULL;
-	sem->wait_back = &sem->wait_front;
+	INIT_LIST_HEAD(&sem->wait_list);
 #if RWSEM_DEBUG
 	sem->debug = 0;
 #endif
@@ -48,60 +49,58 @@
  */
 static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem)
 {
-	struct rwsem_waiter *waiter, *next;
-	int woken, loop;
+	struct rwsem_waiter *waiter;
+	int woken;
 
 	rwsemtrace(sem,"Entering __rwsem_do_wake");
 
-	waiter = sem->wait_front;
-
-	if (!waiter)
-	  goto list_unexpectedly_empty;
-
-	next = NULL;
+	waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list);
 
 	/* try to grant a single write lock if there's a writer at the front of the queue
 	 * - we leave the 'waiting count' incremented to signify potential contention
 	 */
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE) {
-		sem->active++;
-		next = waiter->next;
+		sem->activity = -1;
+		list_del(&waiter->list);
 		waiter->flags = 0;
 		wake_up_process(waiter->task);
-		goto discard_woken_processes;
+		goto out;
 	}
 
 	/* grant an infinite number of read locks to the readers at the front of the queue */
 	woken = 0;
 	do {
-		woken++;
-		waiter = waiter->next;
-	} while (waiter && waiter->flags&RWSEM_WAITING_FOR_READ);
-
-	sem->active += woken;
-	sem->waiting -= woken;
-
-	waiter = sem->wait_front;
-	for (loop=woken; loop>0; loop--) {
-		next = waiter->next;
+		list_del(&waiter->list);
 		waiter->flags = 0;
 		wake_up_process(waiter->task);
-		waiter = next;
-	}
+		woken++;
+		if (list_empty(&sem->wait_list))
+			break;
+		waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list);
+	} while (waiter->flags&RWSEM_WAITING_FOR_READ);
 
- discard_woken_processes:
-	sem->wait_front = next;
-	if (!next) sem->wait_back = &sem->wait_front;
+	sem->activity += woken;
 
  out:
 	rwsemtrace(sem,"Leaving __rwsem_do_wake");
 	return sem;
+}
+
+/*
+ * wake a single writer
+ */
+static inline struct rw_semaphore *__rwsem_wake_one_writer(struct rw_semaphore *sem)
+{
+	struct rwsem_waiter *waiter;
+
+	sem->activity = -1;
 
- list_unexpectedly_empty:
-	printk("__rwsem_do_wake(): wait_list unexpectedly empty\n");
-	printk("[%d] %p = { %d, %d })\n",current->pid,sem,sem->active,sem->waiting);
-	BUG();
-	goto out;
+	waiter = list_entry(sem->wait_list.next,struct rwsem_waiter,list);
+	list_del(&waiter->list);
+
+	waiter->flags = 0;
+	wake_up_process(waiter->task);
+	return sem;
 }
 
 /*
@@ -110,29 +109,27 @@
 void __down_read(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
-	struct task_struct *tsk = current;
+	struct task_struct *tsk;
 
 	rwsemtrace(sem,"Entering __down_read");
 
 	spin_lock(&sem->wait_lock);
 
-	if (!sem->waiting) {
+	if (sem->activity>=0 && list_empty(&sem->wait_list)) {
 		/* granted */
-		sem->active++;
+		sem->activity++;
 		spin_unlock(&sem->wait_lock);
 		goto out;
 	}
-	sem->waiting++;
 
+	tsk = current;
 	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
-	waiter.next = NULL;
 	waiter.task = tsk;
 	waiter.flags = RWSEM_WAITING_FOR_READ;
 
-	*sem->wait_back = &waiter; /* add to back of queue */
-	sem->wait_back = &waiter.next;
+	list_add_tail(&waiter.list,&sem->wait_list);
 
 	/* we don't need to touch the semaphore struct anymore */
 	spin_unlock(&sem->wait_lock);
@@ -158,30 +155,27 @@
 void __down_write(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
-	struct task_struct *tsk = current;
+	struct task_struct *tsk;
 
 	rwsemtrace(sem,"Entering __down_write");
 
 	spin_lock(&sem->wait_lock);
 
-	if (!sem->waiting && !sem->active) {
+	if (sem->activity==0 && list_empty(&sem->wait_list)) {
 		/* granted */
-		sem->active++;
-		sem->waiting++;
+		sem->activity = -1;
 		spin_unlock(&sem->wait_lock);
 		goto out;
 	}
-	sem->waiting++;
 
+	tsk = current;
 	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
-	waiter.next = NULL;
 	waiter.task = tsk;
 	waiter.flags = RWSEM_WAITING_FOR_WRITE;
 
-	*sem->wait_back = &waiter; /* add to back of queue */
-	sem->wait_back = &waiter.next;
+	list_add_tail(&waiter.list,&sem->wait_list);
 
 	/* we don't need to touch the semaphore struct anymore */
 	spin_unlock(&sem->wait_lock);
@@ -209,8 +203,8 @@
 
 	spin_lock(&sem->wait_lock);
 
-	if (--sem->active==0 && sem->waiting)
-		__rwsem_do_wake(sem);
+	if (--sem->activity==0 && !list_empty(&sem->wait_list))
+		sem = __rwsem_wake_one_writer(sem);
 
 	spin_unlock(&sem->wait_lock);
 
@@ -226,9 +220,9 @@
 
 	spin_lock(&sem->wait_lock);
 
-	sem->waiting--;
-	if (--sem->active==0 && sem->waiting)
-		__rwsem_do_wake(sem);
+	sem->activity = 0;
+	if (!list_empty(&sem->wait_list))
+		sem = __rwsem_do_wake(sem);
 
 	spin_unlock(&sem->wait_lock);
 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24  8:56     ` David Howells
  2001-04-24  9:49       ` Andrea Arcangeli
@ 2001-04-24 10:17       ` Andrea Arcangeli
  2001-04-24 10:33         ` David Howells
  1 sibling, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24 10:17 UTC (permalink / raw)
  To: David Howells; +Cc: torvalds, linux-kernel, davem

On Tue, Apr 24, 2001 at 09:56:11AM +0100, David Howells wrote:
> | +			     : "+m" (sem->count), "+a" (sem)
				     ^^^^^^^^^^ I think you were comenting on
					        the +m not +a ok
> 
> >From what I've been told, you're lucky here... you avoid a pipeline stall

I see what you meant here and no, I'm not lucky, I thought about that. gcc
2.95.* seems smart enough to produce (%%eax) that you hardcoded when the sem is
not a constant (I'm not clobbering another register, if it does it's stupid and
I consider this a compiler mistake). I tried with a variable pointer and gcc
as I expected generated the (%%eax) but instead when it's a constant like in
the bench my way it avoids to stall the pipeline by using the constant address
for the locked incl, exactly as you said and that's probably why I beat you on
the down read fast path too.  (I also benchmarked with a variable semaphore and
it was running a little slower)

Andrea

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24  9:49       ` Andrea Arcangeli
@ 2001-04-24 10:25         ` David Howells
  2001-04-24 10:44           ` Andrea Arcangeli
  0 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2001-04-24 10:25 UTC (permalink / raw)
  To: Andrea Arcangeli, David Howells; +Cc: torvalds, linux-kernel

> I'd love to hear this sequence. Certainly regression testing never generated
> this sequence yet but yes that doesn't mean anything. Note that your slow
> path is very different than mine.

One of my testcases fell over on it...

> I don't feel the need of any xchg to enforce additional serialization.

I don't use XCHG anywhere... do you mean CMPXCHG?

> yours plays with cmpxchg in a way that I still cannot understand

It tries not to let the "active count" transition 1->0 happen if it can avoid
it (ie: it would rather wake someone up and not decrement the count). It also
only calls __rwsem_do_wake() if the caller manages to transition the active
count 0->1.

This avoids another subtle bug that I think I have a sequence written down for
too.

> Infact eax will be changed because it will be clobbered by the slow path, so
> I have to. Infact you are not using the +a like I do there and you don't
> save EAX explicitly on the stack I think that's "your" bug.

Not so... my down-failed slowpath functions return sem in EAX.

> Again it's not a performance issue, the "+a" (sem) is a correctness issue
> because the slow path will clobber it.

There must be a performance issue too, otherwise our read up/down fastpaths
are the same. Which clearly they're not.

> About the reason I'm faster than you in the down_write fast path is that I
> can do the subl instead of the cmpxchg, you say this is my big fault, I
> think my algorithm allows me to do that, but maybe I'm wrong.

I used to do that.

> Unfortunatly I "have" to put the pushl there because I don't want to save
> %ecx in the fast path (if I declare ecx clobbered it's even worse no?).

It benchmarked faster without it for me. I suspect this will be different on
different CPUs anyway.

I'm going to have to have a play with Intel's VTUNE program and see what it
says.

> I said on 64bit archs. Of course on x86-64 there is xaddq and the rex
> registers.

But the instructions you've specified aren't 64-bit.

> It isn't mandatory, if you don't want the xchgadd infrastructure then you
> don't set even CONFIG_RWSEM_XCHGADD, no?

My point is that mine isn't mandatory either... You just don't set the XADD
config option.

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 10:17       ` Andrea Arcangeli
@ 2001-04-24 10:33         ` David Howells
  2001-04-24 10:46           ` Andrea Arcangeli
  0 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2001-04-24 10:33 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: linux-kernel


> I see what you meant here and no, I'm not lucky, I thought about that. gcc x
> 2.95.* seems smart enough to produce (%%eax) that you hardcoded when the
> sem is not a constant (I'm not clobbering another register, if it does it's
> stupid and I consider this a compiler mistake).

It is a compiler mistake... the compiler clobbers another register for
you. The compiler does not, however, know about timing issues with the
contents of the inline assembly... otherwise it'd stick a delay in front of
the XADD in my stuff.

> I tried with a variable pointer and gcc as I expected generated the (%%eax)
> but instead when it's a constant like in the bench my way it avoids to stall
> the pipeline by using the constant address for the locked incl, exactly as
> you said and that's probably why I beat you on the down read fast path too.
> (I also benchmarked with a variable semaphore and it was running a little
> slower)

*grin* Fun ain't it... Try it on a dual athlon or P4 and the answer may come
out differently.

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 10:25         ` David Howells
@ 2001-04-24 10:44           ` Andrea Arcangeli
  2001-04-24 13:07             ` David Howells
  2001-04-24 15:49             ` Linus Torvalds
  0 siblings, 2 replies; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24 10:44 UTC (permalink / raw)
  To: David Howells; +Cc: David Howells, torvalds, linux-kernel

On Tue, Apr 24, 2001 at 11:25:23AM +0100, David Howells wrote:
> > I'd love to hear this sequence. Certainly regression testing never generated
> > this sequence yet but yes that doesn't mean anything. Note that your slow
> > path is very different than mine.
> 
> One of my testcases fell over on it...

so you reproduced a deadlock with my patch applied, or you are saying
you discovered that case with one of you testcases?

I'm asking because I'm running regression testing on my patch for several hours
now and it didn't showed up anything wrong yet (however I'm mainly using my
rwsem program to better stress the rwsem in an interesting environment with
different timings as well, but your stress tests by default looks less
aggressive than my rwsem, infact the bug I found in your code was never
triggered by your testcases until I changed them).

> > I don't feel the need of any xchg to enforce additional serialization.
> 
> I don't use XCHG anywhere... do you mean CMPXCHG?

yes of course, you use rwsem_cmpxchgw that is unnecessary.

> 
> > yours plays with cmpxchg in a way that I still cannot understand
> 
> It tries not to let the "active count" transition 1->0 happen if it can avoid

I don't try to avoid it anytime. I let it to happen all the time it wants.
Immediatly as soon as it can happen.

> it (ie: it would rather wake someone up and not decrement the count). It also

I always retire first.

> only calls __rwsem_do_wake() if the caller manages to transition the active
> count 0->1.

I call rwsem_wake if while retiring the counter gone down to 0 so it's
time to wakeup somebody according to my rule, then I handle the additional
synchroniziation between 0->1 inside the rwsem_wake if it fails I break
the rwsem_wake in the middle while doing the usual retire check from 1 to 0.
That's why up_write is a no brainer for me as far I can see and that's probably
why I can provide a much faster up_write fast path as benchmark shows.

> > Infact eax will be changed because it will be clobbered by the slow path, so
> > I have to. Infact you are not using the +a like I do there and you don't
> > save EAX explicitly on the stack I think that's "your" bug.
> 
> Not so... my down-failed slowpath functions return sem in EAX.

Ah ok, I didn't had such idea, I will optimize that bit in my code now.

> > Again it's not a performance issue, the "+a" (sem) is a correctness issue
> > because the slow path will clobber it.
> 
> There must be a performance issue too, otherwise our read up/down fastpaths
> are the same. Which clearly they're not.

I guess I'm faster because I avoid the pipeline stall using "+m" (sem->count)
that is written as a constant, that was definitely intentional idea. For
me right now the "+a" (sem) is a correctness issue that is hurting me (probably
not in the benchmark), and I can optimize it the same way you did.

> > I said on 64bit archs. Of course on x86-64 there is xaddq and the rex
> > registers.
> 
> But the instructions you've specified aren't 64-bit.

And infact on 32bit arch I use 32bit regs xaddl and 2^16 max sleepers.

> > It isn't mandatory, if you don't want the xchgadd infrastructure then you
> > don't set even CONFIG_RWSEM_XCHGADD, no?
> 
> My point is that mine isn't mandatory either... You just don't set the XADD
> config option.

My point is that when you set XADD you still force duplication of the header
stuff into the the asm/*

Andrea

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 10:33         ` David Howells
@ 2001-04-24 10:46           ` Andrea Arcangeli
  2001-04-24 12:19             ` Andrea Arcangeli
  0 siblings, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24 10:46 UTC (permalink / raw)
  To: David Howells; +Cc: linux-kernel

On Tue, Apr 24, 2001 at 11:33:13AM +0100, David Howells wrote:
> *grin* Fun ain't it... Try it on a dual athlon or P4 and the answer may come
> out differently.

compile with -mathlon and the compiler then should generate (%%eax) if that's
faster even if the sem is a constant, that's a compiler issue IMHO, I just give the
compiler the flexibility to do the right choice.

Andrea

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 10:46           ` Andrea Arcangeli
@ 2001-04-24 12:19             ` Andrea Arcangeli
  2001-04-24 13:10               ` Andrea Arcangeli
  0 siblings, 1 reply; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24 12:19 UTC (permalink / raw)
  To: David Howells; +Cc: linux-kernel

There is a bug in both the C version and asm version of my rwsem
and it is the slow path where I forgotten to drop the _irq part
from the spinlock calls ;) Silly bug. (I inherit it also in the
asm fast path version because I started hacking the same C slow path)

I catched it now because it locks hard my alpha as soon as I play with
any rwsem testcase, not sure why x86 is apparently immune by the hard lockup.

then I also added your trick of returning the semaphore so I can declare "a"
(sem) as read only (that is an improvement for the fast path).

Because of that changes I rerun all the benchmarks. I finished now to
re-benchmark the asm version as it runs even faster than before in the write
contention, the other numbers are basically unchanged. the read down fast path
now runs exactly like yours (so yes it seems the "+a" was giving a no sensical
improvement to my code for the down read fast path).

Of course my down write fast path remains significantly faster than yours and that
really make sense because my smarter algorithm allows me to avoid all your
cmpxchg stuff.

I'm starting the benchmarks of the C version and I will post a number update
and a new patch in a few minutes.

If you can ship me the testcase (also theorical) that breaks my algorihtm in the
next few minutes that would help.

Andrea

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 10:44           ` Andrea Arcangeli
@ 2001-04-24 13:07             ` David Howells
  2001-04-24 13:59               ` Andrea Arcangeli
  2001-04-24 15:49             ` Linus Torvalds
  1 sibling, 1 reply; 19+ messages in thread
From: David Howells @ 2001-04-24 13:07 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: torvalds, linux-kernel, dhowells

> so you reproduced a deadlock with my patch applied, or you are saying
> you discovered that case with one of you testcases?

It was my implementation that triggered it (I haven't tried it with yours),
but the bug occurred because the SUBL happened to make the change outside of
the spinlocked region in the slowpath at the same time as the wakeup routine
was running on the other CPU.

I'll have a look at the sequence and make sure that it does actually apply to
your implementation. It may not... but it doesn't hurt to check.

The thing occurred with one of my simple testcases, but only happened once in
a number of runs, fortunately whilst I had the rwsemtrace()'s enabled.

> yes of course, you use rwsem_cmpxchgw that is unnecessary.

Actually, I use this to try and avoid the following loop that you've got in
your code:

> +static void __rwsem_wake(struct rw_semaphore *sem)
...
> +	again:
> +		count = rwsem_xchgadd(-wait->retire, &sem->count);
> +		if (!wake_read && (count & RWSEM_READ_MASK)) {
> +			count = rwsem_xchgadd(wait->retire, &sem->count);
> +			if ((count & RWSEM_READ_MASK) == 1)
> +				goto again;

I now only have that loop in the rwsem_up_write_wake() function.

But! In mine, if __up_write()'s CMPXCHG failed, then it has also read the
counter, which it then passes as an argument to rwsem_up_write_wake(). This
means I can avoid the aforementioned loop in most cases, I suspect, by seeing
if the active counter was 1 at the time of the failed CMPXCHG.

This also means that if a ex-writer wakes up a writer-to-be, the only atomic
instruction performed on sem->count is the CMPXCHG in __up_write().

For the ex-writer waking up readers case, we have to perform an additional
XADD, but this must be done before anyone is woken up, else __up_read() can
get called to decrement the count before we've incremented it. I count the
number of things I want to wake up, adjust the count (one LOCKED ADD only) and
then wake the batch up.

You dive into a LOCKED XADD/XADD loop for each process you wake, which in the
best case will give you one LOCKED XADD per process.

Looking again at rwsem_up_read_wake()... I can't actually eliminate the
CMPXCHG there because the active count has already been decremented, and so
will need to be incremented again prior to a wake-up being performed. However,
if the increment was performed and someone else had incremented the count in
the meantime, we have to decrement it again... but this can cause a transition
back to zero, which we have to check for... and if that occurred...

You get the idea, anyway.

Oh yes... this idea should be faster on SPARC/SPARC64 and IA64 which don't
have useful XADD instructions (FETCHADD can only use small immediate values),
only CMPXCHG/CAS are really useful there.

> I guess I'm faster because I avoid the pipeline stall using "+m" (sem->count)
> that is written as a constant, that was definitely intentional idea.

"+m" doesn't avoid the stall. That's just shorthand for:

	: "=m"(sem->count) : "m"(sem->count)

which is what mine has.

"a+" luckily causes the avoidance by saying EAX gets clobbered, causing the
compiler to save via an additional register. Note that the compiler saves
anyway, even if the register will be discarded after being restored - yuk!

I think this one will depend on the surrounding code anyway. I suspect in some
chunks of C mine will be faster, and in others yours will be, all depending on
when EAX is loaded.

> My point is that when you set XADD you still force duplication of the header
> stuff into the the asm/*

If you want to use my XADD algorithm, then I think that's going to be fair
enough for now. You have to provide some extra routines anyway.

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 12:19             ` Andrea Arcangeli
@ 2001-04-24 13:10               ` Andrea Arcangeli
  0 siblings, 0 replies; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24 13:10 UTC (permalink / raw)
  To: David Howells; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2397 bytes --]

On Tue, Apr 24, 2001 at 02:19:28PM +0200, Andrea Arcangeli wrote:
> I'm starting the benchmarks of the C version and I will post a number update
> and a new patch in a few minutes.

(sorry for the below wrap around, just grow your terminal to read it stright)

				aa RW (reads) aa RW (writes) R1	      R2      RO      W1       WO
dh generic out of line try3	5842496	      3016649        13309316 5010534 3850228 13012701 1825789
dh generic out of line try3 #2	5823381	      3006773	     13311722 5023185 3845954 13021716 1802560
aa generic out of line buggy	6061713	      3129801	     14251500 4972932 4253814 13652385 1751857
aa generic out of line #2 buggy	6099046	      3148951	     14265389 4936267 4253432 13632914 1753608
aa generic out of line	        6133756       3167138	     14244991 5122637 4254504 13656896 1797627
aa generic out of line #2	6093079	      3145761	     14259141 5126506 4254532 13658447 1803505

dh x86 asm in line try3 	5789542       2989478	     16922653 5650211 4956250 15431139 813756
dh x86 asm in line try3 #2	5801777	      2995669	     16946132 5647272 4959828 15439790 816005
aa x86 asm in line buggy	5736978	      2962325	     17044842 5603085 4831655 16064773 860791
aa x86 asm in line #2 buggy	5799163	      2994404	     17053405 5601647 4833518 16037018 864103
aa generic in line	        5706875       2946931	     16943038 5644018 4837576 16085859 870833
aa generic in line #2	        5755126       2971578	     16924502 5639379 4836111 16073916 873499

I tagged my previous rows as "buggy", I left your try#3 at the start of each
version and I added at the end the new numbers with the -9 fixed revision of my
rwsem at the end. new graph is attached.

So nothing interesting is changed in the numbers as far I can tell after the
fixes and improvement of the fast path using "a" instead of "+a".

Unless you can provide a testcase that fails with my smarter and more compact
algorithm I suggest to Linus to merge my code into pre7.

Against pre6:

	ftp://ftp.kernel.org/pub/linux/kernel/people/andrea/patches/v2.4/2.4.4pre6/rwsem-9

against David's try2:

	ftp://ftp.kernel.org/pub/linux/kernel/people/andrea/patches/v2.4/2.4.4pre6/rwsem-9-against-dh-try2

against David's try3:

	ftp://ftp.kernel.org/pub/linux/kernel/people/andrea/patches/v2.4/2.4.4pre6/rwsem-9-against-dh-try3

I will keep doing regression testing in the next hours of course.

Andrea

[-- Attachment #2: rwsem-9.png --]
[-- Type: image/png, Size: 12674 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 13:07             ` David Howells
@ 2001-04-24 13:59               ` Andrea Arcangeli
  0 siblings, 0 replies; 19+ messages in thread
From: Andrea Arcangeli @ 2001-04-24 13:59 UTC (permalink / raw)
  To: David Howells; +Cc: torvalds, linux-kernel, dhowells

On Tue, Apr 24, 2001 at 02:07:47PM +0100, David Howells wrote:
> It was my implementation that triggered it (I haven't tried it with yours),
> but the bug occurred because the SUBL happened to make the change outside of
> the spinlocked region in the slowpath at the same time as the wakeup routine
> was running on the other CPU.

That problem seems not to apply to my slowpath algorithm.

> I'll have a look at the sequence and make sure that it does actually apply to
> your implementation. It may not... but it doesn't hurt to check.

indeed, I'd be glad if you could verify of course, thanks!

> > yes of course, you use rwsem_cmpxchgw that is unnecessary.
> 
> Actually, I use this to try and avoid the following loop that you've got in
> your code:
> 
> > +static void __rwsem_wake(struct rw_semaphore *sem)
> ...
> > +	again:
> > +		count = rwsem_xchgadd(-wait->retire, &sem->count);
> > +		if (!wake_read && (count & RWSEM_READ_MASK)) {
> > +			count = rwsem_xchgadd(wait->retire, &sem->count);
> > +			if ((count & RWSEM_READ_MASK) == 1)
> > +				goto again;
> 
> I now only have that loop in the rwsem_up_write_wake() function.

I don't care about the slow path performance, mainly if I to make it faster I have
to slowdown up_write with an cmpxchg too.

> But! In mine, if __up_write()'s CMPXCHG failed, then it has also read the
> counter, which it then passes as an argument to rwsem_up_write_wake(). This
> means I can avoid the aforementioned loop in most cases, I suspect, by seeing
> if the active counter was 1 at the time of the failed CMPXCHG.
> 
> This also means that if a ex-writer wakes up a writer-to-be, the only atomic
> instruction performed on sem->count is the CMPXCHG in __up_write().
> 
> For the ex-writer waking up readers case, we have to perform an additional
> XADD, but this must be done before anyone is woken up, else __up_read() can
> get called to decrement the count before we've incremented it. I count the
> number of things I want to wake up, adjust the count (one LOCKED ADD only) and
> then wake the batch up.
> 
> You dive into a LOCKED XADD/XADD loop for each process you wake, which in the
> best case will give you one LOCKED XADD per process.

I don't dive into the loop for each process I wake, I only execute one locked
xadd for each prcess I wake (it's the de-retire logic). As I also execute
1 xadd for each process that goes to sleep in schedule() in the slow path
(that's the retire logic). That's really clean beahiour of my slow path I think.

> Looking again at rwsem_up_read_wake()... I can't actually eliminate the
> CMPXCHG there because the active count has already been decremented, and so
> will need to be incremented again prior to a wake-up being performed. However,
> if the increment was performed and someone else had incremented the count in
> the meantime, we have to decrement it again... but this can cause a transition
> back to zero, which we have to check for... and if that occurred...

I think what you describe is similar of what I'm doing in my algorithm (and this way
I don't need any chmxchg in both slow and fast path).

> You get the idea, anyway.
>
> Oh yes... this idea should be faster on SPARC/SPARC64 and IA64 which don't
> have useful XADD instructions (FETCHADD can only use small immediate values),
> only CMPXCHG/CAS are really useful there.
> 
> > I guess I'm faster because I avoid the pipeline stall using "+m" (sem->count)
> > that is written as a constant, that was definitely intentional idea.
> 
> "+m" doesn't avoid the stall. That's just shorthand for:
> 
> 	: "=m"(sem->count) : "m"(sem->count)
> 
> which is what mine has.

You have it yes, but you don't use it ;). While I use it so my compiler can
choose if to put dereference the constant in the incl/etc or to put the (%%eax)
that you hardcoded.

> "a+" luckily causes the avoidance by saying EAX gets clobbered, causing the
> compiler to save via an additional register. Note that the compiler saves
> anyway, even if the register will be discarded after being restored - yuk!
> 
> I think this one will depend on the surrounding code anyway. I suspect in some
> chunks of C mine will be faster, and in others yours will be, all depending on
> when EAX is loaded.

Ok but you save only in the slow path so the fast path should not be affected.
So doing "a" instead of "+a" shouldn't be able to hurt the fast path, it should
only help I think (that's why I changed it too).

> > My point is that when you set XADD you still force duplication of the header
> > stuff into the the asm/*
> 
> If you want to use my XADD algorithm, then I think that's going to be fair
> enough for now. You have to provide some extra routines anyway.

from the arch part you "only" have to provide some extra routine (not the
headers), that's why I included <asm/rwsem_xchgadd.h> from <linux/rwsem_xchgadd.h> ;)

Andrea

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] rw_semaphores, optimisations try #3
  2001-04-24 10:05   ` David Howells
@ 2001-04-24 15:40     ` Linus Torvalds
  2001-04-24 16:37       ` David Howells
  0 siblings, 1 reply; 19+ messages in thread
From: Linus Torvalds @ 2001-04-24 15:40 UTC (permalink / raw)
  To: David Howells; +Cc: linux-kernel


On Tue, 24 Apr 2001, David Howells wrote:
> 
> Yes but the "struct rwsem_waiter" batch would have to be entirely deleted from
> the list before any of them are woken, otherwise the waking processes may
> destroy their "rwsem_waiter" blocks before they are dequeued (this destruction
> is not guarded by a spinlock).

Look again.

Yes, they may destroy the list, but nobody cares.

Why?

 - nobody will look up the list because we do have the spinlock at this
   point, so a destroyed list doesn't actually _matter_ to anybody

   You were actually depending on this earlier, although maybe not on
   purpose.

 - list_remove_between() doesn't care about the integrity of the entries
   it destroys. It only uses, and only changes, the entries that are still
   on the list.

Subtlety is fine. It might warrant a comment, though.

		Linus


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3]
  2001-04-24 10:44           ` Andrea Arcangeli
  2001-04-24 13:07             ` David Howells
@ 2001-04-24 15:49             ` Linus Torvalds
  1 sibling, 0 replies; 19+ messages in thread
From: Linus Torvalds @ 2001-04-24 15:49 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: David Howells, David Howells, linux-kernel


On Tue, 24 Apr 2001, Andrea Arcangeli wrote:
> 
> > > Again it's not a performance issue, the "+a" (sem) is a correctness issue
> > > because the slow path will clobber it.
> > 
> > There must be a performance issue too, otherwise our read up/down fastpaths
> > are the same. Which clearly they're not.
> 
> I guess I'm faster because I avoid the pipeline stall using "+m" (sem->count)
> that is written as a constant, that was definitely intentional idea.

Guys.

You're arguing over stalls that are (a) compiler-dependent and (b) in code
that doesn't hapeen _anywhere_ except in the specific benchmark you're
using.

Get over it.

 - The benchmark may use constant addresses. None of the kernel does. The
   benchmark is fairly meaningless in this regard.

 - the stalls will almost certainly depend on the code around the thing,
   and will also depend on the compiler version. If you're down to
   haggling about issues like that, then there is no real difference
   between the code.

So calm down guys. And improving the benchmark might not be a bad idea.

		Linus


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] rw_semaphores, optimisations try #3
  2001-04-24 15:40     ` Linus Torvalds
@ 2001-04-24 16:37       ` David Howells
  0 siblings, 0 replies; 19+ messages in thread
From: David Howells @ 2001-04-24 16:37 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel

Linus Torvalds <torvalds@transmeta.com> wrote:
> - nobody will look up the list because we do have the spinlock at this
>   point, so a destroyed list doesn't actually _matter_ to anybody

I suppose that it'll be okay, provided I take care not to access a block for a
task I've just woken up.

> - list_remove_between() doesn't care about the integrity of the entries
>   it destroys. It only uses, and only changes, the entries that are still
>   on the list.

True. Okay, I can change it to use that.

David

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2001-04-24 16:37 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2001-04-23 20:35 [PATCH] rw_semaphores, optimisations try #3 D.W.Howells
2001-04-23 21:34 ` Andrea Arcangeli
2001-04-24  4:56   ` rwsem benchmark [was Re: [PATCH] rw_semaphores, optimisations try #3] Andrea Arcangeli
2001-04-24  8:56     ` David Howells
2001-04-24  9:49       ` Andrea Arcangeli
2001-04-24 10:25         ` David Howells
2001-04-24 10:44           ` Andrea Arcangeli
2001-04-24 13:07             ` David Howells
2001-04-24 13:59               ` Andrea Arcangeli
2001-04-24 15:49             ` Linus Torvalds
2001-04-24 10:17       ` Andrea Arcangeli
2001-04-24 10:33         ` David Howells
2001-04-24 10:46           ` Andrea Arcangeli
2001-04-24 12:19             ` Andrea Arcangeli
2001-04-24 13:10               ` Andrea Arcangeli
2001-04-23 22:23 ` [PATCH] rw_semaphores, optimisations try #3 Linus Torvalds
2001-04-24 10:05   ` David Howells
2001-04-24 15:40     ` Linus Torvalds
2001-04-24 16:37       ` David Howells

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).