* [rfc][patch] queued spinlocks (i386)
@ 2007-03-23  8:59 Nick Piggin
  2007-03-23  9:40 ` Eric Dumazet
  2007-03-23 10:04 ` Ingo Molnar
  0 siblings, 2 replies; 25+ messages in thread
From: Nick Piggin @ 2007-03-23  8:59 UTC (permalink / raw)
  To: Linux Kernel Mailing List; +Cc: Ravikiran G Thirumalai, Ingo Molnar


Implement queued spinlocks for i386. This shouldn't increase the size of
the spinlock structure, while still being able to handle 2^16 CPUs.

Not completely implemented in assembly yet, to make the algorithm a bit
clearer.

The queued spinlock has 2 fields, a head and a tail, which are indexes
into a FIFO of waiting CPUs. To take a spinlock, a CPU performs an
"atomic_inc_return" on the head index, and keeps the returned value as
a ticket. The CPU then spins until the tail index is equal to that
ticket.

To unlock a spinlock, the tail index is incremented (this can be non
atomic, because only the lock owner will modify tail).
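
For illustration, the algorithm in plain C looks roughly like this (a
sketch only, not the patch itself: __sync_fetch_and_add is a GCC builtin
standing in for the kernel's atomic ops, and cpu_relax() is assumed to
be the usual rep;nop with a compiler barrier):

	typedef struct {
		unsigned short qhead;	/* next free ticket */
		unsigned short qtail;	/* ticket now being served */
	} ticket_lock_t;

	static void ticket_lock(ticket_lock_t *lock)
	{
		/* atomically take a ticket from the head index */
		unsigned short ticket = __sync_fetch_and_add(&lock->qhead, 1);

		/* spin until the tail index reaches our ticket */
		while (*(volatile unsigned short *)&lock->qtail != ticket)
			cpu_relax();
	}

	static void ticket_unlock(ticket_lock_t *lock)
	{
		/* only the owner writes qtail, so no atomic op is needed
		   (a real version also wants release ordering here) */
		lock->qtail++;
	}

The patch below does the same thing with a single locked xaddw.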

Implementation inefficiencies aside, this change should have little
effect on performance for uncontended locks, but will have quite a
large cost for highly contended locks [O(N) cacheline transfers vs
O(1) per lock acquisition, where N is the number of CPUs contending].
The benefit is that contended locks will not cause any starvation.

Just an idea. Big NUMA hardware seems to have fairness logic that
prevents starvation for the regular spinlock logic. But it might be
interesting for the -rt kernel or systems with starvation issues.


Index: linux-2.6/include/asm-i386/spinlock.h
===================================================================
--- linux-2.6.orig/include/asm-i386/spinlock.h
+++ linux-2.6/include/asm-i386/spinlock.h
@@ -29,69 +29,23 @@
 
 static inline int __raw_spin_is_locked(raw_spinlock_t *x)
 {
-	return *(volatile signed char *)(&(x)->slock) <= 0;
+	return unlikely(x->qhead != x->qtail);
 }
 
 static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
-	asm volatile("\n1:\t"
-		     LOCK_PREFIX " ; decb %0\n\t"
-		     "jns 3f\n"
-		     "2:\t"
-		     "rep;nop\n\t"
-		     "cmpb $0,%0\n\t"
-		     "jle 2b\n\t"
-		     "jmp 1b\n"
-		     "3:\n\t"
-		     : "+m" (lock->slock) : : "memory");
-}
+	unsigned short pos = 1;
 
-/*
- * It is easier for the lock validator if interrupts are not re-enabled
- * in the middle of a lock-acquire. This is a performance feature anyway
- * so we turn it off:
- *
- * NOTE: there's an irqs-on section here, which normally would have to be
- * irq-traced, but on CONFIG_TRACE_IRQFLAGS we never use this variant.
- */
-#ifndef CONFIG_PROVE_LOCKING
-static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
-{
-	asm volatile(
-		"\n1:\t"
-		LOCK_PREFIX " ; decb %[slock]\n\t"
-		"jns 5f\n"
-		"2:\t"
-		"testl $0x200, %[flags]\n\t"
-		"jz 4f\n\t"
-		STI_STRING "\n"
-		"3:\t"
-		"rep;nop\n\t"
-		"cmpb $0, %[slock]\n\t"
-		"jle 3b\n\t"
-		CLI_STRING "\n\t"
-		"jmp 1b\n"
-		"4:\t"
-		"rep;nop\n\t"
-		"cmpb $0, %[slock]\n\t"
-		"jg 1b\n\t"
-		"jmp 4b\n"
-		"5:\n\t"
-		: [slock] "+m" (lock->slock)
-		: [flags] "r" (flags)
-	 	  CLI_STI_INPUT_ARGS
-		: "memory" CLI_STI_CLOBBERS);
+	asm volatile(LOCK_PREFIX "xaddw %0, %1\n\t"
+		     : "+r" (pos), "+m" (lock->qhead) : : "memory");
+	while (unlikely(pos != lock->qtail))
+		cpu_relax();
 }
-#endif
 
 static inline int __raw_spin_trylock(raw_spinlock_t *lock)
 {
-	char oldval;
-	asm volatile(
-		"xchgb %b0,%1"
-		:"=q" (oldval), "+m" (lock->slock)
-		:"0" (0) : "memory");
-	return oldval > 0;
+	unsigned short qtail = lock->qtail;
+	return likely(cmpxchg(&lock->qhead, qtail, qtail+1) == qtail);
 }
 
 /*
@@ -105,18 +59,14 @@ static inline int __raw_spin_trylock(raw
 
 static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-	asm volatile("movb $1,%0" : "+m" (lock->slock) :: "memory");
+	asm volatile("addw $1,%0" : "+m" (lock->qtail) :: "memory");
 }
 
 #else
 
 static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 {
-	char oldval = 1;
-
-	asm volatile("xchgb %b0, %1"
-		     : "=q" (oldval), "+m" (lock->slock)
-		     : "0" (oldval) : "memory");
+	asm volatile(LOCK_PREFIX "addw $1,%0" : "+m" (lock->qtail) :: "memory");
 }
 
 #endif
Index: linux-2.6/include/asm-i386/spinlock_types.h
===================================================================
--- linux-2.6.orig/include/asm-i386/spinlock_types.h
+++ linux-2.6/include/asm-i386/spinlock_types.h
@@ -6,10 +6,11 @@
 #endif
 
 typedef struct {
-	unsigned int slock;
+	unsigned short qhead;
+	unsigned short qtail;
 } raw_spinlock_t;
 
-#define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
+#define __RAW_SPIN_LOCK_UNLOCKED	{ 0, 0 }
 
 typedef struct {
 	unsigned int lock;


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23  8:59 [rfc][patch] queued spinlocks (i386) Nick Piggin
@ 2007-03-23  9:40 ` Eric Dumazet
  2007-03-23  9:59   ` Nick Piggin
  2007-03-23 19:27   ` Ravikiran G Thirumalai
  2007-03-23 10:04 ` Ingo Molnar
  1 sibling, 2 replies; 25+ messages in thread
From: Eric Dumazet @ 2007-03-23  9:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linux Kernel Mailing List, Ravikiran G Thirumalai, Ingo Molnar

On Fri, 23 Mar 2007 09:59:11 +0100
Nick Piggin <npiggin@suse.de> wrote:

> 
> Implement queued spinlocks for i386. This shouldn't increase the size of
> the spinlock structure, while still being able to handle 2^16 CPUs.
> 
> Not completely implemented in assembly yet, to make the algorithm a bit
> clearer.
> 
> The queued spinlock has 2 fields, a head and a tail, which are indexes
> into a FIFO of waiting CPUs. To take a spinlock, a CPU performs an
> "atomic_inc_return" on the head index, and keeps the returned value as
> a ticket. The CPU then spins until the tail index is equal to that
> ticket.
> 
> To unlock a spinlock, the tail index is incremented (this can be non
> atomic, because only the lock owner will modify tail).
> 
> Implementation inefficiencies aside, this change should have little
> effect on performance for uncontended locks, but will have quite a
> large cost for highly contended locks [O(N) cacheline transfers vs
> O(1) per lock acquisition, where N is the number of CPUs contending].
> The benefit is that contended locks will not cause any starvation.
> 
> Just an idea. Big NUMA hardware seems to have fairness logic that
> prevents starvation for the regular spinlock logic. But it might be
> interesting for the -rt kernel or systems with starvation issues.

It's a very nice idea, Nick.

You also have, for free, the number of CPUs that are ahead of you.

On big SMP/NUMA, we could use this information to call a special lock_cpu_relax() function to avoid cacheline transfers.

	asm volatile(LOCK_PREFIX "xaddw %0, %1\n\t"
		     : "+r" (pos), "+m" (lock->qhead) : : "memory");
	for (;;) {
		unsigned short nwait = pos - lock->qtail;
		if (likely(nwait == 0))
			break;
		lock_cpu_relax(lock, nwait);
	}

static inline void lock_cpu_relax(raw_spinlock_t *lock, unsigned int nwait)
{
	unsigned int cycles = nwait * lock->min_cycles_per_round;

	busy_loop(cycles);
}
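
(busy_loop() and min_cycles_per_round are hypothetical names here. As a
minimal sketch, busy_loop() on x86 could spin on the TSC for roughly the
given number of cycles, without ever touching the lock cacheline:)

	static inline unsigned long long read_tsc(void)
	{
		unsigned int lo, hi;

		asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
		return ((unsigned long long)hi << 32) | lo;
	}

	static inline void busy_loop(unsigned int cycles)
	{
		unsigned long long start = read_tsc();

		while (read_tsc() - start < cycles)
			cpu_relax();
	}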


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23  9:40 ` Eric Dumazet
@ 2007-03-23  9:59   ` Nick Piggin
  2007-03-23 19:27   ` Ravikiran G Thirumalai
  1 sibling, 0 replies; 25+ messages in thread
From: Nick Piggin @ 2007-03-23  9:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Linux Kernel Mailing List, Ravikiran G Thirumalai, Ingo Molnar

On Fri, Mar 23, 2007 at 10:40:17AM +0100, Eric Dumazet wrote:
> On Fri, 23 Mar 2007 09:59:11 +0100
> Nick Piggin <npiggin@suse.de> wrote:
> 
> > 
> > Implement queued spinlocks for i386. This shouldn't increase the size of
> > the spinlock structure, while still being able to handle 2^16 CPUs.
> > 
> > Not completely implemented in assembly yet, to make the algorithm a bit
> > clearer.
> > 
> > The queued spinlock has 2 fields, a head and a tail, which are indexes
> > into a FIFO of waiting CPUs. To take a spinlock, a CPU performs an
> > "atomic_inc_return" on the head index, and keeps the returned value as
> > a ticket. The CPU then spins until the tail index is equal to that
> > ticket.
> > 
> > To unlock a spinlock, the tail index is incremented (this can be non
> > atomic, because only the lock owner will modify tail).
> > 
> > Implementation inefficiencies aside, this change should have little
> > effect on performance for uncontended locks, but will have quite a
> > large cost for highly contended locks [O(N) cacheline transfers vs
> > O(1) per lock acquisition, where N is the number of CPUs contending].
> > The benefit is that contended locks will not cause any starvation.
> > 
> > Just an idea. Big NUMA hardware seems to have fairness logic that
> > prevents starvation for the regular spinlock logic. But it might be
> > interesting for the -rt kernel or systems with starvation issues.
> 
> It's a very nice idea, Nick.
> 
> You also have, for free, the number of CPUs that are ahead of you.
> 
> On big SMP/NUMA, we could use this information to call a special lock_cpu_relax() function to avoid cacheline transfers.
> 
> 	asm volatile(LOCK_PREFIX "xaddw %0, %1\n\t"
> 		     : "+r" (pos), "+m" (lock->qhead) : : "memory");
> 	for (;;) {
> 		unsigned short nwait = pos - lock->qtail;
> 		if (likely(nwait == 0))
> 			break;
> 		lock_cpu_relax(lock, nwait);
> 	}
> 
> static inline void lock_cpu_relax(raw_spinlock_t *lock, unsigned int nwait)
> {
> 	unsigned int cycles = nwait * lock->min_cycles_per_round;
> 
> 	busy_loop(cycles);
> }

Yeah, that's true; it would give you a lot more information for use in a
lock backoff scheme, which might offset the lower efficiency of queued
locks under contention.

You could perhaps even wait using monitor/mwait on the lock cacheline
if there are more than X CPUs in front of you. I'm not sure if the cost of
mwait is entirely within the core that executes it, or whether it slows
down cache coherency as well (which would make that idea a showstopper).
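
Something like the following sketch, perhaps (monitor/mwait are ring-0
instructions, the extension/hint arguments are simply zeroed here, and
the re-check after arming the monitor is needed so a wakeup isn't
missed; this is a guess at the shape, not tested code):

	static inline void mwait_for_ticket(raw_spinlock_t *lock,
					    unsigned short pos)
	{
		while (*(volatile unsigned short *)&lock->qtail != pos) {
			/* arm the monitor on the lock cacheline */
			asm volatile("monitor"
				     : : "a" (&lock->qtail), "c" (0), "d" (0));
			/* re-check: the releasing store may have landed
			   between the first test and the monitor */
			if (*(volatile unsigned short *)&lock->qtail == pos)
				break;
			asm volatile("mwait" : : "a" (0), "c" (0));
		}
	}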



* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23  8:59 [rfc][patch] queued spinlocks (i386) Nick Piggin
  2007-03-23  9:40 ` Eric Dumazet
@ 2007-03-23 10:04 ` Ingo Molnar
  2007-03-23 10:10   ` Nick Piggin
  2007-03-23 10:32   ` Nick Piggin
  1 sibling, 2 replies; 25+ messages in thread
From: Ingo Molnar @ 2007-03-23 10:04 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Linux Kernel Mailing List, Ravikiran G Thirumalai


* Nick Piggin <npiggin@suse.de> wrote:

> Implement queued spinlocks for i386. [...]

isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
but it might be a worry for the rest of the world ;-)

	Ingo


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:04 ` Ingo Molnar
@ 2007-03-23 10:10   ` Nick Piggin
  2007-03-23 16:48     ` Parag Warudkar
  2007-03-23 18:15     ` Davide Libenzi
  2007-03-23 10:32   ` Nick Piggin
  1 sibling, 2 replies; 25+ messages in thread
From: Nick Piggin @ 2007-03-23 10:10 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linux Kernel Mailing List, Ravikiran G Thirumalai

On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
> 
> * Nick Piggin <npiggin@suse.de> wrote:
> 
> > Implement queued spinlocks for i386. [...]
> 
> isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
> but it might be a worry for the rest of the world ;-)

I never thought a FIFO queue would be patentable.

Do you have a reference to the patent number?


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:04 ` Ingo Molnar
  2007-03-23 10:10   ` Nick Piggin
@ 2007-03-23 10:32   ` Nick Piggin
  2007-03-23 10:40     ` Eric Dumazet
                       ` (3 more replies)
  1 sibling, 4 replies; 25+ messages in thread
From: Nick Piggin @ 2007-03-23 10:32 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linux Kernel Mailing List, Ravikiran G Thirumalai

On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
> 
> * Nick Piggin <npiggin@suse.de> wrote:
> 
> > Implement queued spinlocks for i386. [...]
> 
> isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
> but it might be a worry for the rest of the world ;-)

Hmm, it looks like they have implemented a system where the spinning
cpu sleeps on a per-CPU variable rather than the lock itself, and
the releasing cpu writes to that variable to wake it.  They do this
so that spinners don't continually perform exclusive->shared
transitions on the lock cacheline. They call these things queued
spinlocks.  They don't seem to be very patent worthy either, but
maybe it is what you're thinking of?

I'm not as concerned about the contended performance of spinlocks
for Linux as MS seems to be for Windows (they seem to be very proud
of this lock), because if it is a big problem then IMO it is a bug.

This was just something I had in mind when the hardware lock
starvation issue came up, so I thought I should quickly code it up
and RFC...  actually it makes contended performance worse, but I'm
not too worried about that because I'm happy I was able to implement
it without increasing data size or number of locked operations.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:32   ` Nick Piggin
@ 2007-03-23 10:40     ` Eric Dumazet
  2007-03-23 11:02     ` William Lee Irwin III
                       ` (2 subsequent siblings)
  3 siblings, 0 replies; 25+ messages in thread
From: Eric Dumazet @ 2007-03-23 10:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Linux Kernel Mailing List, Ravikiran G Thirumalai

On Fri, 23 Mar 2007 11:32:44 +0100
Nick Piggin <npiggin@suse.de> wrote:

> On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
> > 
> > * Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > Implement queued spinlocks for i386. [...]
> > 
> > isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
> > but it might be a worry for the rest of the world ;-)
> 
> Hmm, it looks like they have implemented a system where the spinning
> cpu sleeps on a per-CPU variable rather than the lock itself, and
> the releasing cpu writes to that variable to wake it.  They do this
> so that spinners don't continually perform exclusive->shared
> transitions on the lock cacheline. They call these things queued
> spinlocks.  They don't seem to be very patent worthy either, but
> maybe it is what you're thinking of?
> 
> I'm not as concerned about the contended performance of spinlocks
> for Linux as MS seems to be for Windows (they seem to be very proud
> of this lock), because if it is a big problem then IMO it is a bug.
> 
> This was just something I had in mind when the hardware lock
> starvation issue came up, so I thought I should quickly code it up
> and RFC...  actually it makes contended performance worse, but I'm
> not too worried about that because I'm happy I was able to implement
> it without increasing data size or number of locked operations.

Sure, but please note that you should rename your patch to:

"Implement queued spinlocks for i486"

:)


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:32   ` Nick Piggin
  2007-03-23 10:40     ` Eric Dumazet
@ 2007-03-23 11:02     ` William Lee Irwin III
  2007-03-24 15:55     ` Nikita Danilov
  2007-03-24 21:41     ` Andrew Morton
  3 siblings, 0 replies; 25+ messages in thread
From: William Lee Irwin III @ 2007-03-23 11:02 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Linux Kernel Mailing List, Ravikiran G Thirumalai

On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
>> isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
>> but it might be a worry for the rest of the world ;-)

On Fri, Mar 23, 2007 at 11:32:44AM +0100, Nick Piggin wrote:
> Hmm, it looks like they have implemented a system where the spinning
> cpu sleeps on a per-CPU variable rather than the lock itself, and
> the releasing cpu writes to that variable to wake it.  They do this
> so that spinners don't continually perform exclusive->shared
> transitions on the lock cacheline. They call these things queued
> spinlocks.  They don't seem to be very patent worthy either, but
> maybe it is what you're thinking of?

Those exclusive-to-shared transitions are among the cacheline transfers
typically accounted to the algorithms in their complexity analyses.


-- wli


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:10   ` Nick Piggin
@ 2007-03-23 16:48     ` Parag Warudkar
  2007-03-23 18:15     ` Davide Libenzi
  1 sibling, 0 replies; 25+ messages in thread
From: Parag Warudkar @ 2007-03-23 16:48 UTC (permalink / raw)
  To: linux-kernel

Nick Piggin <npiggin <at> suse.de> writes:

> 
> On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
> > 
> > * Nick Piggin <npiggin <at> suse.de> wrote:
> > 
> > > Implement queued spinlocks for i386. [...]
> > 
> > isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
> > but it might be a worry for the rest of the world 
> 
> I never thought a FIFO queue would be patentable.
> 
> Do you have a reference to the patent number?
> 

The ReactOS guys nonetheless seem to have implemented queued spinlocks -
http://www.reactos.org/en/newsletter_10.html (see the HAL section).

Now I haven't looked at the code to see if they are different from what MS has
patented, but given that it's a clone of Windows, one would think that at
least from a behavioral POV they are similar?

Parag



* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:10   ` Nick Piggin
  2007-03-23 16:48     ` Parag Warudkar
@ 2007-03-23 18:15     ` Davide Libenzi
  1 sibling, 0 replies; 25+ messages in thread
From: Davide Libenzi @ 2007-03-23 18:15 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Ingo Molnar, Linux Kernel Mailing List

On Fri, 23 Mar 2007, Nick Piggin wrote:

> On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
> > 
> > * Nick Piggin <npiggin@suse.de> wrote:
> > 
> > > Implement queued spinlocks for i386. [...]
> > 
> > isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
> > but it might be a worry for the rest of the world ;-)
> 
> I never thought a FIFO queue would be patentable.
> 
> Do you have a reference to the patent number?

FWIW, MS patented even the preprocessor in computer languages ;)
Holding a patent does not make it automatically enforceable.
I remember there was a discussion some time ago about queued spinlocks on lkml.



- Davide




* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23  9:40 ` Eric Dumazet
  2007-03-23  9:59   ` Nick Piggin
@ 2007-03-23 19:27   ` Ravikiran G Thirumalai
  1 sibling, 0 replies; 25+ messages in thread
From: Ravikiran G Thirumalai @ 2007-03-23 19:27 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Nick Piggin, Linux Kernel Mailing List, Ingo Molnar

On Fri, Mar 23, 2007 at 10:40:17AM +0100, Eric Dumazet wrote:
> On Fri, 23 Mar 2007 09:59:11 +0100
> Nick Piggin <npiggin@suse.de> wrote:
> 
> > 
> > Implement queued spinlocks for i386. This shouldn't increase the size of
> > the spinlock structure, while still being able to handle 2^16 CPUs.
> > 
> > Not completely implemented in assembly yet, to make the algorithm a bit
> > clearer.
> > 
> > The queued spinlock has 2 fields, a head and a tail, which are indexes
> > into a FIFO of waiting CPUs. To take a spinlock, a CPU performs an
> > "atomic_inc_return" on the head index, and keeps the returned value as
> > a ticket. The CPU then spins until the tail index is equal to that
> > ticket.
> > 
> > To unlock a spinlock, the tail index is incremented (this can be non
> > atomic, because only the lock owner will modify tail).
> > 
> > Implementation inefficiencies aside, this change should have little
> > effect on performance for uncontended locks, but will have quite a
> > large cost for highly contended locks [O(N) cacheline transfers vs
> > O(1) per lock acquisition, where N is the number of CPUs contending].
> > The benefit is that contended locks will not cause any starvation.
> > 
> > Just an idea. Big NUMA hardware seems to have fairness logic that
> > prevents starvation for the regular spinlock logic. But it might be
> > interesting for the -rt kernel or systems with starvation issues.
> 
> It's a very nice idea, Nick.

Amen to that.

> 
> You also have, for free, the number of CPUs that are ahead of you.
> 
> On big SMP/NUMA, we could use this information to call a special lock_cpu_relax() function to avoid cacheline transfers.
> 
> 	asm volatile(LOCK_PREFIX "xaddw %0, %1\n\t"
> 		     : "+r" (pos), "+m" (lock->qhead) : : "memory");
> 	for (;;) {
> 		unsigned short nwait = pos - lock->qtail;
> 		if (likely(nwait == 0))
> 			break;
> 		lock_cpu_relax(lock, nwait);
> 	}
> 
> static inline void lock_cpu_relax(raw_spinlock_t *lock, unsigned int nwait)
> {
> 	unsigned int cycles = nwait * lock->min_cycles_per_round;
> 
> 	busy_loop(cycles);
> }

Good idea.  Hopefully, this should reduce the number of cacheline transfers
in the contended case.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:32   ` Nick Piggin
  2007-03-23 10:40     ` Eric Dumazet
  2007-03-23 11:02     ` William Lee Irwin III
@ 2007-03-24 15:55     ` Nikita Danilov
  2007-03-24 17:29       ` Ingo Molnar
  2007-03-24 21:41     ` Andrew Morton
  3 siblings, 1 reply; 25+ messages in thread
From: Nikita Danilov @ 2007-03-24 15:55 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Linux Kernel Mailing List, Ravikiran G Thirumalai

Nick Piggin writes:
 > On Fri, Mar 23, 2007 at 11:04:18AM +0100, Ingo Molnar wrote:
 > > 
 > > * Nick Piggin <npiggin@suse.de> wrote:
 > > 
 > > > Implement queued spinlocks for i386. [...]
 > > 
 > > isnt this patented by MS? (which might not worry you SuSE/Novell guys, 
 > > but it might be a worry for the rest of the world ;-)
 > 
 > Hmm, it looks like they have implemented a system where the spinning
 > cpu sleeps on a per-CPU variable rather than the lock itself, and
 > the releasing cpu writes to that variable to wake it.  They do this
 > so that spinners don't continually perform exclusive->shared
 > transitions on the lock cacheline. They call these things queued
 > spinlocks.  They don't seem to be very patent worthy either, but

Indeed, this technique is very well known. E.g.,
http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole
section (3. Local-spin Algorithms) on them, citing papers from 1990
onward.

Nikita.



* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-24 15:55     ` Nikita Danilov
@ 2007-03-24 17:29       ` Ingo Molnar
  2007-03-24 18:49         ` Nikita Danilov
  2007-03-28  6:43         ` Nick Piggin
  0 siblings, 2 replies; 25+ messages in thread
From: Ingo Molnar @ 2007-03-24 17:29 UTC (permalink / raw)
  To: Nikita Danilov
  Cc: Nick Piggin, Linux Kernel Mailing List, Ravikiran G Thirumalai


* Nikita Danilov <nikita@clusterfs.com> wrote:

> Indeed, this technique is very well known. E.g., 
> http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
> section (3. Local-spin Algorithms) on them, citing papers from
> 1990 onward.

that is a cool reference! So i'd suggest to do (redo?) the patch based 
on those concepts and that terminology and not use 'queued spinlocks' 
that are commonly associated with MS's stuff. And as a result the 
contended case would be optimized some more via local-spin algorithms. 
(which is not a key thing for us, but which would be nice to have 
nevertheless)

	Ingo


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-24 17:29       ` Ingo Molnar
@ 2007-03-24 18:49         ` Nikita Danilov
  2007-03-28  6:43         ` Nick Piggin
  1 sibling, 0 replies; 25+ messages in thread
From: Nikita Danilov @ 2007-03-24 18:49 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Nick Piggin, Linux Kernel Mailing List, Ravikiran G Thirumalai

Ingo Molnar writes:
 > 
 > * Nikita Danilov <nikita@clusterfs.com> wrote:
 > 
 > > Indeed, this technique is very well known. E.g., 
 > > http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
 > > section (3. Local-spin Algorithms) on them, citing papers from
 > > 1990 onward.
 > 
 > that is a cool reference! So i'd suggest to do (redo?) the patch based 
 > on those concepts and that terminology and not use 'queued spinlocks' 

There is an old version:

http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p06-locallock.patch
http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p07-locallock-bkl.patch
http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p08-locallock-zone.patch

http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p0b-atomic_dec_and_locallock.patch
http://namesys.com/pub/misc-patches/unsupported/extra/2004.02.04/p0c-locallock-dcache.patch

This version retains the original spin-lock interface (i.e., no additional
"queue link" pointer is passed to the locking function). As a result,
the lock data structure contains an array of NR_CPU counters, so it's only
suitable for global, statically allocated locks.
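
For reference, the rough shape of such an array-based local-spin lock is
below (a sketch only: __sync_fetch_and_add stands in for the kernel's
atomics, NR_CPUS is assumed to be a power of two so counter wrap-around
is harmless, memory barriers are omitted, and unlike the patches above
the slot index is returned to the caller rather than kept in the lock):

	struct local_spin_lock {
		struct {
			volatile int turn;	/* 1 = this slot may enter */
		} __attribute__((aligned(128))) slot[NR_CPUS];
		unsigned int next;		/* ticket dispenser */
	};
	/* initialise slot[0].turn to 1 and all other slots to 0 */

	static unsigned int local_spin_lock(struct local_spin_lock *l)
	{
		unsigned int me = __sync_fetch_and_add(&l->next, 1) % NR_CPUS;

		/* each waiter spins only on its own cacheline */
		while (!l->slot[me].turn)
			cpu_relax();
		l->slot[me].turn = 0;	/* re-arm this slot for the next lap */
		return me;		/* the owner remembers its slot */
	}

	static void local_spin_unlock(struct local_spin_lock *l, unsigned int me)
	{
		/* wake only the next waiter, on its private cacheline */
		l->slot[(me + 1) % NR_CPUS].turn = 1;
	}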

 > that are commonly associated with MS's stuff. And as a result the 
 > contended case would be optimized some more via local-spin algorithms. 
 > (which is not a key thing for us, but which would be nice to have 
 > nevertheless)
 > 
 > 	Ingo

Nikita.



* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-23 10:32   ` Nick Piggin
                       ` (2 preceding siblings ...)
  2007-03-24 15:55     ` Nikita Danilov
@ 2007-03-24 21:41     ` Andrew Morton
  2007-03-28  6:56       ` Nick Piggin
  3 siblings, 1 reply; 25+ messages in thread
From: Andrew Morton @ 2007-03-24 21:41 UTC (permalink / raw)
  To: Nick Piggin; +Cc: mingo, linux-kernel, kiran

> On Fri, 23 Mar 2007 11:32:44 +0100 Nick Piggin <npiggin@suse.de> wrote:
>
> I'm not as concerned about the contended performance of spinlocks
>

The contended case matters.  Back in 2.5.something I screwed up the debug
version of one of the locks (rwlock, iirc) - it was simply missing a
cpu_relax(), and some people's benchmarks halved.

> This was just something I had in mind when the hardware lock
> starvation issue came up

It looks like a good way to address the lru_lock starvation/capture
problem.  But I think I'd be more comfortable if we were to introduce it as
a new lock type, rather than as a reimplementation of the existing
spin_lock().   Initially, at least.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-24 17:29       ` Ingo Molnar
  2007-03-24 18:49         ` Nikita Danilov
@ 2007-03-28  6:43         ` Nick Piggin
  2007-03-28 19:26           ` Davide Libenzi
  1 sibling, 1 reply; 25+ messages in thread
From: Nick Piggin @ 2007-03-28  6:43 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Nikita Danilov, Linux Kernel Mailing List, Ravikiran G Thirumalai

On Sat, Mar 24, 2007 at 06:29:59PM +0100, Ingo Molnar wrote:
> 
> * Nikita Danilov <nikita@clusterfs.com> wrote:
> 
> > Indeed, this technique is very well known. E.g., 
> > http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
> > section (3. Local-spin Algorithms) on them, citing papers from
> > 1990 onward.
> 
> that is a cool reference! So i'd suggest to do (redo?) the patch based 
> on those concepts and that terminology and not use 'queued spinlocks' 
> that are commonly associated with MS's stuff. And as a result the 
> contended case would be optimized some more via local-spin algorithms. 
> (which is not a key thing for us, but which would be nice to have 
> nevertheless)

Firstly, the terminology in that paper _is_ "queue lock", which isn't
really surprising. I don't really know or care about what MS calls their
locks, but I'd suggest that their queued spinlock is probably named in
reference to its queueing property rather than its local spin property.

Mine is a queueing lock, so I'll continue to call it queued spinlocks
(not that the terminology will make it into our API anyway, because I
intend for them simply to be an implementation of spinlocks).

Secondly, as you say, local spin isn't really a must-have. If SGI hasn't
made a big stink about them by now, then I think queued locks (which
address a real hardware starvation issue on Opterons) are more important.

That isn't to say that local spin might not help performance, or that
my queued spinlocks would make it impossible to implement... it's just
that it isn't my aim.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-24 21:41     ` Andrew Morton
@ 2007-03-28  6:56       ` Nick Piggin
  0 siblings, 0 replies; 25+ messages in thread
From: Nick Piggin @ 2007-03-28  6:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: mingo, linux-kernel, kiran

On Sat, Mar 24, 2007 at 01:41:28PM -0800, Andrew Morton wrote:
> > On Fri, 23 Mar 2007 11:32:44 +0100 Nick Piggin <npiggin@suse.de> wrote:
> >
> > I'm not as concerned about the contended performance of spinlocks
> >
> 
> The contended case matters.  Back in 2.5.something I screwed up the debug
> version of one of the locks (rwlock, iirc) - it was simply missing a
> cpu_relax(), and some people's benchmarks halved.

Do you have a reference?

rwlocks are a bit funny, because if they are found to be useful (that
is, they get used somewhere), then it indicates there can be situations
with a lot of contention and spinning.

Whereas we usually prefer not to use spinlocks in situations like that.

Not that I'm claiming the contended case doesn't matter, but I think
problems there indicate a bug (and I think rwlocks are almost always
questionable).

Anyway, I'll look at doing some contended case optimisations afterward.
 

> > This was just something I had in mind when the hardware lock
> > starvation issue came up
> 
> It looks like a good way to address the lru_lock starvation/capture
> problem.  But I think I'd be more comfortable if we were to introduce it as
> a new lock type, rather than as a reimplementation of the existing
> spin_lock().   Initially, at least.

I'd hate to have a proliferation of lock types though. I think my
queued spinlock addresses a real hardware limitation of some systems.

In situations where contention isn't a problem, then queued locks won't
cause a slowdown. In situations where it is, starvation could also be
a problem.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-28  6:43         ` Nick Piggin
@ 2007-03-28 19:26           ` Davide Libenzi
  2007-03-28 22:00             ` Davide Libenzi
  2007-03-29  1:24             ` Nick Piggin
  0 siblings, 2 replies; 25+ messages in thread
From: Davide Libenzi @ 2007-03-28 19:26 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Wed, 28 Mar 2007, Nick Piggin wrote:

> On Sat, Mar 24, 2007 at 06:29:59PM +0100, Ingo Molnar wrote:
> > 
> > * Nikita Danilov <nikita@clusterfs.com> wrote:
> > 
> > > Indeed, this technique is very well known. E.g., 
> > > http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
> > > section (3. Local-spin Algorithms) on them, citing papers from
> > > 1990 onward.
> > 
> > that is a cool reference! So i'd suggest to do (redo?) the patch based 
> > on those concepts and that terminology and not use 'queued spinlocks' 
> > that are commonly associated with MS's stuff. And as a result the 
> > contended case would be optimized some more via local-spin algorithms. 
> > (which is not a key thing for us, but which would be nice to have 
> > nevertheless)
> 
> Firstly, the terminology in that paper _is_ "queue lock", which isn't
> really surprising. I don't really know or care about what MS calls their
> locks, but I'd suggest that their queued spinlock is probably named in
> reference to its queueing property rather than its local spin property.

The method you propose is otherwise called "Ticket Lock":

http://en.wikipedia.org/wiki/Ticket_lock
http://www.cs.rochester.edu/research/synchronization/pseudocode/ss.html#ticket




- Davide




* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-28 19:26           ` Davide Libenzi
@ 2007-03-28 22:00             ` Davide Libenzi
  2007-03-29  1:36               ` Nick Piggin
  2007-03-29  1:24             ` Nick Piggin
  1 sibling, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-28 22:00 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Wed, 28 Mar 2007, Davide Libenzi wrote:

> On Wed, 28 Mar 2007, Nick Piggin wrote:
> 
> > On Sat, Mar 24, 2007 at 06:29:59PM +0100, Ingo Molnar wrote:
> > > 
> > > * Nikita Danilov <nikita@clusterfs.com> wrote:
> > > 
> > > > Indeed, this technique is very well known. E.g., 
> > > > http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
> > > > section (3. Local-spin Algorithms) on them, citing papers from
> > > > 1990 onward.
> > > 
> > > that is a cool reference! So i'd suggest to do (redo?) the patch based 
> > > on those concepts and that terminology and not use 'queued spinlocks' 
> > > that are commonly associated with MS's stuff. And as a result the 
> > > contended case would be optimized some more via local-spin algorithms. 
> > > (which is not a key thing for us, but which would be nice to have 
> > > nevertheless)
> > 
> > Firstly, the terminology in that paper _is_ "queue lock", which isn't
> > really surprising. I don't really know or care about what MS calls their
> > locks, but I'd suggest that their queued spinlock is probably named in
> > reference to its queueing property rather than its local spin property.
> 
> The method you propose is otherwise called "Ticket Lock":
> 
> http://en.wikipedia.org/wiki/Ticket_lock
> http://www.cs.rochester.edu/research/synchronization/pseudocode/ss.html#ticket

The prior art for this work dates to 1991:

http://www.cs.rochester.edu/u/scott/papers/1991_TOCS_synch.pdf

So I would not worry too much about patents here. At least the W2K MS ones ;)
What I would worry about, though, is adding another class of locks. There's no
reason why ticket locks would perform worse than our spinlocks, in both the
contended and uncontended cases, AFAICS. And they have a nice FIFO
behaviour.



- Davide




* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-28 19:26           ` Davide Libenzi
  2007-03-28 22:00             ` Davide Libenzi
@ 2007-03-29  1:24             ` Nick Piggin
  1 sibling, 0 replies; 25+ messages in thread
From: Nick Piggin @ 2007-03-29  1:24 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Wed, Mar 28, 2007 at 12:26:57PM -0700, Davide Libenzi wrote:
> On Wed, 28 Mar 2007, Nick Piggin wrote:
> 
> > On Sat, Mar 24, 2007 at 06:29:59PM +0100, Ingo Molnar wrote:
> > > 
> > > * Nikita Danilov <nikita@clusterfs.com> wrote:
> > > 
> > > > Indeed, this technique is very well known. E.g., 
> > > > http://citeseer.ist.psu.edu/anderson01sharedmemory.html has a whole 
> > > > section (3. Local-spin Algorithms) on them, citing papers from
> > > > 1990 onward.
> > > 
> > > that is a cool reference! So i'd suggest to do (redo?) the patch based 
> > > on those concepts and that terminology and not use 'queued spinlocks' 
> > > that are commonly associated with MS's stuff. And as a result the 
> > > contended case would be optimized some more via local-spin algorithms. 
> > > (which is not a key thing for us, but which would be nice to have 
> > > nevertheless)
> > 
> > Firstly, the terminology in that paper _is_ "queue lock", which isn't
> > really surprising. I don't really know or care about what MS calls their
> > locks, but I'd suggest that their queued spinlock is probably named in
> > reference to its queueing property rather than its local spin property.
> 
> The method you propose is otherwise called "Ticket Lock":
> 
> http://en.wikipedia.org/wiki/Ticket_lock
> http://www.cs.rochester.edu/research/synchronization/pseudocode/ss.html#ticket

Yes, a ticket-based FIFO queue isn't new... I think we have a lot of
examples already in the kernel. Using them to implement queue locks
obviously isn't new either.

I don't think we'd have to worry about patents.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-28 22:00             ` Davide Libenzi
@ 2007-03-29  1:36               ` Nick Piggin
  2007-03-29  7:16                 ` Nick Piggin
  0 siblings, 1 reply; 25+ messages in thread
From: Nick Piggin @ 2007-03-29  1:36 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Wed, Mar 28, 2007 at 03:00:21PM -0700, Davide Libenzi wrote:
> On Wed, 28 Mar 2007, Davide Libenzi wrote:
> 
> > The method you propose is otherwise called "Ticket Lock":
> > 
> > http://en.wikipedia.org/wiki/Ticket_lock
> > http://www.cs.rochester.edu/research/synchronization/pseudocode/ss.html#ticket
> 
> The prior art for this work dates to 1991:
> 
> http://www.cs.rochester.edu/u/scott/papers/1991_TOCS_synch.pdf
> 
> So I would not worry too much about patents here. At least the W2K MS ones ;)
> What I would worry about, though, is adding another class of locks. There's no

No, as you see from my patch I just change the spinlock implementation
to a queued one. I agree it doesn't make sense to add a new type of lock.


> reason why ticket locks would perform worse than our spinlocks, in both the
> contended and uncontended cases, AFAICS. And they have a nice FIFO
> behaviour.

In most cases, no. For the uncontended case they should be about the
same. They have the same spinning behaviour. However there is a little
window where they might be a bit slower I think... actually perhaps I'm
wrong!

Currently if you have 4 CPUs spinning and the lock is released, all 4
CPU cachelines will be invalidated, then they will be loaded again, and
found to be positive, so they all try to atomic_dec_return the counter, each
one invalidating others' cachelines. 1 gets through.

With my queued locks, all 4 cachelines are invalidated and loaded, but
only one will be allowed to proceed, and there are 0 atomic operations
or stores of any kind.

So I take that back: our current spinlocks have a worse thundering herd
behaviour under contention than my queued ones. So I'll definitely
push the patch through.


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-29  1:36               ` Nick Piggin
@ 2007-03-29  7:16                 ` Nick Piggin
  2007-03-30  0:27                   ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Nick Piggin @ 2007-03-29  7:16 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Thu, Mar 29, 2007 at 03:36:52AM +0200, Nick Piggin wrote:
> In most cases, no. For the uncontended case they should be about the
> same. They have the same spinning behaviour. However there is a little
> window where they might be a bit slower I think... actually perhaps I'm
> wrong!
> 
> Currently if you have 4 CPUs spinning and the lock is released, all 4
> CPU cachelines will be invalidated, then they will be loaded again, and
> found to be positive, so they all try to atomic_dec_return the counter, each
> one invalidating others' cachelines. 1 gets through.
> 
> With my queued locks, all 4 cachelines are invalidated and loaded, but
> only one will be allowed to proceed, and there are 0 atomic operations
> or stores of any kind.
> 
> So I take that back: our current spinlocks have a worse thundering herd
> behaviour under contention than my queued ones. So I'll definitely
> push the patch through.

OK, it isn't a big difference, but a user-space test is showing a slight
(~2%) improvement in the contended case on a 16 core Opteron.

There is a case where the present spinlocks are almost twice as fast on
this machine (in terms of aggregate throughput), and that is when a lock
is taken right after it is released. This is because the same CPU will
often be able to retake the lock without transitioning the cache. This is
going to be a rare case for us, and would suggest suboptimal code anyway 
(ie. the lock should just be kept rather than dropped and retaken).

Actually, one situation where it comes up is when we drop and retake a
lock that needs_lockbreak. Of course, the queued lock behaviour is
desired in that case anyway.
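
(For reference, the drop-and-retake there is roughly this pattern, as in
cond_resched_lock(); need_lockbreak() names the break_lock test in
kernels of this era, and this is a sketch rather than the exact code:)

	if (need_lockbreak(lock)) {
		/* briefly release a held lock for a spinning waiter */
		spin_unlock(lock);
		cpu_relax();
		spin_lock(lock);
	}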

However single-thread performance is presently a bit down. OTOH, the
assembly generated by gcc looks like it could be improved upon (even by
me :P).

This is what I've got so far. Should work for i386 and x86_64. Any
enhancements or results from other CPUs would be interesting.

--
#define _GNU_SOURCE
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <pthread.h>
#include <sched.h>
#include <sys/time.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)
static inline void cpu_relax(void)
{
	asm volatile("rep ; nop" : : : "memory");
}

static unsigned long delta(struct timeval start, struct timeval end)
{
	return (end.tv_sec - start.tv_sec) * 1000000
		+ end.tv_usec - start.tv_usec;
}

static unsigned long lps;
static void calibrate_delay(void)
{
	struct timeval start, now;
	unsigned long i;
	i = 0;
	gettimeofday(&start, NULL);
	do {
		gettimeofday(&now, NULL);
		i++;
	} while (delta(start, now) < 1000000 * 2);
	lps = i / 2;
	printf("%lu lps\n", lps);
}

static void delay(unsigned long usec)
{
	struct timeval now;
	unsigned long loops = (lps*usec + 999999) / 1000000;
	unsigned long i;

	for (i = 0; i < loops; i++)
		gettimeofday(&now, NULL);
}

#define QUEUE_LOCKS
#ifdef QUEUE_LOCKS

typedef struct {
	unsigned short qhead;
	unsigned short qtail;
} spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t){ 0, 0 }

static inline void spin_lock(spinlock_t *lock)
{
	unsigned short pos = 1;

	asm volatile("lock ; xaddw %0, %1\n\t"
			: "+r" (pos), "+m" (lock->qhead) : : "memory");
	while (unlikely(pos != lock->qtail))
		cpu_relax();
}

static inline void spin_unlock(spinlock_t *lock)
{
	asm volatile("addw $1, %0\n\t"
			: "+m" (lock->qtail) : : "memory");
}
#else
typedef struct {
	unsigned int slock;
} spinlock_t;

#define SPIN_LOCK_UNLOCKED (spinlock_t){ 1 }

static inline void spin_lock(spinlock_t *lock)
{
	asm volatile(
		"1:\n\t"
		"lock ; decl %0\n\t"
		"jns 2f\n\t"
		"3:\n\t"
		"rep;nop\n\t"
		"cmpl $0,%0\n\t"
		"jle 3b\n\t"
		"jmp 1b\n\t"
		"2:\n\t" : "+m" (lock->slock) : : "memory");
}

static inline void spin_unlock(spinlock_t *lock)
{
	asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory");
}
#endif

static struct {
	spinlock_t lock;
	unsigned long count;
} __attribute__((__aligned__(128))) data;


#undef MULTI_THREAD
#ifdef MULTI_THREAD
#define NR_THREADS	16
#define NR_ITERS	200000
#else
#define NR_THREADS	1
#define NR_ITERS	1000000000
#endif

static volatile long go = 0;	/* volatile: threads spin on this before starting */

static void *thread(void *arg)
{
	cpu_set_t mask;
	struct sched_param param;
	long nr = (long)arg;
	long i;

	assert(nr < NR_THREADS);

	CPU_ZERO(&mask);
	CPU_SET(nr, &mask);

	if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
		perror("sched_setaffinity"), exit(1);

	param.sched_priority = 90;
	if (sched_setscheduler(0, SCHED_FIFO, &param) == -1)
		perror("sched_setscheduler"), exit(1);

	while (!go)
		sched_yield();

	for (i = 0; i < NR_ITERS; i++) {
		/*
		 * delay outside the lock should be non-zero to simulate
		 * reasonable code. The larger the delay inside the lock
		 * in relation to the delay outside, the more contention.
		 * For single-thread mode, delays aren't so critical.
		 */
		spin_lock(&data.lock);
		data.count++;
		spin_unlock(&data.lock);
//		delay(1);
	}

	return NULL;
}

int main(void)
{
	pthread_t threads[NR_THREADS];
	struct sched_param param;
	long i;

	param.sched_priority = 90;
	if (sched_setscheduler(0, SCHED_FIFO, &param) == -1)
		perror("sched_setscheduler"), exit(1);

 	data.lock = SPIN_LOCK_UNLOCKED;
	data.count = 0;

	calibrate_delay();

	for (i = 0; i < NR_THREADS; i++) {
		if (pthread_create(&threads[i], NULL, thread, (void *)i) != 0)
			perror("pthread_create"), exit(1);
	}

	go = 1;
	sched_yield();
	
	for (i = 0; i < NR_THREADS; i++) {
		if (pthread_join(threads[i], NULL) != 0)
			perror("pthread_join"), exit(1);
	}

	assert(data.count == NR_ITERS*NR_THREADS);

	exit(0);
}


* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-29  7:16                 ` Nick Piggin
@ 2007-03-30  0:27                   ` Davide Libenzi
  2007-03-30  1:59                     ` Nick Piggin
  0 siblings, 1 reply; 25+ messages in thread
From: Davide Libenzi @ 2007-03-30  0:27 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Thu, 29 Mar 2007, Nick Piggin wrote:

> On Thu, Mar 29, 2007 at 03:36:52AM +0200, Nick Piggin wrote:
> > In most cases, no. For the uncontended case they should be about the
> > same. They have the same spinning behaviour. However there is a little
> > window where they might be a bit slower I think... actually perhaps I'm
> > wrong!
> > 
> > Currently if you have 4 CPUs spinning and the lock is released, all 4
> > CPU cachelines will be invalidated, then they will be loaded again, and
> > found to be positive, so they all try to atomic_dec_return the counter, each
> > one invalidating others' cachelines. 1 gets through.
> > 
> > With my queued locks, all 4 cachelines are invalidated and loaded, but
> > only one will be allowed to proceed, and there are 0 atomic operations
> > or stores of any kind.
> > 
> > So I take that back: our current spinlocks have a worse thundering herd
> > behaviour under contention than my queued ones. So I'll definitely
> > push the patch through.
> 
> OK, it isn't a big difference, but a user-space test is showing a slight
> (~2%) improvement in the contended case on a 16 core Opteron.
> 
> There is a case where the present spinlocks are almost twice as fast on
> this machine (in terms of aggregate throughput), and that is when a lock
> is taken right after it is released. This is because the same CPU will
> often be able to retake the lock without transitioning the cache. This is
> going to be a rare case for us, and would suggest suboptimal code anyway 
> (ie. the lock should just be kept rather than dropped and retaken).
> 
> Actually, one situation where it comes up is when we drop and retake a
> lock that needs_lockbreak. Of course, the queued lock behaviour is
> desired in that case anyway.
> 
> However single-thread performance is presently a bit down. OTOH, the
> assembly generated by gcc looks like it could be improved upon (even by
> me :P).
> 
> This is what I've got so far. Should work for i386 and x86_64. Any
> enhancements or results from other CPUs would be interesting.

I slightly modified it to use cycles:

http://www.xmailserver.org/qspins.c

Here (Dual Opteron 252) queued locks (ticket locks) are about 10% slower in
both cases. This is really a microbench, and assembly matters a lot. I did
not have time to look at the generated code yet, but optimizing branches
can help in those cases.




- Davide




* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-30  0:27                   ` Davide Libenzi
@ 2007-03-30  1:59                     ` Nick Piggin
  2007-03-30  2:43                       ` Davide Libenzi
  0 siblings, 1 reply; 25+ messages in thread
From: Nick Piggin @ 2007-03-30  1:59 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Thu, Mar 29, 2007 at 05:27:24PM -0700, Davide Libenzi wrote:
> On Thu, 29 Mar 2007, Nick Piggin wrote:
> 
> > On Thu, Mar 29, 2007 at 03:36:52AM +0200, Nick Piggin wrote:
> > > In most cases, no. For the uncontended case they should be about the
> > > same. They have the same spinning behaviour. However there is a little
> > > window where they might be a bit slower I think... actually perhaps I'm
> > > wrong!
> > > 
> > > Currently if you have 4 CPUs spinning and the lock is released, all 4
> > > CPU cachelines will be invalidated, then they will be loaded again, and
> > > found to be positive, so they all try to atomic_dec_return the counter, each
> > > one invalidating others' cachelines. 1 gets through.
> > > 
> > > With my queued locks, all 4 cachelines are invalidated and loaded, but
> > > only one will be allowed to proceed, and there are 0 atomic operations
> > > or stores of any kind.
> > > 
> > > So I take that back: our current spinlocks have a worse thundering herd
> > > behaviour under contention than my queued ones. So I'll definitely
> > > push the patch through.
> > 
> > OK, it isn't a big difference, but a user-space test is showing a slight
> > (~2%) improvement in the contended case on a 16 core Opteron.
> > 
> > There is a case where the present spinlocks are almost twice as fast on
> > this machine (in terms of aggregate throughput), and that is when a lock
> > is taken right after it is released. This is because the same CPU will
> > often be able to retake the lock without transitioning the cache. This is
> > going to be a rare case for us, and would suggest suboptimal code anyway 
> > (ie. the lock should just be kept rather than dropped and retaken).
> > 
> > Actually, one situation where it comes up is when we drop and retake a
> > lock that needs_lockbreak. Of course, the queued lock behaviour is
> > desired in that case anyway.
> > 
> > However single-thread performance is presently a bit down. OTOH, the
> > assembly generated by gcc looks like it could be improved upon (even by
> > me :P).
> > 
> > This is what I've got so far. Should work for i386 and x86_64. Any
> > enhancements or results from other CPUs would be interesting.
> 
> I slightly modified it to use cycles:
> 
> http://www.xmailserver.org/qspins.c

Slightly more than slightly ;)

You want to have a delay _outside_ the critical section as well, for
multi-thread tests, otherwise the releasing CPU often just retakes
the lock (in the unqueued lock case). As I said, most kernel code
should _not_ be dropping and retaking locks.

> Here (Dual Opteron 252) queued locks (ticket locks) are about 10% slower in
> both cases. This is really a microbench, and assembly matters a lot. I did
> not have time to look at the generated code yet, but optimizing branches
> can help in those cases.



* Re: [rfc][patch] queued spinlocks (i386)
  2007-03-30  1:59                     ` Nick Piggin
@ 2007-03-30  2:43                       ` Davide Libenzi
  0 siblings, 0 replies; 25+ messages in thread
From: Davide Libenzi @ 2007-03-30  2:43 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Nikita Danilov, Linux Kernel Mailing List,
	Ravikiran G Thirumalai

On Fri, 30 Mar 2007, Nick Piggin wrote:

> > I slightly modified it to use cycles:
> > 
> > http://www.xmailserver.org/qspins.c
> 
> Slightly more than slightly ;)
> 
> You want to have a delay _outside_ the critical section as well, for
> multi-thread tests, otherwise the releasing CPU often just retakes
> the lock (in the unqueued lock case). As I said, most kernel code
> should _not_ be dropping and retaking locks.

Yeah. ATM it mostly does double-takes.



- Davide



