* [RFC] Hierarchical BackOff Locks
@ 2005-06-15 22:51 Christoph Lameter
  2005-06-22 15:45 ` Zoltan Menyhart
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Christoph Lameter @ 2005-06-15 22:51 UTC (permalink / raw)
  To: linux-ia64

Somehow this did not make it to linux-ia64 back in May.

Spinlocks cause a lot of contention if more than 4 processors are trying 
to acquire the lock. If a lock is under contention by more than 8 
processors then the majority of the time is spent waiting for spinlocks 
rather than doing productive work.

See http://www.sgi.com/products/software/linux/downloads/sync_linux.pdf

The contention could be limited by introducing a method for NUMA aware 
node backoff first proposed by Zoran Radovich called "HBO Locks". 
The patch here is just implementing the spinlock piece for ia64.

HBO locks can:

1. Detect the distance to the processor holding the lock.
2. Do a NUMA aware backoff so that local processors have a higher chance
   of getting the lock, avoiding excessive transfers across the NUMA link.
3. Prevent multiple processors from the same node from trying to acquire the
   same remote lock.
4. Avoid starvation by an "angry" mode that forces another node to give up
   its ownership of the lock.
5. Have a fast uncontended case that is competitive with regular 
spinlocks.
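
To make the idea concrete, here is a minimal stand-alone C sketch of the
core mechanism (illustrative only; this is not the ia64 patch below, and the
type, helper names and constants are made up for the example):

	/* Lock word: 0 = free, n = held by a CPU on node n-1. */
	typedef struct { volatile unsigned int lock; } hbo_lock_t;

	static void hbo_lock(hbo_lock_t *l, unsigned int my_node)
	{
		/* Try to store "my node + 1"; an old value of 0 means we got it. */
		while (__sync_val_compare_and_swap(&l->lock, 0, my_node + 1) != 0) {
			unsigned int holder = l->lock;	/* plain read, no exclusive line */
			/* Back off longer when the holder sits on a remote node, so
			 * node-local waiters get first crack at the cache line. */
			unsigned int spins = (holder && holder - 1 != my_node) ? 1000 : 20;

			while (l->lock != 0 && spins--)
				;	/* cpu_relax() in kernel code */
		}
	}

	static void hbo_unlock(hbo_lock_t *l)
	{
		__sync_lock_release(&l->lock);	/* store 0 with release semantics */
	}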

Performance graphs are included in the paper. Performance benefits start
at 8 processors and reach up to 3-fold for 60 processors.

Does something like this have any chance of making it into the kernel?

Index: linux-2.6.11/include/asm-ia64/spinlock.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/spinlock.h	2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/spinlock.h	2005-05-24 13:54:21.000000000 -0700
@@ -11,7 +11,8 @@
 
 #include <linux/compiler.h>
 #include <linux/kernel.h>
-
+#include <linux/config.h>
+#include <linux/cache.h>
 #include <asm/atomic.h>
 #include <asm/bitops.h>
 #include <asm/intrinsics.h>
@@ -24,10 +25,13 @@ typedef struct {
 #endif
 } spinlock_t;
 
+/* These definitions do not mean much. Lots of code for IA64 depends on
+ * zeroed memory containing unlocked spinlocks
+ */
 #define SPIN_LOCK_UNLOCKED			(spinlock_t) { 0 }
 #define spin_lock_init(x)			((x)->lock = 0)
 
-#ifdef ASM_SUPPORTED
+#if defined(ASM_SUPPORTED) & !defined(CONFIG_HBO_LOCKS)
 /*
  * Try to get the lock.  If we fail to get the lock, make a non-standard call to
  * ia64_spinlock_contention().  We do not use a normal call because that would force all
@@ -95,6 +99,35 @@ _raw_spin_lock_flags (spinlock_t *lock, 
 }
 #define _raw_spin_lock(lock) _raw_spin_lock_flags(lock, 0)
 #else /* !ASM_SUPPORTED */
+
+#ifdef CONFIG_HBO_LOCKS
+
+void hbo_spinlock_contention(__u32 *, unsigned long, unsigned long);
+void hbo_spinlock_wait_remote(__u32 *, unsigned long);
+
+struct node_lock_s {
+	void *spin_at;
+} ____cacheline_aligned;
+
+extern struct node_lock_s ndata[MAX_NUMNODES];
+
+#define _raw_spin_lock_flags(__x, __flags)						\
+do {											\
+	__u32 *ia64_spinlock_ptr = (__u32 *) (__x);					\
+	__u64 ia64_spinlock_val;							\
+							\
+	if (unlikely(ndata[numa_node_id()].spin_at == ia64_spinlock_ptr))		\
+		hbo_spinlock_wait_remote(ia64_spinlock_ptr, __flags);			\
+							\
+	ia64_spinlock_val = ia64_cmpxchg4_acq(ia64_spinlock_ptr, numa_node_id()+ 1, 0);	\
+							\
+	if (unlikely(ia64_spinlock_val))						\
+		hbo_spinlock_contention(ia64_spinlock_ptr, ia64_spinlock_val, __flags);	\
+} while (0)
+
+#define _raw_spin_lock(lock) _raw_spin_lock_flags(lock, 0)
+
+#else
 #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
 # define _raw_spin_lock(x)								\
 do {											\
@@ -109,11 +142,16 @@ do {											\
 		} while (ia64_spinlock_val);						\
 	}										\
 } while (0)
+#endif
 #endif /* !ASM_SUPPORTED */
 
 #define spin_is_locked(x)	((x)->lock != 0)
 #define _raw_spin_unlock(x)	do { barrier(); ((spinlock_t *) x)->lock = 0; } while (0)
+#ifdef CONFIG_HBO_LOCKS
+#define _raw_spin_trylock(x)	(cmpxchg_acq(&(x)->lock, 0, numa_node_id() + 1) == 0)
+#else
 #define _raw_spin_trylock(x)	(cmpxchg_acq(&(x)->lock, 0, 1) == 0)
+#endif
 #define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
 
 typedef struct {
Index: linux-2.6.11/arch/ia64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/ia64/Kconfig	2005-05-24 13:40:29.000000000 -0700
+++ linux-2.6.11/arch/ia64/Kconfig	2005-05-24 13:40:46.000000000 -0700
@@ -296,6 +296,19 @@ config PREEMPT
           Say Y here if you are building a kernel for a desktop, embedded
           or real-time system.  Say N if you are unsure.
 
+config HBO_LOCKS
+	bool "HBO GT SD Locks"
+	help
+	depends on (SMP && NUMA)
+	default y
+	help
+	  HBO locks reduces contention for spinlocks by saving the node id when the
+	  lock is taken. The cpu waiting on the spinlock can then select an appropriate
+	  backoff algorithm depending on the distance to the node. This also insures that
+	  it is highly likely that another cpu on the same node gets to a spinlock before
+	  remote nodes to avoid too much cacheline bouncing. A node may "get angry" in
+	  order to avoid starvation and stop local nodes from getting the lock.
+
 config HAVE_DEC_LOCK
 	bool
 	depends on (SMP || PREEMPT)
Index: linux-2.6.11/arch/ia64/mm/numa.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/mm/numa.c	2005-03-01 23:37:52.000000000 -0800
+++ linux-2.6.11/arch/ia64/mm/numa.c	2005-05-24 13:40:46.000000000 -0700
@@ -17,9 +17,11 @@
 #include <linux/node.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
+#include <linux/proc_fs.h>
+#include <linux/nodemask.h>
 #include <asm/mmzone.h>
 #include <asm/numa.h>
-
+#include <asm/uaccess.h>
 
 /*
  * The following structures are usually initialized by ACPI or
@@ -47,3 +49,296 @@ paddr_to_nid(unsigned long paddr)
 
 	return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
 }
+
+#ifdef CONFIG_HBO_LOCKS
+
+struct node_lock_s ndata[MAX_NUMNODES];
+
+/* Counters and statistics available via /proc/hbo */
+struct hbo_c_s {
+	unsigned long contention;
+	unsigned long local_block;
+	unsigned long remote_block;
+	unsigned long remote_lock;
+	unsigned long retry;
+};
+
+DEFINE_PER_CPU(struct hbo_c_s, counters);
+
+#define INC_HC(member) __get_cpu_var(counters).member++
+#define DEC_HC(member) __get_cpu_var(counters).member--
+
+static inline void maybe_enable_irq(unsigned long flags)
+{
+	if (flags & IA64_PSR_I_BIT)
+	 	local_irq_enable();
+}
+
+static inline void maybe_disable_irq(unsigned long flags)
+{
+	if (flags & IA64_PSR_I_BIT)
+		local_irq_disable();
+}
+
+/*
+ * Number of retries until we get so angry that we block the locks from the
+ * node they originate from
+ */
+static unsigned int anger_limit = 50;
+static unsigned int backoff_factor = 2*8;
+/*
+ * Backoff parameters for node local locks
+ */
+static unsigned int local_backoff = 20;
+static unsigned int local_cap = 20;
+
+/*
+ * Backoff parameters for the case that a lock is held
+ * by a processor on a remote node
+ */
+static unsigned int remote_backoff = 1000;
+static unsigned int remote_cap = 200000;
+
+void hbo_spinlock_contention(__u32 *lock, unsigned long lockval, unsigned long flags)
+{
+	INC_HC(contention);
+	do {
+		unsigned int backoff = local_backoff;
+		unsigned int cap = local_cap;
+		unsigned int remote = 0;
+		unsigned int anger_level = 0;
+		unsigned int stopped_nodes = 0;
+
+		maybe_enable_irq(flags);
+
+		if (unlikely(lockval-1 != numa_node_id())) {
+			/* remote backoff */
+			backoff = remote_backoff;
+			cap = remote_cap;
+			/*
+			 * Make sure that no other cpu of this node tries
+			 * to acquire the same remote lock.
+			 */
+			ndata[numa_node_id()].spin_at = lock;
+			remote = 1;
+			INC_HC(remote_lock);
+		}
+
+		for(;;) {
+			int i;
+
+			for(i = 0; i < backoff; i++)
+				cpu_relax();
+
+			/* Increase backoff for next cycle */
+			backoff = min(((backoff * backoff_factor) << 8), cap);
+
+			ia64_barrier();
+
+			/* No cmpxchg so that we will not get an exclusive cache line */
+			lockval = *lock;
+
+			if (likely(lockval ==0))
+				break;
+
+			if (remote) {
+				anger_level++;
+				if (anger_level == anger_limit) {
+					INC_HC(remote_block);
+					/*
+					 * Block any remote threads that attempt
+					 * to get the lock and make them spin locally
+					 * on that node.
+					 */
+					ndata[lockval-1].spin_at = lock;
+					stopped_nodes++;
+					/*
+					 * Use local backoff instead of remote
+					 * to have more of a chance to get the lock
+					 */
+					backoff = local_backoff;
+					cap = local_backoff;
+					anger_level = 0;
+				}
+			}
+
+		};
+
+		maybe_disable_irq(flags);
+		INC_HC(retry);
+		/* Lock was unlocked. Now use cmpxchg acquiring an exclusive cache line */
+		lockval = ia64_cmpxchg4_acq(lock, numa_node_id()+1, 0);
+
+		/* Release eventually spinning other cpus on this node since either
+		 * the lock has been acquired by this cpu or the node holding the lock
+		 * may have changed.
+		 */
+		if (unlikely(remote)) {
+			/* Remove off-node block */
+			ndata[numa_node_id()].spin_at = NULL;
+			if (stopped_nodes) {
+				int i;
+				/*
+				 * Remove any remote blocks we established when we
+				 * got angry
+				 */
+				for_each_online_node(i)
+					cmpxchg(&(ndata[i].spin_at), lock, NULL);
+			}
+		}
+
+        } while (likely(lockval));
+}
+
+void hbo_spinlock_wait_remote(__u32 *lock, unsigned long flags)
+{
+	INC_HC(local_block);
+	maybe_enable_irq(flags);
+	while (ndata[numa_node_id()].spin_at == lock)
+		cpu_relax();
+	maybe_disable_irq(flags);
+}
+
+static int hbo_read_proc(char *page, char **start, off_t off,
+	int count, int *eof, void *data)
+{
+	int cpu = (long)data;
+	struct hbo_c_s summary, *x;
+	int n;
+	int len;
+
+	if (cpu >=0)
+		x = &per_cpu(counters, cpu);
+	else {
+		memcpy(&summary, &per_cpu(counters, 0), sizeof(summary));
+		for(n = 1; n < num_possible_cpus(); n++) {
+                        struct hbo_c_s *c = &per_cpu(counters, n);
+
+			summary.contention += c->contention;
+			summary.local_block += c->local_block;
+			summary.remote_block += c->remote_block;
+			summary.remote_lock += c->remote_lock;
+			summary.retry += c->retry;
+		}
+		x = &summary;
+	}
+
+	len = 0;
+	len += sprintf(page+len, "Contentions      : %lu\n",x->contention);
+	len += sprintf(page+len, "Remote Lock      : %lu\n",x->remote_lock);
+	len += sprintf(page+len, "Retry            : %lu\n",x->retry);
+	len += sprintf(page+len, "Local block      : %lu\n",x->local_block);
+	len += sprintf(page+len, "Remote block     : %lu\n",x->remote_block);
+
+	if (len <= off+count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len>count)
+		len = count;
+	if (len<0)
+		len = 0;
+        return len;
+}
+
+static int hbo_reset_write(struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	int i;
+
+	for (i = 0; i < num_possible_cpus(); i++) {
+		struct hbo_c_s *c = &per_cpu(counters, i);
+		memset(c, 0, sizeof(struct hbo_c_s));
+	}
+	return count;
+}
+
+static int hbo_write_int(struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	unsigned int * v = data;
+	char c[11];
+
+	if (count < 1)
+		return -EINVAL;
+
+	if (copy_from_user(c, buffer, 10))
+		return -EFAULT;
+
+	*v = simple_strtoul(buffer,NULL, 10);
+	return count;
+}
+
+static int hbo_read_int(char *page, char **start, off_t off, int count,
+			int *eof, void *data)
+{
+	unsigned int *v = data;
+	int len;
+
+	len = sprintf(page, "%u\n",*v);
+	if (len <= off+count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len>count)
+		len = count;
+	if (len<0)
+		len = 0;
+	return len;
+}
+
+static __init int init_hbolock(void)
+{
+	int i;
+	struct proc_dir_entry *proc_hbo, *hbo_reset, *x;
+
+	proc_hbo = proc_mkdir("hbo",NULL);
+	hbo_reset = create_proc_entry("reset", S_IWUGO, proc_hbo);
+	hbo_reset->write_proc = hbo_reset_write;
+
+	x = create_proc_entry("all", S_IRUGO, proc_hbo);
+	x->read_proc = &hbo_read_proc;
+	x->data = (void *)-1;
+
+	/* ints */
+	x = create_proc_entry("local_backoff", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &local_backoff;
+	x = create_proc_entry("local_cap", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &local_cap;
+	x = create_proc_entry("remote_backoff", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &remote_backoff;
+	x = create_proc_entry("remote_cap", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &remote_cap;
+	x = create_proc_entry("backoff_factor", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &backoff_factor;
+	x = create_proc_entry("anger_limit", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &anger_limit;
+
+	for(i = 0; i < num_possible_cpus(); i++) {
+		char name[20];
+		struct proc_dir_entry *p;
+
+		sprintf(name, "%d", i);
+		p = create_proc_entry(name, S_IRUGO, proc_hbo);
+
+		p->read_proc = hbo_read_proc;
+		p->data = (void *)(unsigned long)i;
+	}
+	return 0;
+}
+
+__initcall(init_hbolock);
+#endif
+


* Re: [RFC] Hierarchical BackOff Locks
  2005-06-15 22:51 [RFC] Hierarchical BackOff Locks Christoph Lameter
@ 2005-06-22 15:45 ` Zoltan Menyhart
  2005-06-24 13:08 ` Christoph Lameter
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Zoltan Menyhart @ 2005-06-22 15:45 UTC (permalink / raw)
  To: linux-ia64

Christoph,

I like your idea.
I'm not sure that I understand the implementation correctly.

I have not got any big machine at hand yet, but as the architectures
will not become any simpler in the future, you are right, we should do
something.

My first problem in "_raw_spin_lock_flags()" is:

Assuming the lock is taken by a CPU in node X.
Assuming all the CPUs in the node Y call "_raw_spin_lock_flags()".
They can proceed only with some 100 nanosec. one after the other.
All of them find ".spin_at" not equal to the lock address.
As the lock is busy, all of them go to spin in "hbo_spinlock_contention()".
Only one of them should spin there, the others should go to spin in
"hbo_spinlock_wait_remote()", should not they ?
I am not sure if the algorithm works correctly if more than one CPU enters
"hbo_spinlock_contention()".

in "hbo_spinlock_contention()":

Have you got just a single global "ndata[]", common for all the spin locks?

> +	/* Increase backoff for next cycle */
> +	backoff = min(((backoff * backoff_factor) << 8), cap);

With the values of "cap" and "back off" you gave, it always results in "cap".
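
Working through the patch defaults as a check (backoff_factor = 2*8 = 16):

	local:  (20   * 16) << 8 = 81920    -> min(81920, 20)       = 20     = local_cap
	remote: (1000 * 16) << 8 = 4096000  -> min(4096000, 200000) = 200000 = remote_cap

so the very first recalculation already clamps "backoff" to "cap".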

BTW should not the local vs. remote values be calculated from some
ACPI table values ?

> +	if (unlikely(remote)) {
> +		/* Remove off-node block */
> +		ndata[numa_node_id()].spin_at = NULL;
> +		if (stopped_nodes) {
> +			int i;
> +			/*
> +			 * Remove any remote blocks we established when we
> +			 * got angry
> +			 */
> +			for_each_online_node(i)
> +				cmpxchg(&(ndata[i].spin_at), lock, NULL);

I guess we should clean the ".spin_at" on the stopped nodes only if
we really have got the lock.

You may have up to 256 nodes. I think we should not touch in vain the
".spin_at"-s for the nodes which are not waiting for the lock.

Why to use "cmpxchg()" ?

Some more verbose comments will be appreciated :-)
(in spinlock.h, at the function headers: what, why, ...)

> See http://www.sgi.com/products/software/linux/downloads/sync_linux.pdf

What benchmark and what measurement tools do you use, e.g. for creating
"Illustration 3 Average Fault Time..." ?

> +/* These definitions do not mean much. Lots of code for IA64 depends on
> + * zeroed memory containing unlocked spinlocks
> + */
>  #define SPIN_LOCK_UNLOCKED			(spinlock_t) { 0 }
>  #define spin_lock_init(x)			((x)->lock = 0)

I am for using them everywhere, instead of forgetting about them :-)

> +extern struct node_lock_s ndata[MAX_NUMNODES];

Can we have a more speaking name, not just "ndata" ?

> +
> +#define _raw_spin_lock_flags(__x, __flags)						\
> +do {											\
> +	__u32 *ia64_spinlock_ptr = (__u32 *) (__x);					\

I guess it would be more readable to take the address of the field "->lock".
Please take advantage of the type checking mechanisms.
A lock word is a volatile...

	volatile __u32 *ia64_spinlock_ptr = &__x->lock;

Please consider the other similar lock references, too.

> +	if (unlikely(ndata[numa_node_id()].spin_at == ia64_spinlock_ptr))		\

It will be expanded into *some* nested arrays:

if (!!(ndata[((int)(cpu_to_node_map[(ia64_getreg(_IA64_REG_TP)->cpu)]))].spin_at == ia64_spinlock_ptr))

Can we consider instead of "cpu_to_node_map[(ia64_getreg(_IA64_REG_TP)->cpu)]"
perhaps some "per_cpu__my_slock_idx" ?

> +	ia64_spinlock_val = ia64_cmpxchg4_acq(ia64_spinlock_ptr, numa_node_id()+ 1, 0);	\

Instead of "numa_node_id()+ 1" perhaps some "per_cpu__my_slock_id" ?

As we have got 32 bits, perhaps we could include both the node ID and the CPU number
into the busy lock word format. It could help to find out, on a dead-locked system,
who has locked what :-)
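
One possible encoding, as an illustrative sketch only (the macro names and
field widths below are invented, nothing like this is in the patch):

	/* Pack "node + 1" and the cpu number into the 32-bit busy value;
	 * 0 still means unlocked. */
	#define HBO_NODE_SHIFT			16
	#define HBO_MAKE_LOCKVAL(node, cpu)	((((node) + 1) << HBO_NODE_SHIFT) | (cpu))
	#define HBO_LOCKVAL_NODE(v)		(((v) >> HBO_NODE_SHIFT) - 1)
	#define HBO_LOCKVAL_CPU(v)		((v) & ((1 << HBO_NODE_SHIFT) - 1))

A per-cpu variable initialized at boot to
HBO_MAKE_LOCKVAL(numa_node_id(), smp_processor_id()) could then also serve as
the "per_cpu__my_slock_id" suggested above and would avoid the
cpu_to_node_map lookup in the fast path.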

Regards,

Zoltan Menyhart



* Re: [RFC] Hierarchical BackOff Locks
  2005-06-15 22:51 [RFC] Hierarchical BackOff Locks Christoph Lameter
  2005-06-22 15:45 ` Zoltan Menyhart
@ 2005-06-24 13:08 ` Christoph Lameter
  2005-06-24 17:53 ` Christoph Lameter
  2005-06-27 15:30 ` Zoltan Menyhart
  3 siblings, 0 replies; 6+ messages in thread
From: Christoph Lameter @ 2005-06-24 13:08 UTC (permalink / raw)
  To: linux-ia64

On Wed, 22 Jun 2005, Zoltan Menyhart wrote:

> My first problem in "_raw_spin_lock_flags()" is:
> 
> Assuming the lock is taken by a CPU in node X.
> Assuming all the CPUs in the node Y call "_raw_spin_lock_flags()".
> They can proceed only with some 100 nanosec. one after the other.
> All of them find ".spin_at" not equal to the lock address.
> As the lock is busy, all of them go to spin in "hbo_spinlock_contention()".
> Only one of them should spin there, the others should go to spin in
> "hbo_spinlock_wait_remote()", should not they ?

Ideally yes, but that is not necessary for the correctness of the
algorithm.

> I am not sure if the algorithm works correctly if more than one CPU enters
> "hbo_spinlock_contention()".

Then multiple attempts to acquire the same remote lock may occur. This 
will reduce performance.

> > +	/* Increase backoff for next cycle */
> > +	backoff = min(((backoff * backoff_factor) << 8), cap);
> 
> With the values of "cap" and "back off" you gave, it always results in "cap".

We should have  >> instead of <<.
 
> BTW should not the local vs. remote values be calculated from some
> ACPI table values ?

Yes, consulting the SLIT table would be ideal.

> > +			for_each_online_node(i)
> > +				cmpxchg(&(ndata[i].spin_at), lock, NULL);
> 
> I guess we should clean the ".spin_at" on the stopped nodes only if
> we really have got the lock.

No, we need to remove the blocks for any nodes that we stopped even if we did
not get the lock; otherwise we may cause other processes to be stuck waiting
in a "spin_at" loop.

> You may have up to 256 nodes. I think we should not touch in vain the
> ".spin_at"-s for the nodes which are not waiting for the lock.
> 
> Why to use "cmpxchg()" ?

Another cpu may have overwritten the spin_at value, and it's not good to
reset the spin_at value to NULL if it's in use by another node.

> > See http://www.sgi.com/products/software/linux/downloads/sync_linux.pdf
> 
> What benchmark and what measurement tools do you use, e.g. for creating
> "Illustration 3 Average Fault Time..." ?

See the performance counter patch posted to linux-ia64.

> > +/* These definitions do not mean much. Lots of code for IA64 depends on
> > + * zeroed memory containing unlocked spinlocks
> > + */
> >  #define SPIN_LOCK_UNLOCKED			(spinlock_t) { 0 }
> >  #define spin_lock_init(x)			((x)->lock = 0)
> 
> I am for using them everywhere, instead of forgetting about them :-)

Agreed. But the reality is that IA64 arch specific code assumes zeroed 
memory contains unlocked spinlocks.

> > +extern struct node_lock_s ndata[MAX_NUMNODES];
> 
> Can we have a more speaking name, not just "ndata" ?

Ok. Changed to node_lock_info.

> > +
> > +#define _raw_spin_lock_flags(__x, __flags)						\
> > +do {											\
> > +	__u32 *ia64_spinlock_ptr = (__u32 *) (__x);					\
> 
> I guess it would be more readable to take the address of the field "->lock".
> Please take advantage of the type checking mechanisms.
> A lock word is a volatile...

Ok. Done.

> 
> 	volatile __u32 *ia64_spinlock_ptr = &__x->lock;
> 
> Please consider the other similar lock references, too.

Cannot find any other references like that.

> 
> > +	if (unlikely(ndata[numa_node_id()].spin_at == ia64_spinlock_ptr))		\
> 
> It will be expanded in to *some* nested arrays:
> 
> if (!!(ndata[((int)(cpu_to_node_map[(ia64_getreg(_IA64_REG_TP)->cpu)]))].spin_at == ia64_spinlock_ptr))
> 
> Can we consider instead of "cpu_to_node_map[(ia64_getreg(_IA64_REG_TP)->cpu)]"
> perhaps some "per_cpu__my_slock_idx" ?

Yes, that may be a good optimization.



* Re: [RFC] Hierarchical BackOff Locks
  2005-06-15 22:51 [RFC] Hierarchical BackOff Locks Christoph Lameter
  2005-06-22 15:45 ` Zoltan Menyhart
  2005-06-24 13:08 ` Christoph Lameter
@ 2005-06-24 17:53 ` Christoph Lameter
  2005-06-27 15:30 ` Zoltan Menyhart
  3 siblings, 0 replies; 6+ messages in thread
From: Christoph Lameter @ 2005-06-24 17:53 UTC (permalink / raw)
  To: linux-ia64

On Wed, 22 Jun 2005, Zoltan Menyhart wrote:

> Can we consider instead of "cpu_to_node_map[(ia64_getreg(_IA64_REG_TP)->cpu)]"
> perhaps some "per_cpu__my_slock_idx" ?
> 
> > +	ia64_spinlock_val = ia64_cmpxchg4_acq(ia64_spinlock_ptr, numa_node_id()+ 1, 0);	\
> 
> Instead of "numa_node_id()+ 1" perhaps some "per_cpu__my_slock_id" ?

I just tried to implement your proposed method, but we run into issues
with preemption. I would have to add preempt disable/enable or irq
disable/enable inline as well as in the contention handlers.



* Re: [RFC] Hierarchical BackOff Locks
  2005-06-15 22:51 [RFC] Hierarchical BackOff Locks Christoph Lameter
                   ` (2 preceding siblings ...)
  2005-06-24 17:53 ` Christoph Lameter
@ 2005-06-27 15:30 ` Zoltan Menyhart
  3 siblings, 0 replies; 6+ messages in thread
From: Zoltan Menyhart @ 2005-06-27 15:30 UTC (permalink / raw)
  To: linux-ia64

Christoph Lameter wrote:

>>Can we consider instead of "cpu_to_node_map[(ia64_getreg(_IA64_REG_TP)->cpu)]"
>>perhaps some "per_cpu__my_slock_idx" ?
>>
>>
>>>+	ia64_spinlock_val = ia64_cmpxchg4_acq(ia64_spinlock_ptr, numa_node_id()+ 1, 0);	\
>>
>>Instead of "numa_node_id()+ 1" perhaps some "per_cpu__my_slock_id" ?
> 
> 
> I just tried to implement your proposed method, but we run into issues
> with preemption. I would have to add preempt disable/enable or irq
> disable/enable inline as well as in the contention handlers.

We do have the appropriate preempt disable/enable or irq disable/enable;
see "_spin_lock_irq()", "_spin_lock()", etc.
Otherwise even the NUMA node ID wouldn't be stable.
This is why we could include the CPU ID, too, into the lock word, for
dead-lock hunting.
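
For reference, the generic 2.6-era wrappers look roughly like this
(simplified; the real kernel/spinlock.c code has debug and lock-section
details that are omitted here):

	void __lockfunc _spin_lock(spinlock_t *lock)
	{
		preempt_disable();
		_raw_spin_lock(lock);
	}

	unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
	{
		unsigned long flags;

		local_irq_save(flags);
		preempt_disable();
		_raw_spin_lock_flags(lock, flags);
		return flags;
	}

so preemption (and, for the irq variants, interrupts) is already off by the
time "_raw_spin_lock_flags()" and the HBO contention handlers run.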

>>Assuming the lock is taken by a CPU in node X.
>>Assuming all the CPUs in the node Y call "_raw_spin_lock_flags()".
>>They can proceed only with some 100 nanosec. one after the other.
>>All of them find ".spin_at" not equal to the lock address.
>>As the lock is busy, all of them go to spin in "hbo_spinlock_contention()".
>>Only one of them should spin there, the others should go to spin in
>>"hbo_spinlock_wait_remote()", should not they ?
> 
> 
> Ideally yes but that is not necessary for the correctness of the 
> algorithm.

I'm afraid the hierarchical back-off algorithm will not work as intended: you
will not take advantage of having just a single task looping on the remote
lock word while the others in the same node loop on a local cache line.
In addition to the scenario above:

		if (unlikely(remote)) {
			/* Remove off-node block */
			ndata[numa_node_id()].spin_at = NULL;

Here, you release all the other CPUs waiting for the lock on the
current node, regardless of whether the lock is really yours or not.
Assuming that "anger_level < anger_limit", there is not much
chance to get a remote lock before the others.
The other CPUs waiting for the lock on the current node, most probably
all of them, will enter "hbo_spinlock_contention()" almost at the same time.

I'm afraid the "hierarchical back-off"-ness of the algorithm will be lost.

Should not there be some node-local atomic operation to make sure only
one CPU goes out through the NUMA interconnect?
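
A sketch of what such a gate could look like (illustrative only; the extra
field and its handling are invented here, and every exit path of the
contention handler would have to drop the gate again):

	struct node_lock_s {
		void *spin_at;
		void *chasing;	/* lock this node is already spinning on remotely */
	} ____cacheline_aligned;

	/* In the remote-contention path, before touching the remote line: */
	if (cmpxchg(&node_lock_info[numa_node_id()].chasing, NULL, (void *)lock) != NULL) {
		/* Another CPU of this node is already spinning across the
		 * interconnect; wait locally instead. */
		hbo_spinlock_wait_remote(lock, flags);
	} else {
		/* ... existing remote backoff loop ... */
		node_lock_info[numa_node_id()].chasing = NULL;	/* drop the gate */
	}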

One more thing about:

		ndata[numa_node_id()].spin_at = NULL;

If I have acquired the lock =>
	why wake them (all of them) up (the lock is busy)?
If it is someone else =>
	The other CPUs in the current node can loop at ".spin_at"-s

>>I am not sure if the algorithm works correctly if more than 1 CPUs enter
>>in "hbo_spinlock_contention()".
> 
> 
> Then multiple attempts to acquire the same remote lock may occur. This 
> will reduce performance.

Can you estimate how many remote reads / writes / invalidates / atomic ops,
etc. a change in the lock ownership would cost?

E.g. for the traditional spin lock:
- Waiting on a busy lock does not cost a penny: we loop on a shared
  line in the cache.
- Setting free costs obtaining the exclusivity: killing the other copies
  of the cache line at the N waiting CPUs - very expensive.
- The N CPUs re-read the cache line, it costs almost N remote reads.
  (Assuming cache intervention, no real memory read / write)
- One of the CPUs manages to obtain the exclusivity: kills the other copies
  of the cache line at the N-1 CPUs - very expensive.
- Some not too expensive internal delay due to the atomic operation.
- The N-1 CPUs re-read the cache line, it costs almost N remote reads.
Total: 2 global cache line kills + 2 * N remote reads

The hierarchical back-off algorithm should be cheaper (on average).
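
For example, with N = 8 CPUs waiting, that is roughly 2 global cache line
kills plus about 16 remote reads per change of ownership; this is the traffic
that the per-node hierarchy is meant to pay only once per node rather than
once per CPU.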

>> With the values of "cap" and "back off" you gave, it always results in "cap".
> We should have  >> instead of <<. 

As backoff_factor = 2*8, the expression

		/* Increase backoff for next cycle */
		backoff = min(((backoff * backoff_factor) >> 8), cap);

would be:	backoff = min((backoff / 16), cap);

This decreasing "back off", with the values of "cap" and "back off" you gave,
does not correspond to the description in chapter 11.1 of your paper.
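
With the defaults that gives the sequence 1000 -> 62 -> 3 -> 0 cpu_relax()
iterations for a remote waiter and 20 -> 1 -> 0 for a local one, i.e. the
delay effectively disappears after a couple of rounds.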

Can you give the actual min. - max. time the "back off" would last?

>>>+			for_each_online_node(i)
>>>+				cmpxchg(&(ndata[i].spin_at), lock, NULL);
...
>>You may have up to 256 nodes. I think we should not touch in vain the
>>".spin_at"-s for the nodes which are not waiting for the lock.

Is it really necessary to clear all the ".spin_at"-s?
It will disturb those nodes which we haven't stopped while we were "angry".
In addition, the ".spin_at" we set should be cleared only if we have got
the lock; otherwise we increase our chance of starvation.

>>>+extern struct node_lock_s ndata[MAX_NUMNODES];
>>Can we have a more speaking name, not just "ndata" ?
> Ok. Changed to node_lock_info.

Have you got just a single global "node_lock_info[]",
common for all the spin locks?

>>I guess it would be more readable to take the address of the field "->lock".
>>Please take advantage of the type checking mechanisms.
> 
>>	volatile __u32 *ia64_spinlock_ptr = &__x->lock;
>>
>>Please consider the other similar lock references, too.
> 
> Cannot find any other references like that.

I mean a lock is a structure, not just the lock word.
(Some lock instrumentation would require additional structure fields.)

E.g. passing the lock to "hbo_spinlock_contention()": it could take a lock
pointer instead of the address of the lock word itself. It would be easier
to use some #ifdef-s to activate lock instrumentation.

> +			stopped_nodes++;

Nowadays, it can overflow. I guess you need a flag, not a counter.

Thanks,

Zoltan


* [RFC] Hierarchical BackOff Locks
@ 2005-05-24 21:11 Christoph Lameter
  0 siblings, 0 replies; 6+ messages in thread
From: Christoph Lameter @ 2005-05-24 21:11 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-ia64

Spinlocks cause a lot of contention if more than 4 processors are trying 
to acquire the lock. If a lock is under contention by more than 8 
processors then the majority of the time is spent waiting for spinlocks 
rather than doing productive work.

See http://www.sgi.com/products/software/linux/downloads/sync_linux.pdf

The contention could be limited by introducing a method for NUMA aware 
node backoff first proposed by Zoran Radovich called "HBO Locks". 
The patch here is just implementing the spinlock piece for ia64.

HBO locks can:

1. Detect the distance to the processor holding the lock.
2. Do a NUMA aware backoff so that local processors have a higher chance
   of getting the lock, avoiding excessive transfers across the NUMA link.
3. Prevent multiple processors from the same node from trying to acquire the
   same remote lock.
4. Avoid starvation by an "angry" mode that forces another node to give up
   its ownership of the lock.
5. Have a fast uncontended case that is competitive with regular 
spinlocks.

Performance graphs are included in the paper. Performance benefits start
at 8 processors and reach up to 3-fold for 60 processors.

Does something like this have any chance of making it into the kernel?

Index: linux-2.6.11/include/asm-ia64/spinlock.h
===================================================================
--- linux-2.6.11.orig/include/asm-ia64/spinlock.h	2005-03-01 23:37:48.000000000 -0800
+++ linux-2.6.11/include/asm-ia64/spinlock.h	2005-05-24 13:54:21.000000000 -0700
@@ -11,7 +11,8 @@
 
 #include <linux/compiler.h>
 #include <linux/kernel.h>
-
+#include <linux/config.h>
+#include <linux/cache.h>
 #include <asm/atomic.h>
 #include <asm/bitops.h>
 #include <asm/intrinsics.h>
@@ -24,10 +25,13 @@ typedef struct {
 #endif
 } spinlock_t;
 
+/* These definitions do not mean much. Lots of code for IA64 depends on
+ * zeroed memory containing unlocked spinlocks
+ */
 #define SPIN_LOCK_UNLOCKED			(spinlock_t) { 0 }
 #define spin_lock_init(x)			((x)->lock = 0)
 
-#ifdef ASM_SUPPORTED
+#if defined(ASM_SUPPORTED) & !defined(CONFIG_HBO_LOCKS)
 /*
  * Try to get the lock.  If we fail to get the lock, make a non-standard call to
  * ia64_spinlock_contention().  We do not use a normal call because that would force all
@@ -95,6 +99,35 @@ _raw_spin_lock_flags (spinlock_t *lock, 
 }
 #define _raw_spin_lock(lock) _raw_spin_lock_flags(lock, 0)
 #else /* !ASM_SUPPORTED */
+
+#ifdef CONFIG_HBO_LOCKS
+
+void hbo_spinlock_contention(__u32 *, unsigned long, unsigned long);
+void hbo_spinlock_wait_remote(__u32 *, unsigned long);
+
+struct node_lock_s {
+	void *spin_at;
+} ____cacheline_aligned;
+
+extern struct node_lock_s ndata[MAX_NUMNODES];
+
+#define _raw_spin_lock_flags(__x, __flags)						\
+do {											\
+	__u32 *ia64_spinlock_ptr = (__u32 *) (__x);					\
+	__u64 ia64_spinlock_val;							\
+							\
+	if (unlikely(ndata[numa_node_id()].spin_at == ia64_spinlock_ptr))		\
+		hbo_spinlock_wait_remote(ia64_spinlock_ptr, __flags);			\
+							\
+	ia64_spinlock_val = ia64_cmpxchg4_acq(ia64_spinlock_ptr, numa_node_id()+ 1, 0);	\
+							\
+	if (unlikely(ia64_spinlock_val))						\
+		hbo_spinlock_contention(ia64_spinlock_ptr, ia64_spinlock_val, __flags);	\
+} while (0)
+
+#define _raw_spin_lock(lock) _raw_spin_lock_flags(lock, 0)
+
+#else
 #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
 # define _raw_spin_lock(x)								\
 do {											\
@@ -109,11 +142,16 @@ do {											\
 		} while (ia64_spinlock_val);						\
 	}										\
 } while (0)
+#endif
 #endif /* !ASM_SUPPORTED */
 
 #define spin_is_locked(x)	((x)->lock != 0)
 #define _raw_spin_unlock(x)	do { barrier(); ((spinlock_t *) x)->lock = 0; } while (0)
+#ifdef CONFIG_HBO_LOCKS
+#define _raw_spin_trylock(x)	(cmpxchg_acq(&(x)->lock, 0, numa_node_id() + 1) == 0)
+#else
 #define _raw_spin_trylock(x)	(cmpxchg_acq(&(x)->lock, 0, 1) == 0)
+#endif
 #define spin_unlock_wait(x)	do { barrier(); } while ((x)->lock)
 
 typedef struct {
Index: linux-2.6.11/arch/ia64/Kconfig
===================================================================
--- linux-2.6.11.orig/arch/ia64/Kconfig	2005-05-24 13:40:29.000000000 -0700
+++ linux-2.6.11/arch/ia64/Kconfig	2005-05-24 13:40:46.000000000 -0700
@@ -296,6 +296,19 @@ config PREEMPT
           Say Y here if you are building a kernel for a desktop, embedded
           or real-time system.  Say N if you are unsure.
 
+config HBO_LOCKS
+	bool "HBO GT SD Locks"
+	help
+	depends on (SMP && NUMA)
+	default y
+	help
+	  HBO locks reduces contention for spinlocks by saving the node id when the
+	  lock is taken. The cpu waiting on the spinlock can then select an appropriate
+	  backoff algorithm depending on the distance to the node. This also insures that
+	  it is highly likely that another cpu on the same node gets to a spinlock before
+	  remote nodes to avoid too much cacheline bouncing. A node may "get angry" in
+	  order to avoid starvation and stop local nodes from getting the lock.
+
 config HAVE_DEC_LOCK
 	bool
 	depends on (SMP || PREEMPT)
Index: linux-2.6.11/arch/ia64/mm/numa.c
===================================================================
--- linux-2.6.11.orig/arch/ia64/mm/numa.c	2005-03-01 23:37:52.000000000 -0800
+++ linux-2.6.11/arch/ia64/mm/numa.c	2005-05-24 13:40:46.000000000 -0700
@@ -17,9 +17,11 @@
 #include <linux/node.h>
 #include <linux/init.h>
 #include <linux/bootmem.h>
+#include <linux/proc_fs.h>
+#include <linux/nodemask.h>
 #include <asm/mmzone.h>
 #include <asm/numa.h>
-
+#include <asm/uaccess.h>
 
 /*
  * The following structures are usually initialized by ACPI or
@@ -47,3 +49,296 @@ paddr_to_nid(unsigned long paddr)
 
 	return (i < num_node_memblks) ? node_memblk[i].nid : (num_node_memblks ? -1 : 0);
 }
+
+#ifdef CONFIG_HBO_LOCKS
+
+struct node_lock_s ndata[MAX_NUMNODES];
+
+/* Counters and statistics available via /proc/hbo */
+struct hbo_c_s {
+	unsigned long contention;
+	unsigned long local_block;
+	unsigned long remote_block;
+	unsigned long remote_lock;
+	unsigned long retry;
+};
+
+DEFINE_PER_CPU(struct hbo_c_s, counters);
+
+#define INC_HC(member) __get_cpu_var(counters).member++
+#define DEC_HC(member) __get_cpu_var(counters).member--
+
+static inline void maybe_enable_irq(unsigned long flags)
+{
+	if (flags & IA64_PSR_I_BIT)
+	 	local_irq_enable();
+}
+
+static inline void maybe_disable_irq(unsigned long flags)
+{
+	if (flags & IA64_PSR_I_BIT)
+		local_irq_disable();
+}
+
+/*
+ * Number of retries until we get so angry that we block the locks from the
+ * node they originate from
+ */
+static unsigned int anger_limit = 50;
+static unsigned int backoff_factor = 2*8;
+/*
+ * Backoff parameters for node local locks
+ */
+static unsigned int local_backoff = 20;
+static unsigned int local_cap = 20;
+
+/*
+ * Backoff parameters for the case that a lock is held
+ * by a processor on a remote node
+ */
+static unsigned int remote_backoff = 1000;
+static unsigned int remote_cap = 200000;
+
+void hbo_spinlock_contention(__u32 *lock, unsigned long lockval, unsigned long flags)
+{
+	INC_HC(contention);
+	do {
+		unsigned int backoff = local_backoff;
+		unsigned int cap = local_cap;
+		unsigned int remote = 0;
+		unsigned int anger_level = 0;
+		unsigned int stopped_nodes = 0;
+
+		maybe_enable_irq(flags);
+
+		if (unlikely(lockval-1 != numa_node_id())) {
+			/* remote backoff */
+			backoff = remote_backoff;
+			cap = remote_cap;
+			/*
+			 * Make sure that no other cpu of this node tries
+			 * to acquire the same remote lock.
+			 */
+			ndata[numa_node_id()].spin_at = lock;
+			remote = 1;
+			INC_HC(remote_lock);
+		}
+
+		for(;;) {
+			int i;
+
+			for(i = 0; i < backoff; i++)
+				cpu_relax();
+
+			/* Increase backoff for next cycle */
+			backoff = min(((backoff * backoff_factor) << 8), cap);
+
+			ia64_barrier();
+
+			/* No cmpxchg so that we will not get an exclusive cache line */
+			lockval = *lock;
+
+			if (likely(lockval ==0))
+				break;
+
+			if (remote) {
+				anger_level++;
+				if (anger_level == anger_limit) {
+					INC_HC(remote_block);
+					/*
+					 * Block any remote threads that attempt
+					 * to get the lock and make them spin locally
+					 * on that node.
+					 */
+					ndata[lockval-1].spin_at = lock;
+					stopped_nodes++;
+					/*
+					 * Use local backoff instead of remote
+					 * to have more of a chance to get the lock
+					 */
+					backoff = local_backoff;
+					cap = local_backoff;
+					anger_level = 0;
+				}
+			}
+
+		};
+
+		maybe_disable_irq(flags);
+		INC_HC(retry);
+		/* Lock was unlocked. Now use cmpxchg acquiring an exclusive cache line */
+		lockval = ia64_cmpxchg4_acq(lock, numa_node_id()+1, 0);
+
+		/* Release eventually spinning other cpus on this node since either
+		 * the lock has been acquired by this cpu or the node holding the lock
+		 * may have changed.
+		 */
+		if (unlikely(remote)) {
+			/* Remove off-node block */
+			ndata[numa_node_id()].spin_at = NULL;
+			if (stopped_nodes) {
+				int i;
+				/*
+				 * Remove any remote blocks we established when we
+				 * got angry
+				 */
+				for_each_online_node(i)
+					cmpxchg(&(ndata[i].spin_at), lock, NULL);
+			}
+		}
+
+        } while (likely(lockval));
+}
+
+void hbo_spinlock_wait_remote(__u32 *lock, unsigned long flags)
+{
+	INC_HC(local_block);
+	maybe_enable_irq(flags);
+	while (ndata[numa_node_id()].spin_at == lock)
+		cpu_relax();
+	maybe_disable_irq(flags);
+}
+
+static int hbo_read_proc(char *page, char **start, off_t off,
+	int count, int *eof, void *data)
+{
+	int cpu = (long)data;
+	struct hbo_c_s summary, *x;
+	int n;
+	int len;
+
+	if (cpu >=0)
+		x = &per_cpu(counters, cpu);
+	else {
+		memcpy(&summary, &per_cpu(counters, 0), sizeof(summary));
+		for(n = 1; n < num_possible_cpus(); n++) {
+                        struct hbo_c_s *c = &per_cpu(counters, n);
+
+			summary.contention += c->contention;
+			summary.local_block += c->local_block;
+			summary.remote_block += c->remote_block;
+			summary.remote_lock += c->remote_lock;
+			summary.retry += c->retry;
+		}
+		x = &summary;
+	}
+
+	len = 0;
+	len += sprintf(page+len, "Contentions      : %lu\n",x->contention);
+	len += sprintf(page+len, "Remote Lock      : %lu\n",x->remote_lock);
+	len += sprintf(page+len, "Retry            : %lu\n",x->retry);
+	len += sprintf(page+len, "Local block      : %lu\n",x->local_block);
+	len += sprintf(page+len, "Remote block     : %lu\n",x->remote_block);
+
+	if (len <= off+count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len>count)
+		len = count;
+	if (len<0)
+		len = 0;
+        return len;
+}
+
+static int hbo_reset_write(struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	int i;
+
+	for (i = 0; i < num_possible_cpus(); i++) {
+		struct hbo_c_s *c = &per_cpu(counters, i);
+		memset(c, 0, sizeof(struct hbo_c_s));
+	}
+	return count;
+}
+
+static int hbo_write_int(struct file *file, const char __user *buffer,
+	unsigned long count, void *data)
+{
+	unsigned int * v = data;
+	char c[11];
+
+	if (count < 1)
+		return -EINVAL;
+
+	if (copy_from_user(c, buffer, 10))
+		return -EFAULT;
+
+	*v = simple_strtoul(buffer,NULL, 10);
+	return count;
+}
+
+static int hbo_read_int(char *page, char **start, off_t off, int count,
+			int *eof, void *data)
+{
+	unsigned int *v = data;
+	int len;
+
+	len = sprintf(page, "%u\n",*v);
+	if (len <= off+count)
+		*eof = 1;
+	*start = page + off;
+	len -= off;
+	if (len>count)
+		len = count;
+	if (len<0)
+		len = 0;
+	return len;
+}
+
+static __init int init_hbolock(void)
+{
+	int i;
+	struct proc_dir_entry *proc_hbo, *hbo_reset, *x;
+
+	proc_hbo = proc_mkdir("hbo",NULL);
+	hbo_reset = create_proc_entry("reset", S_IWUGO, proc_hbo);
+	hbo_reset->write_proc = hbo_reset_write;
+
+	x = create_proc_entry("all", S_IRUGO, proc_hbo);
+	x->read_proc = &hbo_read_proc;
+	x->data = (void *)-1;
+
+	/* ints */
+	x = create_proc_entry("local_backoff", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &local_backoff;
+	x = create_proc_entry("local_cap", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &local_cap;
+	x = create_proc_entry("remote_backoff", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &remote_backoff;
+	x = create_proc_entry("remote_cap", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &remote_cap;
+	x = create_proc_entry("backoff_factor", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &backoff_factor;
+	x = create_proc_entry("anger_limit", 0664, proc_hbo);
+	x->read_proc = hbo_read_int;
+	x->write_proc = hbo_write_int;
+	x->data = &anger_limit;
+
+	for(i = 0; i < num_possible_cpus(); i++) {
+		char name[20];
+		struct proc_dir_entry *p;
+
+		sprintf(name, "%d", i);
+		p = create_proc_entry(name, S_IRUGO, proc_hbo);
+
+		p->read_proc = hbo_read_proc;
+		p->data = (void *)(unsigned long)i;
+	}
+	return 0;
+}
+
+__initcall(init_hbolock);
+#endif
+
