* [PATCH v8 1/5] locking/qspinlock: Rename mcs lock/unlock macros and make them more generic
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
@ 2019-12-30 19:40 ` Alex Kogan
2019-12-30 19:40 ` [PATCH v8 2/5] locking/qspinlock: Refactor the qspinlock slow path Alex Kogan
` (5 subsequent siblings)
6 siblings, 0 replies; 39+ messages in thread
From: Alex Kogan @ 2019-12-30 19:40 UTC (permalink / raw)
To: linux, peterz, mingo, will.deacon, arnd, longman, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: alex.kogan, dave.dice, steven.sistare, daniel.m.jordan
The mcs unlock macro (arch_mcs_pass_lock) should accept the value to be
stored into the lock argument as another argument. This allows using the
same macro in cases where the value to be stored when passing the lock is
different from 1.
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
---
arch/arm/include/asm/mcs_spinlock.h | 6 +++---
include/asm-generic/mcs_spinlock.h | 4 ++--
kernel/locking/mcs_spinlock.h | 18 +++++++++---------
kernel/locking/qspinlock.c | 4 ++--
kernel/locking/qspinlock_paravirt.h | 2 +-
5 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/arch/arm/include/asm/mcs_spinlock.h b/arch/arm/include/asm/mcs_spinlock.h
index 529d2cf4d06f..693fe6ce3c43 100644
--- a/arch/arm/include/asm/mcs_spinlock.h
+++ b/arch/arm/include/asm/mcs_spinlock.h
@@ -6,7 +6,7 @@
#include <asm/spinlock.h>
/* MCS spin-locking. */
-#define arch_mcs_spin_lock_contended(lock) \
+#define arch_mcs_spin_lock(lock) \
do { \
/* Ensure prior stores are observed before we enter wfe. */ \
smp_mb(); \
@@ -14,9 +14,9 @@ do { \
wfe(); \
} while (0) \
-#define arch_mcs_spin_unlock_contended(lock) \
+#define arch_mcs_pass_lock(lock, val) \
do { \
- smp_store_release(lock, 1); \
+ smp_store_release((lock), (val)); \
dsb_sev(); \
} while (0)
diff --git a/include/asm-generic/mcs_spinlock.h b/include/asm-generic/mcs_spinlock.h
index 10cd4ffc6ba2..868da43dba7c 100644
--- a/include/asm-generic/mcs_spinlock.h
+++ b/include/asm-generic/mcs_spinlock.h
@@ -4,8 +4,8 @@
/*
* Architectures can define their own:
*
- * arch_mcs_spin_lock_contended(l)
- * arch_mcs_spin_unlock_contended(l)
+ * arch_mcs_spin_lock(l)
+ * arch_mcs_pass_lock(l, val)
*
* See kernel/locking/mcs_spinlock.c.
*/
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 5e10153b4d3c..52d06ec6f525 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -21,7 +21,7 @@ struct mcs_spinlock {
int count; /* nesting count, see qspinlock.c */
};
-#ifndef arch_mcs_spin_lock_contended
+#ifndef arch_mcs_spin_lock
/*
* Using smp_cond_load_acquire() provides the acquire semantics
* required so that subsequent operations happen after the
@@ -29,20 +29,20 @@ struct mcs_spinlock {
* ARM64 would like to do spin-waiting instead of purely
* spinning, and smp_cond_load_acquire() provides that behavior.
*/
-#define arch_mcs_spin_lock_contended(l) \
-do { \
- smp_cond_load_acquire(l, VAL); \
+#define arch_mcs_spin_lock(l) \
+do { \
+ smp_cond_load_acquire(l, VAL); \
} while (0)
#endif
-#ifndef arch_mcs_spin_unlock_contended
+#ifndef arch_mcs_spin_unlock
/*
* smp_store_release() provides a memory barrier to ensure all
* operations in the critical section has been completed before
* unlocking.
*/
-#define arch_mcs_spin_unlock_contended(l) \
- smp_store_release((l), 1)
+#define arch_mcs_pass_lock(l, val) \
+ smp_store_release((l), (val))
#endif
/*
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
WRITE_ONCE(prev->next, node);
/* Wait until the lock holder passes the lock down. */
- arch_mcs_spin_lock_contended(&node->locked);
+ arch_mcs_spin_lock(&node->locked);
}
/*
@@ -115,7 +115,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
}
/* Pass lock to next waiter. */
- arch_mcs_spin_unlock_contended(&next->locked);
+ arch_mcs_pass_lock(&next->locked, 1);
}
#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2473f10c6956..804c0fbd6328 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -470,7 +470,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
WRITE_ONCE(prev->next, node);
pv_wait_node(node, prev);
- arch_mcs_spin_lock_contended(&node->locked);
+ arch_mcs_spin_lock(&node->locked);
/*
* While waiting for the MCS lock, the next pointer may have
@@ -549,7 +549,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (!next)
next = smp_cond_load_relaxed(&node->next, (VAL));
- arch_mcs_spin_unlock_contended(&next->locked);
+ arch_mcs_pass_lock(&next->locked, 1);
pv_kick_node(lock, next);
release:
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e84d21aa0722..e98079414671 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -368,7 +368,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
*
* Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
*
- * The write to next->locked in arch_mcs_spin_unlock_contended()
+ * The write to next->locked in arch_mcs_pass_lock()
* must be ordered before the read of pn->state in the cmpxchg()
* below for the code to work correctly. To guarantee full ordering
* irrespective of the success or failure of the cmpxchg(),
--
2.21.0 (Apple Git-122.2)
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 39+ messages in thread
* [PATCH v8 2/5] locking/qspinlock: Refactor the qspinlock slow path
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
2019-12-30 19:40 ` [PATCH v8 1/5] locking/qspinlock: Rename mcs lock/unlock macros and make them more generic Alex Kogan
@ 2019-12-30 19:40 ` Alex Kogan
2019-12-30 19:40 ` [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock Alex Kogan
` (4 subsequent siblings)
6 siblings, 0 replies; 39+ messages in thread
From: Alex Kogan @ 2019-12-30 19:40 UTC (permalink / raw)
To: linux, peterz, mingo, will.deacon, arnd, longman, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: alex.kogan, dave.dice, steven.sistare, daniel.m.jordan
Move some of the code manipulating the spin lock into separate functions.
This would allow easier integration of alternative ways to manipulate
that lock.
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
---
kernel/locking/qspinlock.c | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 804c0fbd6328..c06d1e8075d9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -288,6 +288,34 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
+/*
+ * __try_clear_tail - try to clear tail by setting the lock value to
+ * _Q_LOCKED_VAL.
+ * @lock: Pointer to the queued spinlock structure
+ * @val: Current value of the lock
+ * @node: Pointer to the MCS node of the lock holder
+ */
+static __always_inline bool __try_clear_tail(struct qspinlock *lock,
+ u32 val,
+ struct mcs_spinlock *node)
+{
+ return atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL);
+}
+
+/*
+ * __mcs_pass_lock - pass the MCS lock to the next waiter
+ * @node: Pointer to the MCS node of the lock holder
+ * @next: Pointer to the MCS node of the first waiter in the MCS queue
+ */
+static __always_inline void __mcs_pass_lock(struct mcs_spinlock *node,
+ struct mcs_spinlock *next)
+{
+ arch_mcs_pass_lock(&next->locked, 1);
+}
+
+#define try_clear_tail __try_clear_tail
+#define mcs_pass_lock __mcs_pass_lock
+
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
@@ -532,7 +560,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* PENDING will make the uncontended transition fail.
*/
if ((val & _Q_TAIL_MASK) == tail) {
- if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+ if (try_clear_tail(lock, val, node))
goto release; /* No contention */
}
@@ -549,7 +577,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if (!next)
next = smp_cond_load_relaxed(&node->next, (VAL));
- arch_mcs_pass_lock(&next->locked, 1);
+ mcs_pass_lock(node, next);
pv_kick_node(lock, next);
release:
@@ -574,6 +602,12 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#undef pv_kick_node
#undef pv_wait_head_or_lock
+#undef try_clear_tail
+#define try_clear_tail __try_clear_tail
+
+#undef mcs_pass_lock
+#define mcs_pass_lock __mcs_pass_lock
+
#undef queued_spin_lock_slowpath
#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
--
2.21.0 (Apple Git-122.2)
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 39+ messages in thread
* [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
2019-12-30 19:40 ` [PATCH v8 1/5] locking/qspinlock: Rename mcs lock/unlock macros and make them more generic Alex Kogan
2019-12-30 19:40 ` [PATCH v8 2/5] locking/qspinlock: Refactor the qspinlock slow path Alex Kogan
@ 2019-12-30 19:40 ` Alex Kogan
2020-01-03 22:14 ` Waiman Long
` (2 more replies)
2019-12-30 19:40 ` [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA Alex Kogan
` (3 subsequent siblings)
6 siblings, 3 replies; 39+ messages in thread
From: Alex Kogan @ 2019-12-30 19:40 UTC (permalink / raw)
To: linux, peterz, mingo, will.deacon, arnd, longman, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: alex.kogan, dave.dice, steven.sistare, daniel.m.jordan
In CNA, spinning threads are organized in two queues, a main queue for
threads running on the same node as the current lock holder, and a
secondary queue for threads running on other nodes. After acquiring the
MCS lock and before acquiring the spinlock, the lock holder scans the
main queue looking for a thread running on the same node (pre-scan). If
found (call it thread T), all threads in the main queue between the
current lock holder and T are moved to the end of the secondary queue.
If such T is not found, we make another scan of the main queue when
unlocking the MCS lock (post-scan), starting at the position where
pre-scan stopped. If both scans fail to find such T, the MCS lock is
passed to the first thread in the secondary queue. If the secondary queue
is empty, the lock is passed to the next thread in the main queue.
For more details, see https://arxiv.org/abs/1810.05600.
Note that this variant of CNA may introduce starvation by continuously
passing the lock to threads running on the same node. This issue
will be addressed later in the series.
Enabling CNA is controlled via a new configuration option
(NUMA_AWARE_SPINLOCKS). By default, the CNA variant is patched in at the
boot time only if we run on a multi-node machine in native environment and
the new config is enabled. (For the time being, the patching requires
CONFIG_PARAVIRT_SPINLOCKS to be enabled as well. However, this should be
resolved once static_call() is available.) This default behavior can be
overridden with the new kernel boot command-line option
"numa_spinlock=on/off" (default is "auto").
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
---
.../admin-guide/kernel-parameters.txt | 10 +
arch/x86/Kconfig | 20 ++
arch/x86/include/asm/qspinlock.h | 4 +
arch/x86/kernel/alternative.c | 4 +
kernel/locking/mcs_spinlock.h | 2 +-
kernel/locking/qspinlock.c | 39 ++-
kernel/locking/qspinlock_cna.h | 319 ++++++++++++++++++
7 files changed, 393 insertions(+), 5 deletions(-)
create mode 100644 kernel/locking/qspinlock_cna.h
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ade4e6ec23e0..b68cb80e477f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3190,6 +3190,16 @@
nox2apic [X86-64,APIC] Do not enable x2APIC mode.
+ numa_spinlock= [NUMA, PV_OPS] Select the NUMA-aware variant
+ of spinlock. The options are:
+ auto - Enable this variant if running on a multi-node
+ machine in native environment.
+ on - Unconditionally enable this variant.
+ off - Unconditionally disable this variant.
+
+ Not specifying this option is equivalent to
+ numa_spinlock=auto.
+
cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
Some features depend on CPU0. Known dependencies are:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e8949953660..26dd29b2d515 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1562,6 +1562,26 @@ config NUMA
Otherwise, you should say N.
+config NUMA_AWARE_SPINLOCKS
+ bool "Numa-aware spinlocks"
+ depends on NUMA
+ depends on QUEUED_SPINLOCKS
+ depends on 64BIT
+ # For now, we depend on PARAVIRT_SPINLOCKS to make the patching work.
+ # This is awkward, but hopefully would be resolved once static_call()
+ # is available.
+ depends on PARAVIRT_SPINLOCKS
+ default y
+ help
+ Introduce NUMA (Non-Uniform Memory Access) awareness into
+ the slow path of spinlocks.
+
+ In this variant of qspinlock, the kernel will try to keep the lock
+ on the same node, thus reducing the number of remote cache misses,
+ while trading some of the short term fairness for better performance.
+
+ Say N if you want absolute first come first serve fairness.
+
config AMD_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 444d6fd9a6d8..fe4884b6f1b4 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -27,6 +27,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lo
return val;
}
+#ifdef CONFIG_NUMA_AWARE_SPINLOCKS
+extern void cna_configure_spin_lock_slowpath(void);
+#endif
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
extern void __pv_init_lock_hash(void);
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9ec463fe96f2..5a59d06a9d21 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -738,6 +738,10 @@ void __init alternative_instructions(void)
}
#endif
+#if defined(CONFIG_NUMA_AWARE_SPINLOCKS)
+ cna_configure_spin_lock_slowpath();
+#endif
+
apply_paravirt(__parainstructions, __parainstructions_end);
restart_nmi();
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 52d06ec6f525..e40b9538b79f 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,7 +17,7 @@
struct mcs_spinlock {
struct mcs_spinlock *next;
- int locked; /* 1 if lock acquired */
+ unsigned int locked; /* 1 if lock acquired */
int count; /* nesting count, see qspinlock.c */
};
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index c06d1e8075d9..609980a53841 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -11,7 +11,7 @@
* Peter Zijlstra <peterz@infradead.org>
*/
-#ifndef _GEN_PV_LOCK_SLOWPATH
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && !defined(_GEN_CNA_LOCK_SLOWPATH)
#include <linux/smp.h>
#include <linux/bug.h>
@@ -70,7 +70,8 @@
/*
* On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
* size and four of them will fit nicely in one 64-byte cacheline. For
- * pvqspinlock, however, we need more space for extra data. To accommodate
+ * pvqspinlock, however, we need more space for extra data. The same also
+ * applies for the NUMA-aware variant of spinlocks (CNA). To accommodate
* that, we insert two more long words to pad it up to 32 bytes. IOW, only
* two of them can fit in a cacheline in this case. That is OK as it is rare
* to have more than 2 levels of slowpath nesting in actual use. We don't
@@ -79,7 +80,7 @@
*/
struct qnode {
struct mcs_spinlock mcs;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) || defined(CONFIG_NUMA_AWARE_SPINLOCKS)
long reserved[2];
#endif
};
@@ -103,6 +104,8 @@ struct qnode {
* Exactly fits one 64-byte cacheline on a 64-bit architecture.
*
* PV doubles the storage and uses the second cacheline for PV state.
+ * CNA also doubles the storage and uses the second cacheline for
+ * CNA-specific state.
*/
static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
@@ -316,7 +319,7 @@ static __always_inline void __mcs_pass_lock(struct mcs_spinlock *node,
#define try_clear_tail __try_clear_tail
#define mcs_pass_lock __mcs_pass_lock
-#endif /* _GEN_PV_LOCK_SLOWPATH */
+#endif /* _GEN_PV_LOCK_SLOWPATH && _GEN_CNA_LOCK_SLOWPATH */
/**
* queued_spin_lock_slowpath - acquire the queued spinlock
@@ -588,6 +591,34 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);
+/*
+ * Generate the code for NUMA-aware spinlocks
+ */
+#if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS)
+#define _GEN_CNA_LOCK_SLOWPATH
+
+#undef pv_wait_head_or_lock
+#define pv_wait_head_or_lock cna_pre_scan
+
+#undef try_clear_tail
+#define try_clear_tail cna_try_change_tail
+
+#undef mcs_pass_lock
+#define mcs_pass_lock cna_pass_lock
+
+#undef queued_spin_lock_slowpath
+/*
+ * defer defining queued_spin_lock_slowpath until after the include to
+ * avoid a name clash with the identically named field in pv_ops.lock
+ * (see cna_configure_spin_lock_slowpath())
+ */
+#include "qspinlock_cna.h"
+#define queued_spin_lock_slowpath __cna_queued_spin_lock_slowpath
+
+#include "qspinlock.c"
+
+#endif
+
/*
* Generate the paravirt code for queued_spin_unlock_slowpath().
*/
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
new file mode 100644
index 000000000000..3c99a4a6184b
--- /dev/null
+++ b/kernel/locking/qspinlock_cna.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _GEN_CNA_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/topology.h>
+
+/*
+ * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
+ *
+ * In CNA, spinning threads are organized in two queues, a main queue for
+ * threads running on the same NUMA node as the current lock holder, and a
+ * secondary queue for threads running on other nodes. Schematically, it
+ * looks like this:
+ *
+ * cna_node
+ * +----------+ +--------+ +--------+
+ * |mcs:next | -> |mcs:next| -> ... |mcs:next| -> NULL [Main queue]
+ * |mcs:locked| -+ +--------+ +--------+
+ * +----------+ |
+ * +----------------------+
+ * \/
+ * +--------+ +--------+
+ * |mcs:next| -> ... |mcs:next| [Secondary queue]
+ * +--------+ +--------+
+ * ^ |
+ * +--------------------+
+ *
+ * N.B. locked = 1 if secondary queue is absent. Otherwise, it contains the
+ * encoded pointer to the tail of the secondary queue, which is organized as a
+ * circular list.
+ *
+ * After acquiring the MCS lock and before acquiring the spinlock, the lock
+ * holder scans the main queue looking for a thread running on the same node
+ * (pre-scan). If found (call it thread T), all threads in the main queue
+ * between the current lock holder and T are moved to the end of the secondary
+ * queue. If such T is not found, we make another scan of the main queue when
+ * unlocking the MCS lock (post-scan), starting at the node where pre-scan
+ * stopped. If both scans fail to find such T, the MCS lock is passed to the
+ * first thread in the secondary queue. If the secondary queue is empty, the
+ * lock is passed to the next thread in the main queue.
+ *
+ * For more details, see https://arxiv.org/abs/1810.05600.
+ *
+ * Authors: Alex Kogan <alex.kogan@oracle.com>
+ * Dave Dice <dave.dice@oracle.com>
+ */
+
+struct cna_node {
+ struct mcs_spinlock mcs;
+ int numa_node;
+ u32 encoded_tail;
+ u32 pre_scan_result; /* encoded tail or enum val */
+};
+
+enum {
+ LOCAL_WAITER_FOUND = 2, /* 0 and 1 are reserved for @locked */
+ MIN_ENCODED_TAIL
+};
+
+static void __init cna_init_nodes_per_cpu(unsigned int cpu)
+{
+ struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
+ int numa_node = cpu_to_node(cpu);
+ int i;
+
+ for (i = 0; i < MAX_NODES; i++) {
+ struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i);
+
+ cn->numa_node = numa_node;
+ cn->encoded_tail = encode_tail(cpu, i);
+ /*
+ * make sure @encoded_tail is not confused with other valid
+ * values for @locked (0 or 1) or with designated values for
+ * @pre_scan_result
+ */
+ WARN_ON(cn->encoded_tail < MIN_ENCODED_TAIL);
+ }
+}
+
+static int __init cna_init_nodes(void)
+{
+ unsigned int cpu;
+
+ /*
+ * this will break on 32bit architectures, so we restrict
+ * the use of CNA to 64bit only (see arch/x86/Kconfig)
+ */
+ BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode));
+ /* we store an encoded tail word in the node's @locked field */
+ BUILD_BUG_ON(sizeof(u32) > sizeof(unsigned int));
+
+ for_each_possible_cpu(cpu)
+ cna_init_nodes_per_cpu(cpu);
+
+ return 0;
+}
+early_initcall(cna_init_nodes);
+
+/* this function is called only when the primary queue is empty */
+static inline bool cna_try_change_tail(struct qspinlock *lock, u32 val,
+ struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *head_2nd, *tail_2nd;
+ u32 new;
+
+ /* If the secondary queue is empty, do what MCS does. */
+ if (node->locked <= 1)
+ return __try_clear_tail(lock, val, node);
+
+ /*
+ * Try to update the tail value to the last node in the secondary queue.
+ * If successful, pass the lock to the first thread in the secondary
+ * queue. Doing those two actions effectively moves all nodes from the
+ * secondary queue into the main one.
+ */
+ tail_2nd = decode_tail(node->locked);
+ head_2nd = tail_2nd->next;
+ new = ((struct cna_node *)tail_2nd)->encoded_tail + _Q_LOCKED_VAL;
+
+ if (atomic_try_cmpxchg_relaxed(&lock->val, &val, new)) {
+ /*
+ * Try to reset @next in tail_2nd to NULL, but no need to check
+ * the result - if failed, a new successor has updated it.
+ */
+ cmpxchg_relaxed(&tail_2nd->next, head_2nd, NULL);
+ arch_mcs_pass_lock(&head_2nd->locked, 1);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * cna_splice_tail -- splice nodes in the main queue between [first, last]
+ * onto the secondary queue.
+ */
+static void cna_splice_tail(struct mcs_spinlock *node,
+ struct mcs_spinlock *first,
+ struct mcs_spinlock *last)
+{
+ /* remove [first,last] */
+ node->next = last->next;
+
+ /* stick [first,last] on the secondary queue tail */
+ if (node->locked <= 1) { /* if secondary queue is empty */
+ /* create secondary queue */
+ last->next = first;
+ } else {
+ /* add to the tail of the secondary queue */
+ struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
+ struct mcs_spinlock *head_2nd = tail_2nd->next;
+
+ tail_2nd->next = first;
+ last->next = head_2nd;
+ }
+
+ node->locked = ((struct cna_node *)last)->encoded_tail;
+}
+
+/*
+ * cna_scan_main_queue - scan the main waiting queue looking for the first
+ * thread running on the same NUMA node as the lock holder. If found (call it
+ * thread T), move all threads in the main queue between the lock holder and
+ * T to the end of the secondary queue and return 0
+ * (=SUCCESSOR_FROM_SAME_NUMA_NODE_FOUND); otherwise, return the encoded
+ * pointer of the last scanned node in the primary queue (so a subsequent scan
+ * can be resumed from that node).
+ *
+ * Schematically, this may look like the following (nn stands for numa_node and
+ * et stands for encoded_tail).
+ *
+ * when cna_scan_main_queue() is called (the secondary queue is empty):
+ *
+ * A+------------+ B+--------+ C+--------+ T+--------+
+ * |mcs:next | -> |mcs:next| -> |mcs:next| -> |mcs:next| -> NULL
+ * |mcs:locked=1| |cna:nn=0| |cna:nn=2| |cna:nn=1|
+ * |cna:nn=1 | +--------+ +--------+ +--------+
+ * +----------- +
+ *
+ * when cna_scan_main_queue() returns (the secondary queue contains B and C):
+ *
+ * A+----------------+ T+--------+
+ * |mcs:next | -> |mcs:next| -> NULL
+ * |mcs:locked=C.et | -+ |cna:nn=1|
+ * |cna:nn=1 | | +--------+
+ * +--------------- + +-----+
+ * \/
+ * B+--------+ C+--------+
+ * |mcs:next| -> |mcs:next| -+
+ * |cna:nn=0| |cna:nn=2| |
+ * +--------+ +--------+ |
+ * ^ |
+ * +---------------------+
+ *
+ * The worst case complexity of the scan is O(n), where n is the number
+ * of current waiters. However, the amortized complexity is close to O(1),
+ * as the immediate successor is likely to be running on the same node once
+ * threads from other nodes are moved to the secondary queue.
+ *
+ * @node : Pointer to the MCS node of the lock holder
+ * @pred_start: Pointer to the MCS node of the waiter whose successor should be
+ * the first node in the scan
+ * Return : LOCAL_WAITER_FOUND or encoded tail of the last scanned waiter
+ */
+static u32 cna_scan_main_queue(struct mcs_spinlock *node,
+ struct mcs_spinlock *pred_start)
+{
+ struct cna_node *cn = (struct cna_node *)node;
+ struct cna_node *cni = (struct cna_node *)READ_ONCE(pred_start->next);
+ struct cna_node *last;
+ int my_numa_node = cn->numa_node;
+
+ /* find any next waiter on 'our' NUMA node */
+ for (last = cn;
+ cni && cni->numa_node != my_numa_node;
+ last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next))
+ ;
+
+ /* if found, splice any skipped waiters onto the secondary queue */
+ if (cni) {
+ if (last != cn) /* did we skip any waiters? */
+ cna_splice_tail(node, node->next,
+ (struct mcs_spinlock *)last);
+ return LOCAL_WAITER_FOUND;
+ }
+
+ return last->encoded_tail;
+}
+
+__always_inline u32 cna_pre_scan(struct qspinlock *lock,
+ struct mcs_spinlock *node)
+{
+ struct cna_node *cn = (struct cna_node *)node;
+
+ cn->pre_scan_result = cna_scan_main_queue(node, node);
+
+ return 0;
+}
+
+static inline void cna_pass_lock(struct mcs_spinlock *node,
+ struct mcs_spinlock *next)
+{
+ struct cna_node *cn = (struct cna_node *)node;
+ struct mcs_spinlock *next_holder = next, *tail_2nd;
+ u32 val = 1;
+
+ u32 scan = cn->pre_scan_result;
+
+ /*
+ * check if a successor from the same numa node has not been found in
+ * pre-scan, and if so, try to find it in post-scan starting from the
+ * node where pre-scan stopped (stored in @pre_scan_result)
+ */
+ if (scan >= MIN_ENCODED_TAIL)
+ scan = cna_scan_main_queue(node, decode_tail(scan));
+
+ if (scan == LOCAL_WAITER_FOUND) {
+ next_holder = node->next;
+ /*
+ * we unlock successor by passing a non-zero value,
+ * so set @val to 1 iff @locked is 0, which will happen
+ * if we acquired the MCS lock when its queue was empty
+ */
+ val = node->locked ? node->locked : 1;
+ } else if (node->locked > 1) { /* if secondary queue is not empty */
+ /* next holder will be the first node in the secondary queue */
+ tail_2nd = decode_tail(node->locked);
+ /* @tail_2nd->next points to the head of the secondary queue */
+ next_holder = tail_2nd->next;
+ /* splice the secondary queue onto the head of the main queue */
+ tail_2nd->next = next;
+ }
+
+ arch_mcs_pass_lock(&next_holder->locked, val);
+}
+
+/*
+ * Constant (boot-param configurable) flag selecting the NUMA-aware variant
+ * of spinlock. Possible values: -1 (off) / 0 (auto, default) / 1 (on).
+ */
+static int numa_spinlock_flag;
+
+static int __init numa_spinlock_setup(char *str)
+{
+ if (!strcmp(str, "auto")) {
+ numa_spinlock_flag = 0;
+ return 1;
+ } else if (!strcmp(str, "on")) {
+ numa_spinlock_flag = 1;
+ return 1;
+ } else if (!strcmp(str, "off")) {
+ numa_spinlock_flag = -1;
+ return 1;
+ }
+
+ return 0;
+}
+__setup("numa_spinlock=", numa_spinlock_setup);
+
+void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+/*
+ * Switch to the NUMA-friendly slow path for spinlocks when we have
+ * multiple NUMA nodes in native environment, unless the user has
+ * overridden this default behavior by setting the numa_spinlock flag.
+ */
+void cna_configure_spin_lock_slowpath(void)
+{
+ if ((numa_spinlock_flag == 1) ||
+ (numa_spinlock_flag == 0 && nr_node_ids > 1 &&
+ pv_ops.lock.queued_spin_lock_slowpath ==
+ native_queued_spin_lock_slowpath)) {
+ pv_ops.lock.queued_spin_lock_slowpath =
+ __cna_queued_spin_lock_slowpath;
+
+ pr_info("Enabling CNA spinlock\n");
+ }
+}
--
2.21.0 (Apple Git-122.2)
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock
2019-12-30 19:40 ` [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock Alex Kogan
@ 2020-01-03 22:14 ` Waiman Long
2020-01-06 15:02 ` Alex Kogan
2020-01-21 13:48 ` Peter Zijlstra
2020-01-21 14:42 ` Peter Zijlstra
2 siblings, 1 reply; 39+ messages in thread
From: Waiman Long @ 2020-01-03 22:14 UTC (permalink / raw)
To: Alex Kogan, linux, peterz, mingo, will.deacon, arnd, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: dave.dice, steven.sistare, daniel.m.jordan
On 12/30/19 2:40 PM, Alex Kogan wrote:
> +/*
> + * cna_scan_main_queue - scan the main waiting queue looking for the first
> + * thread running on the same NUMA node as the lock holder. If found (call it
> + * thread T), move all threads in the main queue between the lock holder and
> + * T to the end of the secondary queue and return 0
> + * (=SUCCESSOR_FROM_SAME_NUMA_NODE_FOUND); otherwise, return the encoded
Are you talking about LOCAL_WAITER_FOUND?
> + * pointer of the last scanned node in the primary queue (so a subsequent scan
> + * can be resumed from that node).
> + *
> + * Schematically, this may look like the following (nn stands for numa_node and
> + * et stands for encoded_tail).
> + *
> + * when cna_scan_main_queue() is called (the secondary queue is empty):
> + *
> + * A+------------+ B+--------+ C+--------+ T+--------+
> + * |mcs:next | -> |mcs:next| -> |mcs:next| -> |mcs:next| -> NULL
> + * |mcs:locked=1| |cna:nn=0| |cna:nn=2| |cna:nn=1|
> + * |cna:nn=1 | +--------+ +--------+ +--------+
> + * +----------- +
> + *
> + * when cna_scan_main_queue() returns (the secondary queue contains B and C):
> + *
> + * A+----------------+ T+--------+
> + * |mcs:next | -> |mcs:next| -> NULL
> + * |mcs:locked=C.et | -+ |cna:nn=1|
> + * |cna:nn=1 | | +--------+
> + * +--------------- + +-----+
> + * \/
> + * B+--------+ C+--------+
> + * |mcs:next| -> |mcs:next| -+
> + * |cna:nn=0| |cna:nn=2| |
> + * +--------+ +--------+ |
> + * ^ |
> + * +---------------------+
> + *
> + * The worst case complexity of the scan is O(n), where n is the number
> + * of current waiters. However, the amortized complexity is close to O(1),
> + * as the immediate successor is likely to be running on the same node once
> + * threads from other nodes are moved to the secondary queue.
> + *
> + * @node : Pointer to the MCS node of the lock holder
> + * @pred_start: Pointer to the MCS node of the waiter whose successor should be
> + * the first node in the scan
> + * Return : LOCAL_WAITER_FOUND or encoded tail of the last scanned waiter
> + */
> +static u32 cna_scan_main_queue(struct mcs_spinlock *node,
> + struct mcs_spinlock *pred_start)
> +{
> + struct cna_node *cn = (struct cna_node *)node;
> + struct cna_node *cni = (struct cna_node *)READ_ONCE(pred_start->next);
> + struct cna_node *last;
> + int my_numa_node = cn->numa_node;
> +
> + /* find any next waiter on 'our' NUMA node */
> + for (last = cn;
> + cni && cni->numa_node != my_numa_node;
> + last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next))
> + ;
> +
> + /* if found, splice any skipped waiters onto the secondary queue */
> + if (cni) {
> + if (last != cn) /* did we skip any waiters? */
> + cna_splice_tail(node, node->next,
> + (struct mcs_spinlock *)last);
> + return LOCAL_WAITER_FOUND;
> + }
> +
> + return last->encoded_tail;
> +}
> +
>
> +/*
> + * Switch to the NUMA-friendly slow path for spinlocks when we have
> + * multiple NUMA nodes in native environment, unless the user has
> + * overridden this default behavior by setting the numa_spinlock flag.
> + */
> +void cna_configure_spin_lock_slowpath(void)
Nit: There should be a __init.
> +{
> + if ((numa_spinlock_flag == 1) ||
> + (numa_spinlock_flag == 0 && nr_node_ids > 1 &&
> + pv_ops.lock.queued_spin_lock_slowpath ==
> + native_queued_spin_lock_slowpath)) {
> + pv_ops.lock.queued_spin_lock_slowpath =
> + __cna_queued_spin_lock_slowpath;
> +
> + pr_info("Enabling CNA spinlock\n");
> + }
> +}
Other than these two minor nits, the rests looks good to me.
Cheers,
Longman
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock
2020-01-03 22:14 ` Waiman Long
@ 2020-01-06 15:02 ` Alex Kogan
0 siblings, 0 replies; 39+ messages in thread
From: Alex Kogan @ 2020-01-06 15:02 UTC (permalink / raw)
To: Waiman Long
Cc: linux-arch, guohanjun, arnd, Peter Zijlstra, dave.dice, jglauber,
x86, will.deacon, linux, linux-kernel, mingo, bp, hpa,
steven.sistare, tglx, daniel.m.jordan, linux-arm-kernel
> On Jan 3, 2020, at 5:14 PM, Waiman Long <longman@redhat.com> wrote:
>
> On 12/30/19 2:40 PM, Alex Kogan wrote:
>> +/*
>> + * cna_scan_main_queue - scan the main waiting queue looking for the first
>> + * thread running on the same NUMA node as the lock holder. If found (call it
>> + * thread T), move all threads in the main queue between the lock holder and
>> + * T to the end of the secondary queue and return 0
>> + * (=SUCCESSOR_FROM_SAME_NUMA_NODE_FOUND); otherwise, return the encoded
> Are you talking about LOCAL_WAITER_FOUND?
Ahh, yes — good catch!
>> + * pointer of the last scanned node in the primary queue (so a subsequent scan
>> + * can be resumed from that node).
>> + *
>> + * Schematically, this may look like the following (nn stands for numa_node and
>> + * et stands for encoded_tail).
>> + *
>> + * when cna_scan_main_queue() is called (the secondary queue is empty):
>> + *
>> + * A+------------+ B+--------+ C+--------+ T+--------+
>> + * |mcs:next | -> |mcs:next| -> |mcs:next| -> |mcs:next| -> NULL
>> + * |mcs:locked=1| |cna:nn=0| |cna:nn=2| |cna:nn=1|
>> + * |cna:nn=1 | +--------+ +--------+ +--------+
>> + * +----------- +
>> + *
>> + * when cna_scan_main_queue() returns (the secondary queue contains B and C):
>> + *
>> + * A+----------------+ T+--------+
>> + * |mcs:next | -> |mcs:next| -> NULL
>> + * |mcs:locked=C.et | -+ |cna:nn=1|
>> + * |cna:nn=1 | | +--------+
>> + * +--------------- + +-----+
>> + * \/
>> + * B+--------+ C+--------+
>> + * |mcs:next| -> |mcs:next| -+
>> + * |cna:nn=0| |cna:nn=2| |
>> + * +--------+ +--------+ |
>> + * ^ |
>> + * +---------------------+
>> + *
>> + * The worst case complexity of the scan is O(n), where n is the number
>> + * of current waiters. However, the amortized complexity is close to O(1),
>> + * as the immediate successor is likely to be running on the same node once
>> + * threads from other nodes are moved to the secondary queue.
>> + *
>> + * @node : Pointer to the MCS node of the lock holder
>> + * @pred_start: Pointer to the MCS node of the waiter whose successor should be
>> + * the first node in the scan
>> + * Return : LOCAL_WAITER_FOUND or encoded tail of the last scanned waiter
>> + */
>> +static u32 cna_scan_main_queue(struct mcs_spinlock *node,
>> + struct mcs_spinlock *pred_start)
>> +{
>> + struct cna_node *cn = (struct cna_node *)node;
>> + struct cna_node *cni = (struct cna_node *)READ_ONCE(pred_start->next);
>> + struct cna_node *last;
>> + int my_numa_node = cn->numa_node;
>> +
>> + /* find any next waiter on 'our' NUMA node */
>> + for (last = cn;
>> + cni && cni->numa_node != my_numa_node;
>> + last = cni, cni = (struct cna_node *)READ_ONCE(cni->mcs.next))
>> + ;
>> +
>> + /* if found, splice any skipped waiters onto the secondary queue */
>> + if (cni) {
>> + if (last != cn) /* did we skip any waiters? */
>> + cna_splice_tail(node, node->next,
>> + (struct mcs_spinlock *)last);
>> + return LOCAL_WAITER_FOUND;
>> + }
>> +
>> + return last->encoded_tail;
>> +}
>> +
>>
>> +/*
>> + * Switch to the NUMA-friendly slow path for spinlocks when we have
>> + * multiple NUMA nodes in native environment, unless the user has
>> + * overridden this default behavior by setting the numa_spinlock flag.
>> + */
>> +void cna_configure_spin_lock_slowpath(void)
> Nit: There should be a __init.
True. I will fix that.
>> +{
>> + if ((numa_spinlock_flag == 1) ||
>> + (numa_spinlock_flag == 0 && nr_node_ids > 1 &&
>> + pv_ops.lock.queued_spin_lock_slowpath ==
>> + native_queued_spin_lock_slowpath)) {
>> + pv_ops.lock.queued_spin_lock_slowpath =
>> + __cna_queued_spin_lock_slowpath;
>> +
>> + pr_info("Enabling CNA spinlock\n");
>> + }
>> +}
>
> Other than these two minor nits, the rests looks good to me.
Great. I will revise and resubmit.
Best regards,
— Alex
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock
2019-12-30 19:40 ` [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock Alex Kogan
2020-01-03 22:14 ` Waiman Long
@ 2020-01-21 13:48 ` Peter Zijlstra
2020-01-21 14:42 ` Peter Zijlstra
2 siblings, 0 replies; 39+ messages in thread
From: Peter Zijlstra @ 2020-01-21 13:48 UTC (permalink / raw)
To: Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, steven.sistare, linux-kernel, mingo, bp, hpa,
longman, tglx, daniel.m.jordan, linux-arm-kernel
On Mon, Dec 30, 2019 at 02:40:40PM -0500, Alex Kogan wrote:
> +#define try_clear_tail cna_try_change_tail
That's inconsistent; please run
's/cna_try_change_tail/cna_try_clear_tail/g' on your patch.
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock
2019-12-30 19:40 ` [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock Alex Kogan
2020-01-03 22:14 ` Waiman Long
2020-01-21 13:48 ` Peter Zijlstra
@ 2020-01-21 14:42 ` Peter Zijlstra
2 siblings, 0 replies; 39+ messages in thread
From: Peter Zijlstra @ 2020-01-21 14:42 UTC (permalink / raw)
To: Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, steven.sistare, linux-kernel, mingo, bp, hpa,
longman, tglx, daniel.m.jordan, linux-arm-kernel
On Mon, Dec 30, 2019 at 02:40:40PM -0500, Alex Kogan wrote:
> +#define pv_wait_head_or_lock cna_pre_scan
Also inconsistent naming.
> +__always_inline u32 cna_pre_scan(struct qspinlock *lock,
> + struct mcs_spinlock *node)
> +{
> + struct cna_node *cn = (struct cna_node *)node;
> +
> + cn->pre_scan_result = cna_scan_main_queue(node, node);
> +
> + return 0;
> +}
The thinking here is that we're trying to make use of the time otherwise
spent spinning on atomic_cond_read_acquire(), to search for a potential
unlock candidate?
Surely that deserves a comment.
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
` (2 preceding siblings ...)
2019-12-30 19:40 ` [PATCH v8 3/5] locking/qspinlock: Introduce CNA into the slow path of qspinlock Alex Kogan
@ 2019-12-30 19:40 ` Alex Kogan
2020-01-06 15:33 ` Waiman Long
2020-01-21 13:29 ` Peter Zijlstra
2019-12-30 19:40 ` [PATCH v8 5/5] locking/qspinlock: Introduce the shuffle reduction optimization " Alex Kogan
` (2 subsequent siblings)
6 siblings, 2 replies; 39+ messages in thread
From: Alex Kogan @ 2019-12-30 19:40 UTC (permalink / raw)
To: linux, peterz, mingo, will.deacon, arnd, longman, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: alex.kogan, dave.dice, steven.sistare, daniel.m.jordan
Keep track of the number of intra-node lock handoffs, and force
inter-node handoff once this number reaches a preset threshold.
The default value for the threshold can be overridden with
the new kernel boot command-line option "numa_spinlock_threshold".
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
---
.../admin-guide/kernel-parameters.txt | 8 ++++
kernel/locking/qspinlock.c | 3 ++
kernel/locking/qspinlock_cna.h | 41 ++++++++++++++++++-
3 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b68cb80e477f..30d79819a3b0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3200,6 +3200,14 @@
Not specifying this option is equivalent to
numa_spinlock=auto.
+ numa_spinlock_threshold= [NUMA, PV_OPS]
+ Set the threshold for the number of intra-node
+ lock hand-offs before the NUMA-aware spinlock
+ is forced to be passed to a thread on another NUMA node.
+ Valid values are in the [0..31] range. Smaller values
+ result in a more fair, but less performant spinlock, and
+ vice versa. The default value is 16.
+
cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
Some features depend on CPU0. Known dependencies are:
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 609980a53841..e382d8946ccc 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -597,6 +597,9 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
#if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS)
#define _GEN_CNA_LOCK_SLOWPATH
+#undef pv_init_node
+#define pv_init_node cna_init_node
+
#undef pv_wait_head_or_lock
#define pv_wait_head_or_lock cna_pre_scan
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index 3c99a4a6184b..30feff02865d 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -51,13 +51,25 @@ struct cna_node {
int numa_node;
u32 encoded_tail;
u32 pre_scan_result; /* encoded tail or enum val */
+ u32 intra_count;
};
enum {
LOCAL_WAITER_FOUND = 2, /* 0 and 1 are reserved for @locked */
+ FLUSH_SECONDARY_QUEUE = 3,
MIN_ENCODED_TAIL
};
+/*
+ * Controls the threshold for the number of intra-node lock hand-offs before
+ * the NUMA-aware variant of spinlock is forced to be passed to a thread on
+ * another NUMA node. By default, the chosen value provides reasonable
+ * long-term fairness without sacrificing performance compared to a lock
+ * that does not have any fairness guarantees. The default setting can
+ * be changed with the "numa_spinlock_threshold" boot option.
+ */
+int intra_node_handoff_threshold __ro_after_init = 1 << 16;
+
static void __init cna_init_nodes_per_cpu(unsigned int cpu)
{
struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
@@ -97,6 +109,11 @@ static int __init cna_init_nodes(void)
}
early_initcall(cna_init_nodes);
+static __always_inline void cna_init_node(struct mcs_spinlock *node)
+{
+ ((struct cna_node *)node)->intra_count = 0;
+}
+
/* this function is called only when the primary queue is empty */
static inline bool cna_try_change_tail(struct qspinlock *lock, u32 val,
struct mcs_spinlock *node)
@@ -233,7 +250,9 @@ __always_inline u32 cna_pre_scan(struct qspinlock *lock,
{
struct cna_node *cn = (struct cna_node *)node;
- cn->pre_scan_result = cna_scan_main_queue(node, node);
+ cn->pre_scan_result =
+ cn->intra_count == intra_node_handoff_threshold ?
+ FLUSH_SECONDARY_QUEUE : cna_scan_main_queue(node, node);
return 0;
}
@@ -263,6 +282,9 @@ static inline void cna_pass_lock(struct mcs_spinlock *node,
* if we acquired the MCS lock when its queue was empty
*/
val = node->locked ? node->locked : 1;
+ /* inc @intra_count if the secondary queue is not empty */
+ ((struct cna_node *)next_holder)->intra_count =
+ cn->intra_count + (node->locked > 1);
} else if (node->locked > 1) { /* if secondary queue is not empty */
/* next holder will be the first node in the secondary queue */
tail_2nd = decode_tail(node->locked);
@@ -317,3 +339,20 @@ void cna_configure_spin_lock_slowpath(void)
pr_info("Enabling CNA spinlock\n");
}
}
+
+static int __init numa_spinlock_threshold_setup(char *str)
+{
+ int new_threshold_param;
+
+ if (get_option(&str, &new_threshold_param)) {
+ /* valid value is between 0 and 31 */
+ if (new_threshold_param < 0 || new_threshold_param > 31)
+ return 0;
+
+ intra_node_handoff_threshold = 1 << new_threshold_param;
+ return 1;
+ }
+
+ return 0;
+}
+__setup("numa_spinlock_threshold=", numa_spinlock_threshold_setup);
--
2.21.0 (Apple Git-122.2)
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA
2019-12-30 19:40 ` [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA Alex Kogan
@ 2020-01-06 15:33 ` Waiman Long
2020-01-21 13:29 ` Peter Zijlstra
1 sibling, 0 replies; 39+ messages in thread
From: Waiman Long @ 2020-01-06 15:33 UTC (permalink / raw)
To: Alex Kogan, linux, peterz, mingo, will.deacon, arnd, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: dave.dice, steven.sistare, daniel.m.jordan
On 12/30/19 2:40 PM, Alex Kogan wrote:
> Keep track of the number of intra-node lock handoffs, and force
> inter-node handoff once this number reaches a preset threshold.
> The default value for the threshold can be overridden with
> the new kernel boot command-line option "numa_spinlock_threshold".
>
> Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
> Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
> ---
> .../admin-guide/kernel-parameters.txt | 8 ++++
> kernel/locking/qspinlock.c | 3 ++
> kernel/locking/qspinlock_cna.h | 41 ++++++++++++++++++-
> 3 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index b68cb80e477f..30d79819a3b0 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3200,6 +3200,14 @@
> Not specifying this option is equivalent to
> numa_spinlock=auto.
>
> + numa_spinlock_threshold= [NUMA, PV_OPS]
> + Set the threshold for the number of intra-node
> + lock hand-offs before the NUMA-aware spinlock
> + is forced to be passed to a thread on another NUMA node.
> + Valid values are in the [0..31] range. Smaller values
> + result in a more fair, but less performant spinlock, and
> + vice versa. The default value is 16.
> +
> cpu0_hotplug [X86] Turn on CPU0 hotplug feature when
> CONFIG_BOOTPARAM_HOTPLUG_CPU0 is off.
> Some features depend on CPU0. Known dependencies are:
> diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
> index 609980a53841..e382d8946ccc 100644
> --- a/kernel/locking/qspinlock.c
> +++ b/kernel/locking/qspinlock.c
> @@ -597,6 +597,9 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
> #if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS)
> #define _GEN_CNA_LOCK_SLOWPATH
>
> +#undef pv_init_node
> +#define pv_init_node cna_init_node
> +
> #undef pv_wait_head_or_lock
> #define pv_wait_head_or_lock cna_pre_scan
>
> diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
> index 3c99a4a6184b..30feff02865d 100644
> --- a/kernel/locking/qspinlock_cna.h
> +++ b/kernel/locking/qspinlock_cna.h
> @@ -51,13 +51,25 @@ struct cna_node {
> int numa_node;
> u32 encoded_tail;
> u32 pre_scan_result; /* encoded tail or enum val */
> + u32 intra_count;
> };
>
> enum {
> LOCAL_WAITER_FOUND = 2, /* 0 and 1 are reserved for @locked */
> + FLUSH_SECONDARY_QUEUE = 3,
> MIN_ENCODED_TAIL
> };
>
> +/*
> + * Controls the threshold for the number of intra-node lock hand-offs before
> + * the NUMA-aware variant of spinlock is forced to be passed to a thread on
> + * another NUMA node. By default, the chosen value provides reasonable
> + * long-term fairness without sacrificing performance compared to a lock
> + * that does not have any fairness guarantees. The default setting can
> + * be changed with the "numa_spinlock_threshold" boot option.
> + */
> +int intra_node_handoff_threshold __ro_after_init = 1 << 16;
Another minor nit. (1 << 31) is negative for an int value. I will
suggest that you either make intra_node_handoff_threshold an unsigned
int or limit the upper bound to 30.
Cheers,
Longman
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA
2019-12-30 19:40 ` [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA Alex Kogan
2020-01-06 15:33 ` Waiman Long
@ 2020-01-21 13:29 ` Peter Zijlstra
2020-01-21 13:50 ` Peter Zijlstra
2020-01-21 15:45 ` Waiman Long
1 sibling, 2 replies; 39+ messages in thread
From: Peter Zijlstra @ 2020-01-21 13:29 UTC (permalink / raw)
To: Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, steven.sistare, linux-kernel, mingo, bp, hpa,
longman, tglx, daniel.m.jordan, linux-arm-kernel
On Mon, Dec 30, 2019 at 02:40:41PM -0500, Alex Kogan wrote:
> +/*
> + * Controls the threshold for the number of intra-node lock hand-offs before
> + * the NUMA-aware variant of spinlock is forced to be passed to a thread on
> + * another NUMA node. By default, the chosen value provides reasonable
> + * long-term fairness without sacrificing performance compared to a lock
> + * that does not have any fairness guarantees. The default setting can
> + * be changed with the "numa_spinlock_threshold" boot option.
> + */
> +int intra_node_handoff_threshold __ro_after_init = 1 << 16;
There is a distinct lack of quantitative data to back up that
'reasonable' claim there.
Where is the table of inter-node latencies observed for the various
values tested, and on what criteria is this number deemed reasonable?
To me, 64k lock hold times seems like a giant number, entirely outside
of reasonable.
> +
> static void __init cna_init_nodes_per_cpu(unsigned int cpu)
> {
> struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
> @@ -97,6 +109,11 @@ static int __init cna_init_nodes(void)
> }
> early_initcall(cna_init_nodes);
>
> +static __always_inline void cna_init_node(struct mcs_spinlock *node)
> +{
> + ((struct cna_node *)node)->intra_count = 0;
> +}
> +
> /* this function is called only when the primary queue is empty */
> static inline bool cna_try_change_tail(struct qspinlock *lock, u32 val,
> struct mcs_spinlock *node)
> @@ -233,7 +250,9 @@ __always_inline u32 cna_pre_scan(struct qspinlock *lock,
> {
> struct cna_node *cn = (struct cna_node *)node;
>
> - cn->pre_scan_result = cna_scan_main_queue(node, node);
> + cn->pre_scan_result =
> + cn->intra_count == intra_node_handoff_threshold ?
> + FLUSH_SECONDARY_QUEUE : cna_scan_main_queue(node, node);
Because:
if (cn->intra_count < intra_node_handoff_threshold)
cn->pre_scan_result = cna_scan_main_queue(node, node);
else
cn->pre_scan_result = FLUSH_SECONDARY_QUEUE;
was too readable?
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA
2020-01-21 13:29 ` Peter Zijlstra
@ 2020-01-21 13:50 ` Peter Zijlstra
2020-01-21 21:19 ` Daniel Bristot de Oliveira
2020-01-21 15:45 ` Waiman Long
1 sibling, 1 reply; 39+ messages in thread
From: Peter Zijlstra @ 2020-01-21 13:50 UTC (permalink / raw)
To: Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, steven.sistare, linux-kernel, mingo, bp, hpa,
longman, tglx, daniel.m.jordan, bristot, linux-arm-kernel
On Tue, Jan 21, 2020 at 02:29:49PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 30, 2019 at 02:40:41PM -0500, Alex Kogan wrote:
>
> > +/*
> > + * Controls the threshold for the number of intra-node lock hand-offs before
> > + * the NUMA-aware variant of spinlock is forced to be passed to a thread on
> > + * another NUMA node. By default, the chosen value provides reasonable
> > + * long-term fairness without sacrificing performance compared to a lock
> > + * that does not have any fairness guarantees. The default setting can
> > + * be changed with the "numa_spinlock_threshold" boot option.
> > + */
> > +int intra_node_handoff_threshold __ro_after_init = 1 << 16;
>
> There is a distinct lack of quantitative data to back up that
> 'reasonable' claim there.
>
> Where is the table of inter-node latencies observed for the various
> values tested, and on what criteria is this number deemed reasonable?
>
> To me, 64k lock hold times seems like a giant number, entirely outside
> of reasonable.
Daniel, IIRC you just did a paper on constructing worst case latencies
from measuring pieces. Do you have data on average lock hold times?
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA
2020-01-21 13:50 ` Peter Zijlstra
@ 2020-01-21 21:19 ` Daniel Bristot de Oliveira
0 siblings, 0 replies; 39+ messages in thread
From: Daniel Bristot de Oliveira @ 2020-01-21 21:19 UTC (permalink / raw)
To: Peter Zijlstra, Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, steven.sistare, linux-kernel, mingo, bp, hpa,
longman, tglx, daniel.m.jordan, linux-arm-kernel
On 1/21/20 2:50 PM, Peter Zijlstra wrote:
> On Tue, Jan 21, 2020 at 02:29:49PM +0100, Peter Zijlstra wrote:
>> On Mon, Dec 30, 2019 at 02:40:41PM -0500, Alex Kogan wrote:
>>
>>> +/*
>>> + * Controls the threshold for the number of intra-node lock hand-offs before
>>> + * the NUMA-aware variant of spinlock is forced to be passed to a thread on
>>> + * another NUMA node. By default, the chosen value provides reasonable
>>> + * long-term fairness without sacrificing performance compared to a lock
>>> + * that does not have any fairness guarantees. The default setting can
>>> + * be changed with the "numa_spinlock_threshold" boot option.
>>> + */
>>> +int intra_node_handoff_threshold __ro_after_init = 1 << 16;
>> There is a distinct lack of quantitative data to back up that
>> 'reasonable' claim there.
>>
>> Where is the table of inter-node latencies observed for the various
>> values tested, and on what criteria is this number deemed reasonable?
>>
>> To me, 64k lock hold times seems like a giant number, entirely outside
>> of reasonable.
> Daniel, IIRC you just did a paper on constructing worst case latencies
> from measuring pieces. Do you have data on average lock hold times?
>
I am still writing the paper, but I do not have the (avg) lock times. It is
in the TODO list, though!
-- Daniel
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA
2020-01-21 13:29 ` Peter Zijlstra
2020-01-21 13:50 ` Peter Zijlstra
@ 2020-01-21 15:45 ` Waiman Long
[not found] ` <3862F8A1-FF9B-40AD-A88E-2C0BA7AF6F58@oracle.com>
1 sibling, 1 reply; 39+ messages in thread
From: Waiman Long @ 2020-01-21 15:45 UTC (permalink / raw)
To: Peter Zijlstra, Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, linux-kernel, mingo, bp, hpa, steven.sistare,
tglx, daniel.m.jordan, linux-arm-kernel
On 1/21/20 8:29 AM, Peter Zijlstra wrote:
> On Mon, Dec 30, 2019 at 02:40:41PM -0500, Alex Kogan wrote:
>
>> +/*
>> + * Controls the threshold for the number of intra-node lock hand-offs before
>> + * the NUMA-aware variant of spinlock is forced to be passed to a thread on
>> + * another NUMA node. By default, the chosen value provides reasonable
>> + * long-term fairness without sacrificing performance compared to a lock
>> + * that does not have any fairness guarantees. The default setting can
>> + * be changed with the "numa_spinlock_threshold" boot option.
>> + */
>> +int intra_node_handoff_threshold __ro_after_init = 1 << 16;
> There is a distinct lack of quantitative data to back up that
> 'reasonable' claim there.
>
> Where is the table of inter-node latencies observed for the various
> values tested, and on what criteria is this number deemed reasonable?
>
> To me, 64k lock hold times seems like a giant number, entirely outside
> of reasonable.
I actually had similar question before, but having the capability of
changing the default with boot time parameter alleviate some of my
concern. I will certainly like to see actual data on how different
values will affect the performance of the code.
Cheers,
Longman
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* [PATCH v8 5/5] locking/qspinlock: Introduce the shuffle reduction optimization into CNA
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
` (3 preceding siblings ...)
2019-12-30 19:40 ` [PATCH v8 4/5] locking/qspinlock: Introduce starvation avoidance into CNA Alex Kogan
@ 2019-12-30 19:40 ` Alex Kogan
2020-01-22 9:56 ` Peter Zijlstra
2020-01-06 15:48 ` [PATCH v8 0/5] Add NUMA-awareness to qspinlock Waiman Long
2020-01-08 5:09 ` Shijith Thotton
6 siblings, 1 reply; 39+ messages in thread
From: Alex Kogan @ 2019-12-30 19:40 UTC (permalink / raw)
To: linux, peterz, mingo, will.deacon, arnd, longman, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: alex.kogan, dave.dice, steven.sistare, daniel.m.jordan
This performance optimization reduces the probability threads will be
shuffled between the main and secondary queues when the secondary queue
is empty. It is helpful when the lock is only lightly contended.
Signed-off-by: Alex Kogan <alex.kogan@oracle.com>
Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
---
kernel/locking/qspinlock_cna.h | 46 ++++++++++++++++++++++++++++++++--
1 file changed, 44 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
index 30feff02865d..f21056560104 100644
--- a/kernel/locking/qspinlock_cna.h
+++ b/kernel/locking/qspinlock_cna.h
@@ -4,6 +4,7 @@
#endif
#include <linux/topology.h>
+#include <linux/random.h>
/*
* Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
@@ -57,6 +58,7 @@ struct cna_node {
enum {
LOCAL_WAITER_FOUND = 2, /* 0 and 1 are reserved for @locked */
FLUSH_SECONDARY_QUEUE = 3,
+ PASS_LOCK_IMMEDIATELY = 4,
MIN_ENCODED_TAIL
};
@@ -70,6 +72,34 @@ enum {
*/
int intra_node_handoff_threshold __ro_after_init = 1 << 16;
+/*
+ * Controls the probability for enabling the scan of the main queue when
+ * the secondary queue is empty. The chosen value reduces the amount of
+ * unnecessary shuffling of threads between the two waiting queues when
+ * the contention is low, while responding fast enough and enabling
+ * the shuffling when the contention is high.
+ */
+#define SHUFFLE_REDUCTION_PROB_ARG (7)
+
+/* Per-CPU pseudo-random number seed */
+static DEFINE_PER_CPU(u32, seed);
+
+/*
+ * Return false with probability 1 / 2^@num_bits.
+ * Intuitively, the larger @num_bits the less likely false is to be returned.
+ * @num_bits must be a number between 0 and 31.
+ */
+static bool probably(unsigned int num_bits)
+{
+ u32 s;
+
+ s = this_cpu_read(seed);
+ s = next_pseudo_random32(s);
+ this_cpu_write(seed, s);
+
+ return s & ((1 << num_bits) - 1);
+}
+
static void __init cna_init_nodes_per_cpu(unsigned int cpu)
{
struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
@@ -251,8 +281,11 @@ __always_inline u32 cna_pre_scan(struct qspinlock *lock,
struct cna_node *cn = (struct cna_node *)node;
cn->pre_scan_result =
- cn->intra_count == intra_node_handoff_threshold ?
- FLUSH_SECONDARY_QUEUE : cna_scan_main_queue(node, node);
+ (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) ?
+ PASS_LOCK_IMMEDIATELY :
+ cn->intra_count == intra_node_handoff_threshold ?
+ FLUSH_SECONDARY_QUEUE :
+ cna_scan_main_queue(node, node);
return 0;
}
@@ -266,6 +299,14 @@ static inline void cna_pass_lock(struct mcs_spinlock *node,
u32 scan = cn->pre_scan_result;
+ /*
+ * perf. optimization - check if we can skip the logic of triaging
+ * through other possible values in @scan (helps under light lock
+ * contention)
+ */
+ if (scan == PASS_LOCK_IMMEDIATELY)
+ goto pass_lock;
+
/*
* check if a successor from the same numa node has not been found in
* pre-scan, and if so, try to find it in post-scan starting from the
@@ -294,6 +335,7 @@ static inline void cna_pass_lock(struct mcs_spinlock *node,
tail_2nd->next = next;
}
+pass_lock:
arch_mcs_pass_lock(&next_holder->locked, val);
}
--
2.21.0 (Apple Git-122.2)
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply related [flat|nested] 39+ messages in thread
* Re: [PATCH v8 5/5] locking/qspinlock: Introduce the shuffle reduction optimization into CNA
2019-12-30 19:40 ` [PATCH v8 5/5] locking/qspinlock: Introduce the shuffle reduction optimization " Alex Kogan
@ 2020-01-22 9:56 ` Peter Zijlstra
0 siblings, 0 replies; 39+ messages in thread
From: Peter Zijlstra @ 2020-01-22 9:56 UTC (permalink / raw)
To: Alex Kogan
Cc: linux-arch, guohanjun, arnd, dave.dice, jglauber, x86,
will.deacon, linux, steven.sistare, linux-kernel, mingo, bp, hpa,
longman, tglx, daniel.m.jordan, linux-arm-kernel
On Mon, Dec 30, 2019 at 02:40:42PM -0500, Alex Kogan wrote:
> @@ -251,8 +281,11 @@ __always_inline u32 cna_pre_scan(struct qspinlock *lock,
> struct cna_node *cn = (struct cna_node *)node;
>
> cn->pre_scan_result =
> - cn->intra_count == intra_node_handoff_threshold ?
> - FLUSH_SECONDARY_QUEUE : cna_scan_main_queue(node, node);
> + (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) ?
> + PASS_LOCK_IMMEDIATELY :
> + cn->intra_count == intra_node_handoff_threshold ?
> + FLUSH_SECONDARY_QUEUE :
> + cna_scan_main_queue(node, node);
>
> return 0;
> }
Let me just, once again, remind people that the Linux Kernel is not part
of the Obfuscated C code contest.
> Reviewed-by: Steve Sistare <steven.sistare@oracle.com>
Seriously, in what universe is that actually readable code? Steve quick,
say what it does.
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 0/5] Add NUMA-awareness to qspinlock
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
` (4 preceding siblings ...)
2019-12-30 19:40 ` [PATCH v8 5/5] locking/qspinlock: Introduce the shuffle reduction optimization " Alex Kogan
@ 2020-01-06 15:48 ` Waiman Long
2020-01-08 5:09 ` Shijith Thotton
6 siblings, 0 replies; 39+ messages in thread
From: Waiman Long @ 2020-01-06 15:48 UTC (permalink / raw)
To: Alex Kogan, linux, peterz, mingo, will.deacon, arnd, linux-arch,
linux-arm-kernel, linux-kernel, tglx, bp, hpa, x86, guohanjun,
jglauber
Cc: dave.dice, steven.sistare, daniel.m.jordan
On 12/30/19 2:40 PM, Alex Kogan wrote:
> Minor changes from v7 based on feedback from Longman:
> -----------------------------------------------------
>
> - Move __init functions from alternative.c to qspinlock_cna.h
>
> - Introduce enum for return values from cna_pre_scan(), for better
> readability.
>
> - Add/revise a few comments to improve readability.
>
>
> Summary
> -------
>
> Lock throughput can be increased by handing a lock to a waiter on the
> same NUMA node as the lock holder, provided care is taken to avoid
> starvation of waiters on other NUMA nodes. This patch introduces CNA
> (compact NUMA-aware lock) as the slow path for qspinlock. It is
> enabled through a configuration option (NUMA_AWARE_SPINLOCKS).
>
> CNA is a NUMA-aware version of the MCS lock. Spinning threads are
> organized in two queues, a main queue for threads running on the same
> node as the current lock holder, and a secondary queue for threads
> running on other nodes. Threads store the ID of the node on which
> they are running in their queue nodes. After acquiring the MCS lock and
> before acquiring the spinlock, the lock holder scans the main queue
> looking for a thread running on the same node (pre-scan). If found (call
> it thread T), all threads in the main queue between the current lock
> holder and T are moved to the end of the secondary queue. If such T
> is not found, we make another scan of the main queue after acquiring
> the spinlock when unlocking the MCS lock (post-scan), starting at the
> node where pre-scan stopped. If both scans fail to find such T, the
> MCS lock is passed to the first thread in the secondary queue. If the
> secondary queue is empty, the MCS lock is passed to the next thread in the
> main queue. To avoid starvation of threads in the secondary queue, those
> threads are moved back to the head of the main queue after a certain
> number of intra-node lock hand-offs.
>
> More details are available at https://arxiv.org/abs/1810.05600.
>
> The series applies on top of v5.5.0-rc2, commit ea200dec51.
> Performance numbers are available in previous revisions
> of the series.
>
> Further comments are welcome and appreciated.
>
> Alex Kogan (5):
> locking/qspinlock: Rename mcs lock/unlock macros and make them more
> generic
> locking/qspinlock: Refactor the qspinlock slow path
> locking/qspinlock: Introduce CNA into the slow path of qspinlock
> locking/qspinlock: Introduce starvation avoidance into CNA
> locking/qspinlock: Introduce the shuffle reduction optimization into
> CNA
>
> .../admin-guide/kernel-parameters.txt | 18 +
> arch/arm/include/asm/mcs_spinlock.h | 6 +-
> arch/x86/Kconfig | 20 +
> arch/x86/include/asm/qspinlock.h | 4 +
> arch/x86/kernel/alternative.c | 4 +
> include/asm-generic/mcs_spinlock.h | 4 +-
> kernel/locking/mcs_spinlock.h | 20 +-
> kernel/locking/qspinlock.c | 82 +++-
> kernel/locking/qspinlock_cna.h | 400 ++++++++++++++++++
> kernel/locking/qspinlock_paravirt.h | 2 +-
> 10 files changed, 537 insertions(+), 23 deletions(-)
> create mode 100644 kernel/locking/qspinlock_cna.h
>
I have reviewed this patch series. Besides a few minor nits, the rest
looks solid to me. So you can add my review tag.
Reviewed-by: Waiman Long <longman@redhat.com>
Peter and Will, would you mind taking a look to see if you have anything
to add?
Thanks,
Longman
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 0/5] Add NUMA-awareness to qspinlock
2019-12-30 19:40 [PATCH v8 0/5] Add NUMA-awareness to qspinlock Alex Kogan
` (5 preceding siblings ...)
2020-01-06 15:48 ` [PATCH v8 0/5] Add NUMA-awareness to qspinlock Waiman Long
@ 2020-01-08 5:09 ` Shijith Thotton
2020-01-21 9:21 ` Shijith Thotton
6 siblings, 1 reply; 39+ messages in thread
From: Shijith Thotton @ 2020-01-08 5:09 UTC (permalink / raw)
To: Alex Kogan, will.deacon
Cc: linux-arch, arnd, peterz, dave.dice, x86, guohanjun, linux,
steven.sistare, linux-kernel, mingo, bp, hpa, longman, tglx,
daniel.m.jordan, linux-arm-kernel
Hi Will,
On Mon, Dec 30, 2019 at 02:40:37PM -0500, Alex Kogan wrote:
> Minor changes from v7 based on feedback from Longman:
> -----------------------------------------------------
>
> - Move __init functions from alternative.c to qspinlock_cna.h
>
> - Introduce enum for return values from cna_pre_scan(), for better
> readability.
>
> - Add/revise a few comments to improve readability.
>
>
> Summary
> -------
>
> Lock throughput can be increased by handing a lock to a waiter on the
> same NUMA node as the lock holder, provided care is taken to avoid
> starvation of waiters on other NUMA nodes. This patch introduces CNA
> (compact NUMA-aware lock) as the slow path for qspinlock. It is
> enabled through a configuration option (NUMA_AWARE_SPINLOCKS).
>
> CNA is a NUMA-aware version of the MCS lock. Spinning threads are
> organized in two queues, a main queue for threads running on the same
> node as the current lock holder, and a secondary queue for threads
> running on other nodes. Threads store the ID of the node on which
> they are running in their queue nodes. After acquiring the MCS lock and
> before acquiring the spinlock, the lock holder scans the main queue
> looking for a thread running on the same node (pre-scan). If found (call
> it thread T), all threads in the main queue between the current lock
> holder and T are moved to the end of the secondary queue. If such T
> is not found, we make another scan of the main queue after acquiring
> the spinlock when unlocking the MCS lock (post-scan), starting at the
> node where pre-scan stopped. If both scans fail to find such T, the
> MCS lock is passed to the first thread in the secondary queue. If the
> secondary queue is empty, the MCS lock is passed to the next thread in the
> main queue. To avoid starvation of threads in the secondary queue, those
> threads are moved back to the head of the main queue after a certain
> number of intra-node lock hand-offs.
>
> More details are available at https://arxiv.org/abs/1810.05600.
>
> The series applies on top of v5.5.0-rc2, commit ea200dec51.
> Performance numbers are available in previous revisions
> of the series.
>
> Further comments are welcome and appreciated.
>
> Alex Kogan (5):
> locking/qspinlock: Rename mcs lock/unlock macros and make them more
> generic
> locking/qspinlock: Refactor the qspinlock slow path
> locking/qspinlock: Introduce CNA into the slow path of qspinlock
> locking/qspinlock: Introduce starvation avoidance into CNA
> locking/qspinlock: Introduce the shuffle reduction optimization into
> CNA
>
> .../admin-guide/kernel-parameters.txt | 18 +
> arch/arm/include/asm/mcs_spinlock.h | 6 +-
> arch/x86/Kconfig | 20 +
> arch/x86/include/asm/qspinlock.h | 4 +
> arch/x86/kernel/alternative.c | 4 +
> include/asm-generic/mcs_spinlock.h | 4 +-
> kernel/locking/mcs_spinlock.h | 20 +-
> kernel/locking/qspinlock.c | 82 +++-
> kernel/locking/qspinlock_cna.h | 400 ++++++++++++++++++
> kernel/locking/qspinlock_paravirt.h | 2 +-
> 10 files changed, 537 insertions(+), 23 deletions(-)
> create mode 100644 kernel/locking/qspinlock_cna.h
>
> --
> 2.21.0 (Apple Git-122.2)
>
Tried out queued spinlock slowpath improvements on arm64 (ThunderX2) by
hardwiring CNA APIs to queued_spin_lock_slowpath() and numbers are pretty
good with the CNA changes.
Speed-up on v5.5-rc4 kernel:
will-it-scale/open1_threads:
#thr speed-up
1 1.00
2 0.97
4 0.98
8 1.02
16 0.95
32 1.63
64 1.70
128 2.09
224 2.16
will-it-scale/lock2_threads:
#thr speed-up
1 0.98
2 0.99
4 0.90
8 0.98
16 0.99
32 1.52
64 2.31
128 2.25
224 2.04
#thr - number of threads
speed-up - number with CNA patch / number with stock kernel
Please share your thoughts on best way to enable this series on arm64.
Thanks,
Shijith
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread
* Re: [PATCH v8 0/5] Add NUMA-awareness to qspinlock
2020-01-08 5:09 ` Shijith Thotton
@ 2020-01-21 9:21 ` Shijith Thotton
0 siblings, 0 replies; 39+ messages in thread
From: Shijith Thotton @ 2020-01-21 9:21 UTC (permalink / raw)
To: Alex Kogan, will, catalin.marinas
Cc: linux-arch, arnd, peterz, dave.dice, x86, guohanjun, linux,
steven.sistare, linux-kernel, mingo, bp, hpa, longman, tglx,
daniel.m.jordan, linux-arm-kernel
Hi Will/Catalin,
On Wed, Jan 08, 2020 at 05:09:05AM +0000, Shijith Thotton wrote:
> On Mon, Dec 30, 2019 at 02:40:37PM -0500, Alex Kogan wrote:
> > Minor changes from v7 based on feedback from Longman:
> > -----------------------------------------------------
> >
> > - Move __init functions from alternative.c to qspinlock_cna.h
> >
> > - Introduce enum for return values from cna_pre_scan(), for better
> > readability.
> >
> > - Add/revise a few comments to improve readability.
> >
> >
> > Summary
> > -------
> >
> > Lock throughput can be increased by handing a lock to a waiter on the
> > same NUMA node as the lock holder, provided care is taken to avoid
> > starvation of waiters on other NUMA nodes. This patch introduces CNA
> > (compact NUMA-aware lock) as the slow path for qspinlock. It is
> > enabled through a configuration option (NUMA_AWARE_SPINLOCKS).
> >
> > CNA is a NUMA-aware version of the MCS lock. Spinning threads are
> > organized in two queues, a main queue for threads running on the same
> > node as the current lock holder, and a secondary queue for threads
> > running on other nodes. Threads store the ID of the node on which
> > they are running in their queue nodes. After acquiring the MCS lock and
> > before acquiring the spinlock, the lock holder scans the main queue
> > looking for a thread running on the same node (pre-scan). If found (call
> > it thread T), all threads in the main queue between the current lock
> > holder and T are moved to the end of the secondary queue. If such T
> > is not found, we make another scan of the main queue after acquiring
> > the spinlock when unlocking the MCS lock (post-scan), starting at the
> > node where pre-scan stopped. If both scans fail to find such T, the
> > MCS lock is passed to the first thread in the secondary queue. If the
> > secondary queue is empty, the MCS lock is passed to the next thread in the
> > main queue. To avoid starvation of threads in the secondary queue, those
> > threads are moved back to the head of the main queue after a certain
> > number of intra-node lock hand-offs.
> >
> > More details are available at https://arxiv.org/abs/1810.05600.
> >
> > The series applies on top of v5.5.0-rc2, commit ea200dec51.
> > Performance numbers are available in previous revisions
> > of the series.
> >
> > Further comments are welcome and appreciated.
> >
> > Alex Kogan (5):
> > locking/qspinlock: Rename mcs lock/unlock macros and make them more
> > generic
> > locking/qspinlock: Refactor the qspinlock slow path
> > locking/qspinlock: Introduce CNA into the slow path of qspinlock
> > locking/qspinlock: Introduce starvation avoidance into CNA
> > locking/qspinlock: Introduce the shuffle reduction optimization into
> > CNA
> >
> > .../admin-guide/kernel-parameters.txt | 18 +
> > arch/arm/include/asm/mcs_spinlock.h | 6 +-
> > arch/x86/Kconfig | 20 +
> > arch/x86/include/asm/qspinlock.h | 4 +
> > arch/x86/kernel/alternative.c | 4 +
> > include/asm-generic/mcs_spinlock.h | 4 +-
> > kernel/locking/mcs_spinlock.h | 20 +-
> > kernel/locking/qspinlock.c | 82 +++-
> > kernel/locking/qspinlock_cna.h | 400 ++++++++++++++++++
> > kernel/locking/qspinlock_paravirt.h | 2 +-
> > 10 files changed, 537 insertions(+), 23 deletions(-)
> > create mode 100644 kernel/locking/qspinlock_cna.h
> >
> > --
> > 2.21.0 (Apple Git-122.2)
> >
>
> Tried out queued spinlock slowpath improvements on arm64 (ThunderX2) by
> hardwiring CNA APIs to queued_spin_lock_slowpath() and numbers are pretty
> good with the CNA changes.
>
> Speed-up on v5.5-rc4 kernel:
>
> will-it-scale/open1_threads:
> #thr speed-up
> 1 1.00
> 2 0.97
> 4 0.98
> 8 1.02
> 16 0.95
> 32 1.63
> 64 1.70
> 128 2.09
> 224 2.16
>
> will-it-scale/lock2_threads:
> #thr speed-up
> 1 0.98
> 2 0.99
> 4 0.90
> 8 0.98
> 16 0.99
> 32 1.52
> 64 2.31
> 128 2.25
> 224 2.04
>
> #thr - number of threads
> speed-up - number with CNA patch / number with stock kernel
>
> Please share your thoughts on the best way to enable this series on arm64.
Please comment if you got a chance to look at this.
Thanks,
Shijith
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
^ permalink raw reply [flat|nested] 39+ messages in thread