* [RFC][PATCH 00/13] percpu rwsem -v2
@ 2015-06-22 12:16 Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
` (16 more replies)
0 siblings, 17 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
This is a derived work of the cpu hotplug lock rework I did in 2013 which never
really went anywhere because Linus didn't like it.
This applies those same optimizations to the percpu-rwsem. Seeing how we did
all the work it seemed a waste to not use it at all. Linus still didn't like it
because there was only a single user, there are two now:
- uprobes
- cgroups
This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
user.
Also, since Linus thinks lglocks is a failed locking primitive (which I whole
heartedly agree with, its preempt-disable latencies are an abomination), it
also converts the global part of fs/locks's usage of lglock over to a
percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
another (4th) percpu-rwsem user and removes an lglock user.
It further removes the stop_machine lglock usage, and with it kills lglocks.
Changes since -v1:
- Added the missing smp_load_acquire()/smp_store_release() as spotted by Oleg
- Added percpu_down_read_trylock()
- Convert cpu hotplug lock
- Convert fs/locks
- Removes lglock from stop_machine
- Removes lglock
---
Documentation/locking/lglock.txt | 166 -------------------------
fs/Kconfig | 1 +
fs/file_table.c | 1 -
fs/locks.c | 65 +++++++---
include/linux/cpu.h | 6 +
include/linux/lglock.h | 81 -------------
include/linux/percpu-rwsem.h | 96 +++++++++++++--
include/linux/sched.h | 9 +-
init/main.c | 1 +
kernel/cpu.c | 130 ++++++--------------
kernel/fork.c | 2 +
kernel/locking/Makefile | 1 -
kernel/locking/lglock.c | 111 -----------------
kernel/locking/percpu-rwsem.c | 255 +++++++++++++++++++++------------------
kernel/rcu/Makefile | 2 +-
kernel/stop_machine.c | 52 ++++----
lib/Kconfig | 10 ++
17 files changed, 371 insertions(+), 618 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
` (15 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: oleg_nesterov-2_rcu-create_rcu_sync_infrastructure.patch --]
[-- Type: text/plain, Size: 5875 bytes --]
It is functionally equivalent to
struct rcu_sync_struct {
atomic_t counter;
};
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
return atomic_read(&rss->counter) == 0;
}
static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
{
atomic_inc(&rss->counter);
synchronize_sched();
}
static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
{
synchronize_sched();
atomic_dec(&rss->counter);
}
except: it records the state and synchronize_sched() is only called by
rcu_sync_enter() and only if necessary.
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/rcusync.h | 64 ++++++++++++++++++++++++++++
kernel/rcu/Makefile | 2
kernel/rcu/sync.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 173 insertions(+), 1 deletion(-)
--- /dev/null
+++ b/include/linux/rcusync.h
@@ -0,0 +1,64 @@
+#ifndef _LINUX_RCUSYNC_H_
+#define _LINUX_RCUSYNC_H_
+
+#include <linux/wait.h>
+#include <linux/rcupdate.h>
+
+struct rcu_sync_struct {
+ int gp_state;
+ int gp_count;
+ wait_queue_head_t gp_wait;
+
+ int cb_state;
+ struct rcu_head cb_head;
+
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+};
+
+#define ___RCU_SYNC_INIT(name) \
+ .gp_state = 0, \
+ .gp_count = 0, \
+ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .cb_state = 0
+
+#define __RCU_SCHED_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_sched, \
+ .call = call_rcu_sched, \
+}
+
+#define __RCU_BH_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_rcu_bh, \
+ .call = call_rcu_bh, \
+}
+
+#define __RCU_SYNC_INIT(name) { \
+ ___RCU_SYNC_INIT(name), \
+ .sync = synchronize_rcu, \
+ .call = call_rcu, \
+}
+
+#define DEFINE_RCU_SCHED_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
+
+#define DEFINE_RCU_BH_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
+
+#define DEFINE_RCU_SYNC(name) \
+ struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
+
+static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+ return !rss->gp_state; /* GP_IDLE */
+}
+
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
+extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_enter(struct rcu_sync_struct *);
+extern void rcu_sync_exit(struct rcu_sync_struct *);
+
+#endif /* _LINUX_RCUSYNC_H_ */
+
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,108 @@
+
+#include <linux/rcusync.h>
+#include <linux/sched.h>
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define rss_lock gp_wait.lock
+
+void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+{
+ memset(rss, 0, sizeof(*rss));
+ init_waitqueue_head(&rss->gp_wait);
+
+ switch (type) {
+ case RCU_SYNC:
+ rss->sync = synchronize_rcu;
+ rss->call = call_rcu;
+ break;
+
+ case RCU_SCHED_SYNC:
+ rss->sync = synchronize_sched;
+ rss->call = call_rcu_sched;
+ break;
+
+ case RCU_BH_SYNC:
+ rss->sync = synchronize_rcu_bh;
+ rss->call = call_rcu_bh;
+ break;
+ }
+}
+
+void rcu_sync_enter(struct rcu_sync_struct *rss)
+{
+ bool need_wait, need_sync;
+
+ spin_lock_irq(&rss->rss_lock);
+ need_wait = rss->gp_count++;
+ need_sync = rss->gp_state == GP_IDLE;
+ if (need_sync)
+ rss->gp_state = GP_PENDING;
+ spin_unlock_irq(&rss->rss_lock);
+
+ BUG_ON(need_wait && need_sync);
+
+ if (need_sync) {
+ rss->sync();
+ rss->gp_state = GP_PASSED;
+ wake_up_all(&rss->gp_wait);
+ } else if (need_wait) {
+ wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ } else {
+ /*
+ * Possible when there's a pending CB from a rcu_sync_exit().
+ * Nobody has yet been allowed the 'fast' path and thus we can
+ * avoid doing any sync(). The callback will get 'dropped'.
+ */
+ BUG_ON(rss->gp_state != GP_PASSED);
+ }
+}
+
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+ struct rcu_sync_struct *rss =
+ container_of(rcu, struct rcu_sync_struct, cb_head);
+ unsigned long flags;
+
+
+ BUG_ON(rss->gp_state != GP_PASSED);
+ BUG_ON(rss->cb_state == CB_IDLE);
+
+ spin_lock_irqsave(&rss->rss_lock, flags);
+ if (rss->gp_count) {
+ /*
+ * A new rcu_sync_enter() has happened; drop the callback.
+ */
+ rss->cb_state = CB_IDLE;
+ } else if (rss->cb_state == CB_REPLAY) {
+ /*
+ * A new rcu_sync_exit() has happened; requeue the callback
+ * to catch a later GP.
+ */
+ rss->cb_state = CB_PENDING;
+ rss->call(&rss->cb_head, rcu_sync_func);
+ } else {
+ /*
+ * We're at least a GP after rcu_sync_exit(); everybody will now
+ * have observed the write side critical section. Let 'em rip!
+ */
+ rss->cb_state = CB_IDLE;
+ rss->gp_state = GP_IDLE;
+ }
+ spin_unlock_irqrestore(&rss->rss_lock, flags);
+}
+
+void rcu_sync_exit(struct rcu_sync_struct *rss)
+{
+ spin_lock_irq(&rss->rss_lock);
+ if (!--rss->gp_count) {
+ if (rss->cb_state == CB_IDLE) {
+ rss->cb_state = CB_PENDING;
+ rss->call(&rss->cb_head, rcu_sync_func);
+ } else if (rss->cb_state == CB_PENDING) {
+ rss->cb_state = CB_REPLAY;
+ }
+ }
+ spin_unlock_irq(&rss->rss_lock);
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
` (14 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: oleg_nesterov-4_rcusync-introduce_struct_rcu_sync_ops.patch --]
[-- Type: text/plain, Size: 5102 bytes --]
Add the new struct rcu_sync_ops which holds sync/call methods, and
turn the function pointers in rcu_sync_struct into an array of struct
rcu_sync_ops.
This simplifies the "init" helpers, and this way it is simpler to add
the new methods we need, especially ifdef'ed.
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/rcusync.h | 60 ++++++++++++++++++------------------------------
kernel/rcu/sync.c | 43 +++++++++++++++++-----------------
2 files changed, 45 insertions(+), 58 deletions(-)
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -4,6 +4,8 @@
#include <linux/wait.h>
#include <linux/rcupdate.h>
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
struct rcu_sync_struct {
int gp_state;
int gp_count;
@@ -12,53 +14,37 @@ struct rcu_sync_struct {
int cb_state;
struct rcu_head cb_head;
- void (*sync)(void);
- void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ enum rcu_sync_type gp_type;
};
-#define ___RCU_SYNC_INIT(name) \
- .gp_state = 0, \
- .gp_count = 0, \
- .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
- .cb_state = 0
-
-#define __RCU_SCHED_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_sched, \
- .call = call_rcu_sched, \
-}
-
-#define __RCU_BH_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_rcu_bh, \
- .call = call_rcu_bh, \
-}
-
-#define __RCU_SYNC_INIT(name) { \
- ___RCU_SYNC_INIT(name), \
- .sync = synchronize_rcu, \
- .call = call_rcu, \
-}
-
-#define DEFINE_RCU_SCHED_SYNC(name) \
- struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
-
-#define DEFINE_RCU_BH_SYNC(name) \
- struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
-
-#define DEFINE_RCU_SYNC(name) \
- struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
-
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
return !rss->gp_state; /* GP_IDLE */
}
-enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
-
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);
+#define __RCU_SYNC_INITIALIZER(name, type) { \
+ .gp_state = 0, \
+ .gp_count = 0, \
+ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .cb_state = 0, \
+ .gp_type = type, \
+ }
+
+#define __DEFINE_RCU_SYNC(name, type) \
+ struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+
+#define DEFINE_RCU_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_SYNC)
+
+#define DEFINE_RCU_SCHED_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+
+#define DEFINE_RCU_BH_SYNC(name) \
+ __DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+
#endif /* _LINUX_RCUSYNC_H_ */
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,7 +1,24 @@
-
#include <linux/rcusync.h>
#include <linux/sched.h>
+static const struct {
+ void (*sync)(void);
+ void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+} gp_ops[] = {
+ [RCU_SYNC] = {
+ .sync = synchronize_rcu,
+ .call = call_rcu,
+ },
+ [RCU_SCHED_SYNC] = {
+ .sync = synchronize_sched,
+ .call = call_rcu_sched,
+ },
+ [RCU_BH_SYNC] = {
+ .sync = synchronize_rcu_bh,
+ .call = call_rcu_bh,
+ },
+};
+
enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
@@ -11,23 +28,7 @@ void rcu_sync_init(struct rcu_sync_struc
{
memset(rss, 0, sizeof(*rss));
init_waitqueue_head(&rss->gp_wait);
-
- switch (type) {
- case RCU_SYNC:
- rss->sync = synchronize_rcu;
- rss->call = call_rcu;
- break;
-
- case RCU_SCHED_SYNC:
- rss->sync = synchronize_sched;
- rss->call = call_rcu_sched;
- break;
-
- case RCU_BH_SYNC:
- rss->sync = synchronize_rcu_bh;
- rss->call = call_rcu_bh;
- break;
- }
+ rss->gp_type = type;
}
void rcu_sync_enter(struct rcu_sync_struct *rss)
@@ -44,7 +45,7 @@ void rcu_sync_enter(struct rcu_sync_stru
BUG_ON(need_wait && need_sync);
if (need_sync) {
- rss->sync();
+ gp_ops[rss->gp_type].sync();
rss->gp_state = GP_PASSED;
wake_up_all(&rss->gp_wait);
} else if (need_wait) {
@@ -81,7 +82,7 @@ static void rcu_sync_func(struct rcu_hea
* to catch a later GP.
*/
rss->cb_state = CB_PENDING;
- rss->call(&rss->cb_head, rcu_sync_func);
+ gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
} else {
/*
 * We're at least a GP after rcu_sync_exit(); everybody will now
@@ -99,7 +100,7 @@ void rcu_sync_exit(struct rcu_sync_struc
if (!--rss->gp_count) {
if (rss->cb_state == CB_IDLE) {
rss->cb_state = CB_PENDING;
- rss->call(&rss->cb_head, rcu_sync_func);
+ gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
} else if (rss->cb_state == CB_PENDING) {
rss->cb_state = CB_REPLAY;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
` (13 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: oleg_nesterov-5_rcusync-add_the_config_prove_rcu_checks.patch --]
[-- Type: text/plain, Size: 2438 bytes --]
It would be nice to validate that the caller of rcu_sync_is_idle()
holds the corresponding type of RCU read-side lock. Add the new
rcu_sync_ops->held() method and change rcu_sync_is_idle() to
WARN() if it returns false.
This obviously penalizes the readers (fast-path), but only if
CONFIG_PROVE_RCU.
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/rcusync.h | 6 ++++++
kernel/rcu/sync.c | 21 +++++++++++++++++++++
2 files changed, 27 insertions(+)
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -17,9 +17,15 @@ struct rcu_sync_struct {
enum rcu_sync_type gp_type;
};
+extern bool __rcu_sync_is_idle(struct rcu_sync_struct *);
+
static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
{
+#ifdef CONFIG_PROVE_RCU
+ return __rcu_sync_is_idle(rss);
+#else
return !rss->gp_state; /* GP_IDLE */
+#endif
}
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,21 +1,33 @@
#include <linux/rcusync.h>
#include <linux/sched.h>
+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func) .held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+#ifdef CONFIG_PROVE_RCU
+ int (*held)(void);
+#endif
} gp_ops[] = {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
+ __INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
+ __INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
+ __INIT_HELD(rcu_read_lock_bh_held)
},
};
@@ -24,6 +36,15 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLA
#define rss_lock gp_wait.lock
+#ifdef CONFIG_PROVE_RCU
+bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+ WARN_ON(!gp_ops[rss->gp_type].held());
+ return rss->gp_state == GP_IDLE;
+}
+EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
+#endif
+
void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
{
memset(rss, 0, sizeof(*rss));
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor()
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (2 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
` (12 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: oleg_nesterov-6_rcusync-introduce_rcu_sync_dtor.patch --]
[-- Type: text/plain, Size: 2278 bytes --]
Add the new rcu_sync_ops->wait() method and the new helper,
rcu_sync_dtor().
It is needed if you are going to, say, kfree(rcu_sync_object).
It simply calls ops->wait() to "flush" the potentially pending
rcu callback.
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/rcusync.h | 1 +
kernel/rcu/sync.c | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+)
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -31,6 +31,7 @@ static inline bool rcu_sync_is_idle(stru
extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);
+extern void rcu_sync_dtor(struct rcu_sync_struct *);
#define __RCU_SYNC_INITIALIZER(name, type) { \
.gp_state = 0, \
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,6 +10,7 @@
static const struct {
void (*sync)(void);
void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+ void (*wait)(void);
#ifdef CONFIG_PROVE_RCU
int (*held)(void);
#endif
@@ -17,16 +18,19 @@ static const struct {
[RCU_SYNC] = {
.sync = synchronize_rcu,
.call = call_rcu,
+ .wait = rcu_barrier,
__INIT_HELD(rcu_read_lock_held)
},
[RCU_SCHED_SYNC] = {
.sync = synchronize_sched,
.call = call_rcu_sched,
+ .wait = rcu_barrier_sched,
__INIT_HELD(rcu_read_lock_sched_held)
},
[RCU_BH_SYNC] = {
.sync = synchronize_rcu_bh,
.call = call_rcu_bh,
+ .wait = rcu_barrier_bh,
__INIT_HELD(rcu_read_lock_bh_held)
},
};
@@ -128,3 +132,21 @@ void rcu_sync_exit(struct rcu_sync_struc
}
spin_unlock_irq(&rss->rss_lock);
}
+
+void rcu_sync_dtor(struct rcu_sync_struct *rss)
+{
+ int cb_state;
+
+ BUG_ON(rss->gp_count);
+
+ spin_lock_irq(&rss->rss_lock);
+ if (rss->cb_state == CB_REPLAY)
+ rss->cb_state = CB_PENDING;
+ cb_state = rss->cb_state;
+ spin_unlock_irq(&rss->rss_lock);
+
+ if (cb_state != CB_IDLE) {
+ gp_ops[rss->gp_type].wait();
+ BUG_ON(rss->cb_state != CB_IDLE);
+ }
+}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (3 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 23:02 ` Oleg Nesterov
2015-06-23 7:28 ` Nicholas Mc Guire
2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
` (11 subsequent siblings)
16 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-zijlstra-opt-percpu-rwsem.patch --]
[-- Type: text/plain, Size: 13552 bytes --]
Currently the percpu-rwsem has two issues:
- it switches to (global) atomic ops while a writer is waiting;
which could be quite a while and slows down releasing the readers.
- it employs synchronize_sched_expedited() _twice_ which is evil and
should die -- it shoots IPIs around the machine.
This patch cures the first problem by ordering the reader-state vs
reader-count (see the comments in __percpu_down_read() and
percpu_down_write()). This changes a global atomic op into a full
memory barrier, which doesn't have the global cacheline contention.
It cures the second problem by employing the rcu-sync primitives by
Oleg which reduces to no sync_sched() calls in the 'normal' case of
no write contention -- global locks had better be rare, and has a
maximum of one sync_sched() call in case of contention.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 62 +++++++++-
kernel/locking/percpu-rwsem.c | 243 ++++++++++++++++++++++--------------------
2 files changed, 182 insertions(+), 123 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,18 +5,64 @@
#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/wait.h>
+#include <linux/rcusync.h>
#include <linux/lockdep.h>
struct percpu_rw_semaphore {
- unsigned int __percpu *fast_read_ctr;
- atomic_t write_ctr;
+ unsigned int __percpu *refcount;
+ int state;
+ struct rcu_sync_struct rss;
+ wait_queue_head_t writer;
struct rw_semaphore rw_sem;
- atomic_t slow_read_ctr;
- wait_queue_head_t write_waitq;
};
-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ might_sleep();
+
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+ preempt_disable();
+ /*
+ * We are in an RCU-sched read-side critical section, so the writer
+ * cannot both change sem->state from readers_slow and start
+ * checking counters while we are here. So if we see !sem->state,
+ * we know that the writer won't be checking until we are past the
+ * preempt_enable() and that once the synchronize_sched() is done, the
+ * writer will see anything we did within this RCU-sched read-side
+ * critical section.
+ */
+ __this_cpu_inc(*sem->refcount);
+ if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ __percpu_down_read(sem); /* Unconditional memory barrier. */
+ preempt_enable();
+ /*
+ * The barrier() from preempt_enable() prevents the compiler from
+ * bleeding the critical section out.
+ */
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ /*
+ * The barrier() in preempt_disable() prevents the compiler from
+ * bleeding the critical section out.
+ */
+ preempt_disable();
+ /*
+ * Same as in percpu_down_read().
+ */
+ if (likely(rcu_sync_is_idle(&sem->rss)))
+ __this_cpu_dec(*sem->refcount);
+ else
+ __percpu_up_read(sem); /* Unconditional memory barrier. */
+ preempt_enable();
+
+ rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);
@@ -25,10 +71,10 @@ extern int __percpu_init_rwsem(struct pe
const char *, struct lock_class_key *);
extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
-#define percpu_init_rwsem(brw) \
+#define percpu_init_rwsem(sem) \
({ \
static struct lock_class_key rwsem_key; \
- __percpu_init_rwsem(brw, #brw, &rwsem_key); \
+ __percpu_init_rwsem(sem, #sem, &rwsem_key); \
})
#endif
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,158 +8,171 @@
#include <linux/sched.h>
#include <linux/errno.h>
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+enum { readers_slow, readers_block };
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->refcount = alloc_percpu(unsigned int);
+ if (unlikely(!sem->refcount))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- atomic_set(&brw->write_ctr, 0);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ sem->state = readers_slow;
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ init_waitqueue_head(&sem->writer);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+
return 0;
}
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->refcount);
+ sem->refcount = NULL; /* catch use after free bugs */
}
-/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- * R_W: down_write() comes after up_read(), the writer should see all
- * changes done by the reader
- * or
- * W_R: down_read() comes after up_write(), the reader should see all
- * changes done by the writer
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+void __percpu_down_read(struct percpu_rw_semaphore *sem)
{
- bool success = false;
+ /*
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
+ *
+ * And yes, if the reader misses the writer's assignment of
+ * readers_block to sem->state, then the writer is
+ * guaranteed to see the reader's increment. Conversely, any
+ * readers that increment their sem->refcount after the
+ * writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->refcount, so that it doesn't matter
+ * that the writer missed them.
+ */
+
+ smp_mb(); /* A matches D */
+
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(smp_load_acquire(&sem->state) != readers_block))
+ return;
+
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
+
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
+
+ /*
+ * Avoid lockdep for the down/up_read() we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ __this_cpu_inc(*sem->refcount);
+ __up_read(&sem->rw_sem);
preempt_disable();
- if (likely(!atomic_read(&brw->write_ctr))) {
- __this_cpu_add(*brw->fast_read_ctr, val);
- success = true;
- }
- preempt_enable();
+}
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ this_cpu_dec(*sem->refcount);
- return success;
+ /* Prod writer to recheck readers_active */
+ wake_up(&sem->writer);
}
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
+ * Return true if the modular sum of the sem->refcount per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
*/
-void percpu_down_read(struct percpu_rw_semaphore *brw)
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- might_sleep();
- if (likely(update_fast_ctr(brw, +1))) {
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
- return;
- }
+ if (per_cpu_sum(*sem->refcount) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
+
+ smp_mb(); /* C matches B */
- down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- /* avoid up_read()->rwsem_release() */
- __up_read(&brw->rw_sem);
+ return true;
}
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+ down_write(&sem->rw_sem);
- if (likely(update_fast_ctr(brw, -1)))
- return;
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
-}
+ /*
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
+ */
+ sem->state = readers_block;
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
- unsigned int sum = 0;
- int cpu;
+ smp_mb(); /* D matches A */
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
+ /*
+ * If they don't see our write of readers_block to sem->state,
+ * then we are guaranteed to see their sem->refcount increment, and
+ * therefore will wait for them.
+ */
- return sum;
+ /* Wait for all now active readers to complete. */
+ wait_event(sem->writer, readers_active_check(sem));
}
-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* tell update_fast_ctr() there is a pending writer */
- atomic_inc(&brw->write_ctr);
/*
- * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
- * so that update_fast_ctr() can't succeed.
+ * Signal the writer is done, no fast path yet.
*
- * 2. Ensures we see the result of every previous this_cpu_add() in
- * update_fast_ctr().
+ * One reason that we cannot just immediately flip to readers_slow is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
*
- * 3. Ensures that if any reader has exited its critical section via
- * fast-path, it executes a full memory barrier before we return.
- * See R_W case in the comment above update_fast_ctr().
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
*/
- synchronize_sched_expedited();
+ smp_store_release(&sem->state, readers_slow);
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
-
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
-
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
-}
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);
-void percpu_up_write(struct percpu_rw_semaphore *brw)
-{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
/*
- * Insert the barrier before the next fast-path in down_read,
- * see W_R case in the comment above update_fast_ctr().
+ * Once this completes (at least one RCU grace period hence) the reader
+ * fast path will be available again. Safe to use outside the exclusive
+ * write lock because it's counting.
*/
- synchronize_sched_expedited();
- /* the last writer unblocks update_fast_ctr() */
- atomic_dec(&brw->write_ctr);
+ rcu_sync_exit(&sem->rss);
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock()
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (4 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 23:08 ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 07/13] sched: Reorder task_struct Peter Zijlstra
` (10 subsequent siblings)
16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-percpu-rwsem-trydown.patch --]
[-- Type: text/plain, Size: 1711 bytes --]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 17 +++++++++++++++++
kernel/locking/percpu-rwsem.c | 12 ++++++++++++
2 files changed, 29 insertions(+)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -17,6 +17,7 @@ struct percpu_rw_semaphore {
};
extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -45,6 +46,22 @@ static inline void percpu_down_read(stru
*/
}
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ bool ret = true;
+
+ preempt_disable();
+ __this_cpu_inc(*sem->refcount);
+ if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ if (ret)
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+ return ret;
+}
+
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -80,6 +80,18 @@ void __percpu_down_read(struct percpu_rw
preempt_disable();
}
+bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* A matches D */
+
+ if (likely(smp_load_acquire(&sem->state) != readers_block))
+ return true;
+
+ __percpu_up_read(sem);
+
+ return false;
+}
+
void __percpu_up_read(struct percpu_rw_semaphore *sem)
{
smp_mb(); /* B matches C */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 07/13] sched: Reorder task_struct
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (5 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM Peter Zijlstra
` (9 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-sched-reorder.patch --]
[-- Type: text/plain, Size: 850 bytes --]
Fill some 4 byte holes by slightly re-ordering some variables.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/sched.h | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1363,17 +1363,16 @@ struct task_struct {
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
+ int on_rq;
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
+ int wake_cpu;
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts;
-
- int wake_cpu;
#endif
- int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (6 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 07/13] sched: Reorder task_struct Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
` (8 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-percpu-rwsem-static-init.patch --]
[-- Type: text/plain, Size: 1160 bytes --]
Provide a static init
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/percpu-rwsem.h | 13 +++++++++++++
1 file changed, 13 insertions(+)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,19 @@ struct percpu_rw_semaphore {
struct rw_semaphore rw_sem;
};
+#define DEFINE_STATIC_PERCPU_RWSEM(name) \
+static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name); \
+static struct percpu_rw_semaphore name = { \
+ .refcount = &__percpu_rwsem_refcount_##name, \
+ .state = 0, \
+ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
+ .writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
+ .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
+}
+
+#define lockdep_assert_held_percpu_rwsem(sem) \
+ lockdep_assert_held(&(sem)->rw_sem)
+
extern void __percpu_down_read(struct percpu_rw_semaphore *);
extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (7 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 22:57 ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem Peter Zijlstra
` (7 subsequent siblings)
16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-hotplug-rwsem.patch --]
[-- Type: text/plain, Size: 8424 bytes --]
The cpu hotplug lock is a rwsem with read-in-write and read-in-read
recursion. Implement it as such.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
include/linux/cpu.h | 6 +
include/linux/percpu-rwsem.h | 10 ++-
include/linux/sched.h | 4 +
init/main.c | 1
kernel/cpu.c | 133 +++++++++++++------------------------------
kernel/fork.c | 2
lib/Kconfig | 5 +
7 files changed, 66 insertions(+), 95 deletions(-)
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -224,6 +224,9 @@ extern struct bus_type cpu_subsys;
#ifdef CONFIG_HOTPLUG_CPU
/* Stop CPUs going up and down. */
+extern void cpu_hotplug_init(void);
+extern void cpu_hotplug_init_task(struct task_struct *p);
+
extern void cpu_hotplug_begin(void);
extern void cpu_hotplug_done(void);
extern void get_online_cpus(void);
@@ -242,6 +245,9 @@ int cpu_down(unsigned int cpu);
#else /* CONFIG_HOTPLUG_CPU */
+static inline void cpu_hotplug_init(void) {}
+static inline void cpu_hotplug_init_task(struct task_struct *p) {}
+
static inline void cpu_hotplug_begin(void) {}
static inline void cpu_hotplug_done(void) {}
#define get_online_cpus() do { } while (0)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -20,12 +20,10 @@ extern void __percpu_down_read(struct pe
extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
{
might_sleep();
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
-
preempt_disable();
/*
* We are in an RCU-sched read-side critical section, so the writer
@@ -46,6 +44,12 @@ static inline void percpu_down_read(stru
*/
}
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ _percpu_down_read(sem);
+}
+
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
bool ret = true;
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,10 @@ struct task_struct {
unsigned int btrace_seq;
#endif
+#ifdef CONFIG_HOTPLUG_CPU
+ int cpuhp_ref;
+#endif
+
unsigned int policy;
int nr_cpus_allowed;
cpumask_t cpus_allowed;
--- a/init/main.c
+++ b/init/main.c
@@ -588,6 +588,7 @@ asmlinkage __visible void __init start_k
sched_clock_postinit();
perf_event_init();
profile_init();
+ cpu_hotplug_init();
call_function_init();
WARN(!irqs_disabled(), "Interrupts were enabled early\n");
early_boot_irqs_disabled = false;
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,6 +22,7 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <trace/events/power.h>
+#include <linux/percpu-rwsem.h>
#include "smpboot.h"
@@ -50,7 +51,8 @@ EXPORT_SYMBOL(cpu_notifier_register_done
static RAW_NOTIFIER_HEAD(cpu_chain);
-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
*/
static int cpu_hotplug_disabled;
@@ -58,126 +60,72 @@ static int cpu_hotplug_disabled;
#ifdef CONFIG_HOTPLUG_CPU
static struct {
- struct task_struct *active_writer;
- /* wait queue to wake up the active_writer */
- wait_queue_head_t wq;
- /* verifies that no writer will get active while readers are active */
- struct mutex lock;
- /*
- * Also blocks the new readers during
- * an ongoing cpu hotplug operation.
- */
- atomic_t refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
- .active_writer = NULL,
- .wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
- .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- .dep_map = {.name = "cpu_hotplug.lock" },
-#endif
-};
-
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
- lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+ struct percpu_rw_semaphore rwsem;
+ struct task_struct *writer;
+} cpu_hotplug = { .writer = &init_task, };
+
+void cpu_hotplug_init(void)
+{
+ percpu_init_rwsem(&cpu_hotplug.rwsem);
+ cpu_hotplug.writer = NULL;
+}
+void cpu_hotplug_init_task(struct task_struct *p)
+{
+ p->cpuhp_ref = 0;
+}
void get_online_cpus(void)
{
might_sleep();
- if (cpu_hotplug.active_writer == current)
+
+ /* read in write recursion */
+ if (cpu_hotplug.writer == current)
+ return;
+
+ /* read in read recursion */
+ if (current->cpuhp_ref++)
return;
- cpuhp_lock_acquire_read();
- mutex_lock(&cpu_hotplug.lock);
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
+
+ lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+ _percpu_down_read(&cpu_hotplug.rwsem);
}
EXPORT_SYMBOL_GPL(get_online_cpus);
bool try_get_online_cpus(void)
{
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug.writer == current)
return true;
- if (!mutex_trylock(&cpu_hotplug.lock))
- return false;
- cpuhp_lock_acquire_tryread();
- atomic_inc(&cpu_hotplug.refcount);
- mutex_unlock(&cpu_hotplug.lock);
- return true;
+
+ if (current->cpuhp_ref++)
+ return true;
+
+ return percpu_down_read_trylock(&cpu_hotplug.rwsem);
}
EXPORT_SYMBOL_GPL(try_get_online_cpus);
void put_online_cpus(void)
{
- int refcount;
-
- if (cpu_hotplug.active_writer == current)
+ if (cpu_hotplug.writer == current)
return;
- refcount = atomic_dec_return(&cpu_hotplug.refcount);
- if (WARN_ON(refcount < 0)) /* try to fix things up */
- atomic_inc(&cpu_hotplug.refcount);
-
- if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
- wake_up(&cpu_hotplug.wq);
-
- cpuhp_lock_release();
+ if (--current->cpuhp_ref)
+ return;
+ percpu_up_read(&cpu_hotplug.rwsem);
}
EXPORT_SYMBOL_GPL(put_online_cpus);
-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- * writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- * non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
void cpu_hotplug_begin(void)
{
- DEFINE_WAIT(wait);
-
- cpu_hotplug.active_writer = current;
- cpuhp_lock_acquire();
-
- for (;;) {
- mutex_lock(&cpu_hotplug.lock);
- prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
- if (likely(!atomic_read(&cpu_hotplug.refcount)))
- break;
- mutex_unlock(&cpu_hotplug.lock);
- schedule();
- }
- finish_wait(&cpu_hotplug.wq, &wait);
+ percpu_down_write(&cpu_hotplug.rwsem);
+ cpu_hotplug.writer = current;
}
void cpu_hotplug_done(void)
{
- cpu_hotplug.active_writer = NULL;
- mutex_unlock(&cpu_hotplug.lock);
- cpuhp_lock_release();
+ cpu_hotplug.writer = NULL;
+ percpu_up_write(&cpu_hotplug.rwsem);
}
/*
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
p->sequential_io_avg = 0;
#endif
+ cpu_hotplug_init_task(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -56,6 +56,11 @@ config STMP_DEVICE
config PERCPU_RWSEM
bool
+config PERCPU_RWSEM_HOTPLUG
+ def_bool y
+ depends on HOTPLUG_CPU
+ select PERCPU_RWSEM
+
config ARCH_USE_CMPXCHG_LOCKREF
bool
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (8 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
` (6 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-fslocks-rwsem.patch --]
[-- Type: text/plain, Size: 5096 bytes --]
Replace the global part of the lglock with a percpu-rwsem.
Since fcl_lock is a spinlock and itself nests under i_lock, which too
is a spinlock we cannot acquire sleeping locks at
locks_{insert,remove}_global_locks().
We can however wrap all fcl_lock acquisitions with percpu_down_read
such that all invocations of locks_{insert,remove}_global_locks() have
that read lock held.
This allows us to replace the lg_global part of the lglock with the
write side of the rwsem.
In the absence of writers, percpu_{down,up}_read() are free of atomic
instructions. This further avoids the very long preempt-disable
regions caused by lglock on larger machines.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
fs/locks.c | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -165,6 +165,7 @@ int lease_break_time = 45;
*/
DEFINE_STATIC_LGLOCK(file_lock_lglock);
static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+static struct percpu_rw_semaphore file_rwsem;
/*
* The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -556,6 +557,8 @@ static int posix_same_owner(struct file_
/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
+ lockdep_assert_held_percpu_rwsem(&file_rwsem);
+
lg_local_lock(&file_lock_lglock);
fl->fl_link_cpu = smp_processor_id();
hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
@@ -565,6 +568,8 @@ static void locks_insert_global_locks(st
/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
+ lockdep_assert_held_percpu_rwsem(&file_rwsem);
+
/*
* Avoid taking lock if already unhashed. This is safe since this check
* is done while holding the flc_lock, and new insertions into the list
@@ -885,6 +890,7 @@ static int flock_lock_file(struct file *
return -ENOMEM;
}
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
@@ -925,6 +931,7 @@ static int flock_lock_file(struct file *
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -960,6 +967,7 @@ static int __posix_lock_file(struct inod
new_fl2 = locks_alloc_lock();
}
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1131,6 +1139,7 @@ static int __posix_lock_file(struct inod
}
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
/*
* Free any unused locks.
*/
@@ -1407,6 +1416,7 @@ int __break_lease(struct inode *inode, u
return error;
}
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1477,6 +1487,7 @@ int __break_lease(struct inode *inode, u
}
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1630,6 +1641,7 @@ generic_add_lease(struct file *filp, lon
return -EINVAL;
}
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1700,6 +1712,7 @@ generic_add_lease(struct file *filp, lon
lease->fl_lmops->lm_setup(lease, priv);
out:
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
if (is_deleg)
mutex_unlock(&inode->i_mutex);
@@ -1722,6 +1735,7 @@ static int generic_delete_lease(struct f
return error;
}
+ percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (fl->fl_file == filp &&
@@ -1734,6 +1748,7 @@ static int generic_delete_lease(struct f
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
+ percpu_up_read(&file_rwsem);
locks_dispose_list(&dispose);
return error;
}
@@ -2634,6 +2649,7 @@ static void *locks_start(struct seq_file
struct locks_iterator *iter = f->private;
iter->li_pos = *pos + 1;
+ percpu_down_write(&file_rwsem);
lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
@@ -2652,6 +2668,7 @@ static void locks_stop(struct seq_file *
{
spin_unlock(&blocked_lock_lock);
lg_global_unlock(&file_lock_lglock);
+ percpu_up_write(&file_rwsem);
}
static const struct seq_operations locks_seq_operations = {
@@ -2693,6 +2710,7 @@ static int __init filelock_init(void)
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+ percpu_init_rwsem(&file_rwsem);
for_each_possible_cpu(i)
INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (9 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-23 0:19 ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
` (5 subsequent siblings)
16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-fslocks-remove-lg.patch --]
[-- Type: text/plain, Size: 4580 bytes --]
As Oleg suggested, replace file_lock_list with a structure containing
the hlist head and a spinlock.
This completely removes the lglock from fs/locks.
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
fs/Kconfig | 1 +
fs/locks.c | 47 +++++++++++++++++++++++++++++------------------
2 files changed, 30 insertions(+), 18 deletions(-)
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -65,6 +65,7 @@ config EXPORTFS
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
+ select PERCPU_RWSEM
help
This option enables standard file locking support, required
for filesystems like NFS and for the flock() system
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -128,7 +128,6 @@
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
-#include <linux/lglock.h>
#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>
@@ -159,12 +158,17 @@ int lease_break_time = 45;
/*
* The global file_lock_list is only used for displaying /proc/locks, so we
- * keep a list on each CPU, with each list protected by its own spinlock via
- * the file_lock_lglock. Note that alterations to the list also require that
- * the relevant flc_lock is held.
+ * keep a list on each CPU, with each list protected by its own spinlock.
+ * Global serialization is done using file_rwsem.
+ *
+ * Note that alterations to the list also require that the relevant flc_lock is
+ * held.
*/
-DEFINE_STATIC_LGLOCK(file_lock_lglock);
-static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+struct file_lock_list_struct {
+ spinlock_t lock;
+ struct hlist_head hlist;
+};
+static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
static struct percpu_rw_semaphore file_rwsem;
/*
@@ -557,17 +561,21 @@ static int posix_same_owner(struct file_
/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
+ struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
+
lockdep_assert_held_percpu_rwsem(&file_rwsem);
- lg_local_lock(&file_lock_lglock);
+ spin_lock(&fll->lock);
fl->fl_link_cpu = smp_processor_id();
- hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
- lg_local_unlock(&file_lock_lglock);
+ hlist_add_head(&fl->fl_link, &fll->hlist);
+ spin_unlock(&fll->lock);
}
/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
+ struct file_lock_list_struct *fll;
+
lockdep_assert_held_percpu_rwsem(&file_rwsem);
/*
@@ -577,9 +585,11 @@ static void locks_delete_global_locks(st
*/
if (hlist_unhashed(&fl->fl_link))
return;
- lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+
+ fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+ spin_lock(&fll->lock);
hlist_del_init(&fl->fl_link);
- lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+ spin_unlock(&fll->lock);
}
static unsigned long
@@ -2650,9 +2660,8 @@ static void *locks_start(struct seq_file
iter->li_pos = *pos + 1;
percpu_down_write(&file_rwsem);
- lg_global_lock(&file_lock_lglock);
spin_lock(&blocked_lock_lock);
- return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+ return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
}
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
@@ -2660,14 +2669,13 @@ static void *locks_next(struct seq_file
struct locks_iterator *iter = f->private;
++iter->li_pos;
- return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
+ return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}
static void locks_stop(struct seq_file *f, void *v)
__releases(&blocked_lock_lock)
{
spin_unlock(&blocked_lock_lock);
- lg_global_unlock(&file_lock_lglock);
percpu_up_write(&file_rwsem);
}
@@ -2709,11 +2717,14 @@ static int __init filelock_init(void)
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
- lg_lock_init(&file_lock_lglock, "file_lock_lglock");
percpu_init_rwsem(&file_rwsem);
- for_each_possible_cpu(i)
- INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+ for_each_possible_cpu(i) {
+ struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
+
+ spin_lock_init(&fll->lock);
+ INIT_HLIST_HEAD(&fll->hlist);
+ }
return 0;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (10 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 22:21 ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 13/13] locking: " Peter Zijlstra
` (4 subsequent siblings)
16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-stop_machine-lg-removal.patch --]
[-- Type: text/plain, Size: 4198 bytes --]
We can replace both the global and local part of the lglock by better
usage of cpu_stopper::lock.
By having stop_two_cpus() acquire two cpu_stopper::locks we gain full
order against the global stop_machine which takes each of these locks
in order.
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/stop_machine.c | 52 ++++++++++++++++++++++++++++----------------------
lib/Kconfig | 5 ++++
2 files changed, 35 insertions(+), 22 deletions(-)
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,6 @@
#include <linux/kallsyms.h>
#include <linux/smpboot.h>
#include <linux/atomic.h>
-#include <linux/lglock.h>
/*
* Structure to determine completion condition and record errors. May
@@ -44,14 +43,6 @@ static DEFINE_PER_CPU(struct cpu_stopper
static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
static bool stop_machine_initialized = false;
-/*
- * Avoids a race between stop_two_cpus and global stop_cpus, where
- * the stoppers could get queued up in reverse order, leading to
- * system deadlock. Using an lglock means stop_two_cpus remains
- * relatively cheap.
- */
-DEFINE_STATIC_LGLOCK(stop_cpus_lock);
-
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
{
memset(done, 0, sizeof(*done));
@@ -71,21 +62,26 @@ static void cpu_stop_signal_done(struct
}
/* queue @work to @stopper. if offline, @work is completed immediately */
-static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+static void __cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
- unsigned long flags;
-
- spin_lock_irqsave(&stopper->lock, flags);
-
if (stopper->enabled) {
list_add_tail(&work->list, &stopper->works);
wake_up_process(p);
- } else
+ } else {
cpu_stop_signal_done(work->done, false);
+ }
+}
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+ struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&stopper->lock, flags);
+ __cpu_stop_queue_work(cpu, work);
spin_unlock_irqrestore(&stopper->lock, flags);
}
@@ -224,9 +220,14 @@ static int multi_cpu_stop(void *data)
*/
int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
{
- struct cpu_stop_done done;
+ struct cpu_stopper *stopper1, *stopper2;
struct cpu_stop_work work1, work2;
struct multi_stop_data msdata;
+ struct cpu_stop_done done;
+ unsigned long flags;
+
+ if (cpu2 < cpu1)
+ swap(cpu1, cpu2);
preempt_disable();
msdata = (struct multi_stop_data){
@@ -258,10 +259,17 @@ int stop_two_cpus(unsigned int cpu1, uns
return -ENOENT;
}
- lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
- cpu_stop_queue_work(cpu1, &work1);
- cpu_stop_queue_work(cpu2, &work2);
- lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+ stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+ stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+
+ spin_lock_irqsave(&stopper1->lock, flags);
+ spin_lock(&stopper2->lock);
+
+ __cpu_stop_queue_work(cpu1, &work1);
+ __cpu_stop_queue_work(cpu2, &work2);
+
+ spin_unlock(&stopper2->lock);
+ spin_unlock_irqrestore(&stopper1->lock, flags);
preempt_enable();
@@ -315,10 +323,10 @@ static void queue_stop_cpus_work(const s
* preempted by a stopper which might wait for other stoppers
* to enter @fn which can lead to deadlock.
*/
- lg_global_lock(&stop_cpus_lock);
+ preempt_disable();
for_each_cpu(cpu, cpumask)
cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
- lg_global_unlock(&stop_cpus_lock);
+ preempt_enable();
}
static int __stop_cpus(const struct cpumask *cpumask,
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -61,6 +61,11 @@ config PERCPU_RWSEM_HOTPLUG
depends on HOTPLUG_CPU
select PERCPU_RWSEM
+config PERCPU_RWSEM_SMP
+ def_bool y
+ depends on SMP
+ select PERCPU_RWSEM
+
config ARCH_USE_CMPXCHG_LOCKREF
bool
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* [RFC][PATCH 13/13] locking: Remove lglock
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (11 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
2015-06-22 12:36 ` [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (3 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds
[-- Attachment #1: peterz-remove_lglock.patch --]
[-- Type: text/plain, Size: 13502 bytes --]
Since there are no users left of this primitive, make it go away.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
Documentation/locking/lglock.txt | 166 ---------------------------------------
fs/file_table.c | 1
include/linux/lglock.h | 81 -------------------
kernel/locking/Makefile | 1
kernel/locking/lglock.c | 111 --------------------------
5 files changed, 360 deletions(-)
--- a/Documentation/locking/lglock.txt
+++ /dev/null
@@ -1,166 +0,0 @@
-lglock - local/global locks for mostly local access patterns
-------------------------------------------------------------
-
-Origin: Nick Piggin's VFS scalability series introduced during
- 2.6.35++ [1] [2]
-Location: kernel/locking/lglock.c
- include/linux/lglock.h
-Users: currently only the VFS and stop_machine related code
-
-Design Goal:
-------------
-
-Improve scalability of globally used large data sets that are
-distributed over all CPUs as per_cpu elements.
-
-To manage global data structures that are partitioned over all CPUs
-as per_cpu elements but can be mostly handled by CPU local actions
-lglock will be used where the majority of accesses are cpu local
-reading and occasional cpu local writing with very infrequent
-global write access.
-
-
-* deal with things locally whenever possible
- - very fast access to the local per_cpu data
- - reasonably fast access to specific per_cpu data on a different
- CPU
-* while making global action possible when needed
- - by expensive access to all CPUs locks - effectively
- resulting in a globally visible critical section.
-
-Design:
--------
-
-Basically it is an array of per_cpu spinlocks with the
-lg_local_lock/unlock accessing the local CPUs lock object and the
-lg_local_lock_cpu/unlock_cpu accessing a remote CPUs lock object
-the lg_local_lock has to disable preemption as migration protection so
-that the reference to the local CPUs lock does not go out of scope.
-Due to the lg_local_lock/unlock only touching cpu-local resources it
-is fast. Taking the local lock on a different CPU will be more
-expensive but still relatively cheap.
-
-One can relax the migration constraints by acquiring the current
-CPUs lock with lg_local_lock_cpu, remember the cpu, and release that
-lock at the end of the critical section even if migrated. This should
-give most of the performance benefits without inhibiting migration
-though needs careful considerations for nesting of lglocks and
-consideration of deadlocks with lg_global_lock.
-
-The lg_global_lock/unlock locks all underlying spinlocks of all
-possible CPUs (including those off-line). The preemption disable/enable
-are needed in the non-RT kernels to prevent deadlocks like:
-
- on cpu 1
-
- task A task B
- lg_global_lock
- got cpu 0 lock
- <<<< preempt <<<<
- lg_local_lock_cpu for cpu 0
- spin on cpu 0 lock
-
-On -RT this deadlock scenario is resolved by the arch_spin_locks in the
-lglocks being replaced by rt_mutexes which resolve the above deadlock
-by boosting the lock-holder.
-
-
-Implementation:
----------------
-
-The initial lglock implementation from Nick Piggin used some complex
-macros to generate the lglock/brlock in lglock.h - they were later
-turned into a set of functions by Andi Kleen [7]. The change to functions
-was motivated by the presence of multiple lock users and also by them
-being easier to maintain than the generating macros. This change to
-functions is also the basis to eliminated the restriction of not
-being initializeable in kernel modules (the remaining problem is that
-locks are not explicitly initialized - see lockdep-design.txt)
-
-Declaration and initialization:
--------------------------------
-
- #include <linux/lglock.h>
-
- DEFINE_LGLOCK(name)
- or:
- DEFINE_STATIC_LGLOCK(name);
-
- lg_lock_init(&name, "lockdep_name_string");
-
- on UP this is mapped to DEFINE_SPINLOCK(name) in both cases, note
- also that as of 3.18-rc6 all declaration in use are of the _STATIC_
- variant (and it seems that the non-static was never in use).
- lg_lock_init is initializing the lockdep map only.
-
-Usage:
-------
-
-From the locking semantics it is a spinlock. It could be called a
-locality aware spinlock. lg_local_* behaves like a per_cpu
-spinlock and lg_global_* like a global spinlock.
-No surprises in the API.
-
- lg_local_lock(*lglock);
- access to protected per_cpu object on this CPU
- lg_local_unlock(*lglock);
-
- lg_local_lock_cpu(*lglock, cpu);
- access to protected per_cpu object on other CPU cpu
- lg_local_unlock_cpu(*lglock, cpu);
-
- lg_global_lock(*lglock);
- access all protected per_cpu objects on all CPUs
- lg_global_unlock(*lglock);
-
- There are no _trylock variants of the lglocks.
-
-Note that the lg_global_lock/unlock has to iterate over all possible
-CPUs rather than the actually present CPUs or a CPU could go off-line
-with a held lock [4] and that makes it very expensive. A discussion on
-these issues can be found at [5]
-
-Constraints:
-------------
-
- * currently the declaration of lglocks in kernel modules is not
- possible, though this should be doable with little change.
- * lglocks are not recursive.
- * suitable for code that can do most operations on the CPU local
- data and will very rarely need the global lock
- * lg_global_lock/unlock is *very* expensive and does not scale
- * on UP systems all lg_* primitives are simply spinlocks
- * in PREEMPT_RT the spinlock becomes an rt-mutex and can sleep but
- does not change the tasks state while sleeping [6].
- * in PREEMPT_RT the preempt_disable/enable in lg_local_lock/unlock
- is downgraded to a migrate_disable/enable, the other
- preempt_disable/enable are downgraded to barriers [6].
- The deadlock noted for non-RT above is resolved due to rt_mutexes
- boosting the lock-holder in this case which arch_spin_locks do
- not do.
-
-lglocks were designed for very specific problems in the VFS and probably
-only are the right answer in these corner cases. Any new user that looks
-at lglocks probably wants to look at the seqlock and RCU alternatives as
-her first choice. There are also efforts to resolve the RCU issues that
-currently prevent using RCU in place of view remaining lglocks.
-
-Note on brlock history:
------------------------
-
-The 'Big Reader' read-write spinlocks were originally introduced by
-Ingo Molnar in 2000 (2.4/2.5 kernel series) and removed in 2003. They
-later were introduced by the VFS scalability patch set in 2.6 series
-again as the "big reader lock" brlock [2] variant of lglock which has
-been replaced by seqlock primitives or by RCU based primitives in the
-3.13 kernel series as was suggested in [3] in 2003. The brlock was
-entirely removed in the 3.13 kernel series.
-
-Link: 1 http://lkml.org/lkml/2010/8/2/81
-Link: 2 http://lwn.net/Articles/401738/
-Link: 3 http://lkml.org/lkml/2003/3/9/205
-Link: 4 https://lkml.org/lkml/2011/8/24/185
-Link: 5 http://lkml.org/lkml/2011/12/18/189
-Link: 6 https://www.kernel.org/pub/linux/kernel/projects/rt/
- patch series - lglocks-rt.patch.patch
-Link: 7 http://lkml.org/lkml/2012/3/5/26
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,6 @@
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
-#include <linux/lglock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
--- a/include/linux/lglock.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Specialised local-global spinlock. Can only be declared as global variables
- * to avoid overhead and keep things simple (and we don't want to start using
- * these inside dynamically allocated structures).
- *
- * "local/global locks" (lglocks) can be used to:
- *
- * - Provide fast exclusive access to per-CPU data, with exclusive access to
- * another CPU's data allowed but possibly subject to contention, and to
- * provide very slow exclusive access to all per-CPU data.
- * - Or to provide very fast and scalable read serialisation, and to provide
- * very slow exclusive serialisation of data (not necessarily per-CPU data).
- *
- * Brlocks are also implemented as a short-hand notation for the latter use
- * case.
- *
- * Copyright 2009, 2010, Nick Piggin, Novell Inc.
- */
-#ifndef __LINUX_LGLOCK_H
-#define __LINUX_LGLOCK_H
-
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
-#include <linux/percpu.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#define LOCKDEP_INIT_MAP lockdep_init_map
-#else
-#define LOCKDEP_INIT_MAP(a, b, c, d)
-#endif
-
-struct lglock {
- arch_spinlock_t __percpu *lock;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lock_class_key lock_key;
- struct lockdep_map lock_dep_map;
-#endif
-};
-
-#define DEFINE_LGLOCK(name) \
- static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
- = __ARCH_SPIN_LOCK_UNLOCKED; \
- struct lglock name = { .lock = &name ## _lock }
-
-#define DEFINE_STATIC_LGLOCK(name) \
- static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock) \
- = __ARCH_SPIN_LOCK_UNLOCKED; \
- static struct lglock name = { .lock = &name ## _lock }
-
-void lg_lock_init(struct lglock *lg, char *name);
-
-void lg_local_lock(struct lglock *lg);
-void lg_local_unlock(struct lglock *lg);
-void lg_local_lock_cpu(struct lglock *lg, int cpu);
-void lg_local_unlock_cpu(struct lglock *lg, int cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
-
-void lg_global_lock(struct lglock *lg);
-void lg_global_unlock(struct lglock *lg);
-
-#else
-/* When !CONFIG_SMP, map lglock to spinlock */
-#define lglock spinlock
-#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name)
-#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name)
-#define lg_lock_init(lg, name) spin_lock_init(lg)
-#define lg_local_lock spin_lock
-#define lg_local_unlock spin_unlock
-#define lg_local_lock_cpu(lg, cpu) spin_lock(lg)
-#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg)
-#define lg_global_lock spin_lock
-#define lg_global_unlock spin_unlock
-#endif
-
-#endif
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
endif
obj-$(CONFIG_SMP) += spinlock.o
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
-obj-$(CONFIG_SMP) += lglock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/* See include/linux/lglock.h for description */
-#include <linux/module.h>
-#include <linux/lglock.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-
-/*
- * Note there is no uninit, so lglocks cannot be defined in
- * modules (but it's fine to use them from there)
- * Could be added though, just undo lg_lock_init
- */
-
-void lg_lock_init(struct lglock *lg, char *name)
-{
- LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
-}
-EXPORT_SYMBOL(lg_lock_init);
-
-void lg_local_lock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock);
-
-void lg_local_unlock(struct lglock *lg)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = this_cpu_ptr(lg->lock);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock);
-
-void lg_local_lock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock_cpu);
-
-void lg_local_unlock_cpu(struct lglock *lg, int cpu)
-{
- arch_spinlock_t *lock;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- lock = per_cpu_ptr(lg->lock, cpu);
- arch_spin_unlock(lock);
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock_cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
-{
- BUG_ON(cpu1 == cpu2);
-
- /* lock in cpu order, just like lg_global_lock */
- if (cpu2 < cpu1)
- swap(cpu1, cpu2);
-
- preempt_disable();
- lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
-}
-
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
-{
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
- arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
- preempt_enable();
-}
-
-void lg_global_lock(struct lglock *lg)
-{
- int i;
-
- preempt_disable();
- lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_lock(lock);
- }
-}
-EXPORT_SYMBOL(lg_global_lock);
-
-void lg_global_unlock(struct lglock *lg)
-{
- int i;
-
- lock_release(&lg->lock_dep_map, 1, _RET_IP_);
- for_each_possible_cpu(i) {
- arch_spinlock_t *lock;
- lock = per_cpu_ptr(lg->lock, i);
- arch_spin_unlock(lock);
- }
- preempt_enable();
-}
-EXPORT_SYMBOL(lg_global_unlock);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (12 preceding siblings ...)
2015-06-22 12:16 ` [RFC][PATCH 13/13] locking: " Peter Zijlstra
@ 2015-06-22 12:36 ` Peter Zijlstra
2015-06-22 18:11 ` Daniel Wagner
` (2 subsequent siblings)
16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:36 UTC (permalink / raw)
To: oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
I forgot to re-instate "From: Oleg Nesterov" on the first 4 patches.
Sorry about that. I'll take more care with a next posting.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (13 preceding siblings ...)
2015-06-22 12:36 ` [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
@ 2015-06-22 18:11 ` Daniel Wagner
2015-06-22 19:05 ` Peter Zijlstra
2015-06-22 20:06 ` Linus Torvalds
2015-06-23 16:10 ` Davidlohr Bueso
16 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-06-22 18:11 UTC (permalink / raw)
To: Peter Zijlstra, oleg, paulmck
Cc: tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/22/2015 02:16 PM, Peter Zijlstra wrote:
> Also, since Linus thinks lglocks is a failed locking primitive (which I whole
> heartedly agree with, its preempt-disable latencies are an abomination), it
> also converts the global part of fs/locks's usage of lglock over to a
> percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
> another (4th) percpu-rwsem users and removes an lglock user.
I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
These microbenches exercise the fs' locks a bit.
I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
kernel boots fine and doesn't explode... so far...
The results aren't looking too bad. Though building a kernel with 'make -j200'
was extremely slow. I'll look into it tomorrow.
https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary
flock01
mean variance sigma max min
4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
flock02
mean variance sigma max min
4.1.0 7.0197 1.1812 1.0868 10.6188 5.1706
percpu-rwsem 9.3194 1.3443 1.1594 11.5902 6.6138
lease01
mean variance sigma max min
4.1.0 41.8361 23.8462 4.8833 51.3493 28.5859
percpu-rwsem 40.2738 20.8323 4.5642 49.6037 28.0704
lease02
mean variance sigma max min
4.1.0 71.2159 12.7763 3.5744 77.8432 58.0390
percpu-rwsem 71.4312 14.7688 3.8430 76.5036 57.8615
posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
posix02
mean variance sigma max min
4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
posix03
mean variance sigma max min
4.1.0 0.9121 0.0000 0.0000 0.9121 0.9121
percpu-rwsem 0.9379 0.0000 0.0000 0.9379 0.9379
posix04
mean variance sigma max min
4.1.0 0.0703 0.0044 0.0664 0.6764 0.0437
percpu-rwsem 0.0675 0.0007 0.0267 0.3236 0.0491
cheers,
daniel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-22 18:11 ` Daniel Wagner
@ 2015-06-22 19:05 ` Peter Zijlstra
2015-06-23 9:35 ` Daniel Wagner
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 19:05 UTC (permalink / raw)
To: Daniel Wagner
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Mon, Jun 22, 2015 at 08:11:14PM +0200, Daniel Wagner wrote:
> On 06/22/2015 02:16 PM, Peter Zijlstra wrote:
> > Also, since Linus thinks lglocks is a failed locking primitive (which I whole
> > heartedly agree with, its preempt-disable latencies are an abomination), it
> > also converts the global part of fs/locks's usage of lglock over to a
> > percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
> > another (4th) percpu-rwsem users and removes an lglock user.
>
> I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
> These microbenches exercise the fs' locks a bit.
>
> I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
> kernel boots fine and doesn't explode... so far...
Its against tip/master, although I expect the locking/core bits that
were sent to Linus earlier today to be the biggest missing piece.
All I really did was build a kernel with lockdep enabled and boot +
build a kernel to see it didn't go belly up.
> The results aren't looking too bad. Though building a kernel with 'make -j200'
> was extremely slow. I'll look into it tomorrow.
>
> https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary
Sweet, I wasn't aware these existed. I'll go have a play.
> posix01
> mean variance sigma max min
> 4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
> percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
>
>
> posix02
> mean variance sigma max min
> 4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
> percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
>
These two seem to hurt, lemme go look at what they do.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (14 preceding siblings ...)
2015-06-22 18:11 ` Daniel Wagner
@ 2015-06-22 20:06 ` Linus Torvalds
2015-06-23 16:10 ` Davidlohr Bueso
16 siblings, 0 replies; 106+ messages in thread
From: Linus Torvalds @ 2015-06-22 20:06 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, Paul McKenney, Tejun Heo, Ingo Molnar,
Linux Kernel Mailing List, der.herr, Davidlohr Bueso,
Rik van Riel, Al Viro
On Mon, Jun 22, 2015 at 5:16 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> It further removes the stop_machine lglock usage, and with it kills lglocks.
Ok. With all the conversions, and removal of lglock, my dislike of
this goes away.
I'm somewhat worried about Daniel's report about "building a kernel
with 'make -j200' was extreme slow", but that may be due to something
else (does the machine have enough memory for "make -j200"? The kernel
compile parallelizes so well, and gcc uses so much memory, that you
need a *lot* of memory to use things like "-j200").
But assuming that gets sorted out, and somebody looks at the few file
locking performance issues, I have no objections to this series any
more.
Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
@ 2015-06-22 22:21 ` Oleg Nesterov
2015-06-23 10:09 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 22:21 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/22, Peter Zijlstra wrote:
>
> By having stop_two_cpus() acquire two cpu_stopper::locks we gain full
> order against the global stop_machine which takes each of these locks
> in order.
Yes, but stop_machine() locks/unlocks cpu_stopper->lock sequentially, it
never holds more than 1 ->lock, so
> +static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
> +{
> + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
> + unsigned long flags;
> +
> + spin_lock_irqsave(&stopper->lock, flags);
> + __cpu_stop_queue_work(cpu, work);
> spin_unlock_irqrestore(&stopper->lock, flags);
> }
...
> int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
> {
> - struct cpu_stop_done done;
> + struct cpu_stopper *stopper1, *stopper2;
> struct cpu_stop_work work1, work2;
> struct multi_stop_data msdata;
> + struct cpu_stop_done done;
> + unsigned long flags;
> +
> + if (cpu2 < cpu1)
> + swap(cpu1, cpu2);
...
> + stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
> + stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
> +
> + spin_lock_irqsave(&stopper1->lock, flags);
> + spin_lock(&stopper2->lock);
> +
> + __cpu_stop_queue_work(cpu1, &work1);
> + __cpu_stop_queue_work(cpu2, &work2);
Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
- stop_machine takes the lock on CPU 0, adds the work
and drops the lock
- cpu_stop_queue_work() queues both works
- stop_machine takes the lock on CPU 1, etc
In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
use different multi_stop_data's, so they will wait for each other
forever?
Oleg.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
@ 2015-06-22 22:57 ` Oleg Nesterov
2015-06-23 7:16 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 22:57 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/22, Peter Zijlstra wrote:
>
> The cpu hotplug lock is a rwsem with read-in-write and read-in-read
> recursion. Implement it as such.
And this patch fixes the problem afaics. Currently cpu_hotplug_begin()
can livelock because it doesn't stop the new readers. With this patch
this is no longer possible.
> -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> +static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
> {
> might_sleep();
>
> - rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
> -
> preempt_disable();
> /*
> * We are in an RCU-sched read-side critical section, so the writer
> @@ -46,6 +44,12 @@ static inline void percpu_down_read(stru
> */
> }
>
> +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> +{
> + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
> + _percpu_down_read(sem);
> +}
...
> void get_online_cpus(void)
> {
> might_sleep();
> - if (cpu_hotplug.active_writer == current)
> +
> + /* read in write recursion */
> + if (cpu_hotplug.writer == current)
> + return;
> +
> + /* read in read recursion */
> + if (current->cpuhp_ref++)
> return;
> - cpuhp_lock_acquire_read();
> - mutex_lock(&cpu_hotplug.lock);
> - atomic_inc(&cpu_hotplug.refcount);
> - mutex_unlock(&cpu_hotplug.lock);
> +
> + lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> + _percpu_down_read(&cpu_hotplug.rwsem);
> }
Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?
Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
> p->sequential_io_avg = 0;
> #endif
>
> + cpu_hotplug_init_task(p);
This is probably unnecessary, copy_process() should not be called under
get_online_cpus().
Oleg.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2015-06-22 23:02 ` Oleg Nesterov
2015-06-23 7:28 ` Nicholas Mc Guire
1 sibling, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 23:02 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/22, Peter Zijlstra wrote:
>
> +enum { readers_slow, readers_block };
I still think this enum doesn't make sense, and percpu_rw_semaphore->state
should be a boolean. But this is really minor and subjective.
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock()
2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
@ 2015-06-22 23:08 ` Oleg Nesterov
0 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 23:08 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/22, Peter Zijlstra wrote:
>
> +static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
> +{
> + bool ret = true;
> +
> + preempt_disable();
> + __this_cpu_inc(*sem->refcount);
> + if (unlikely(!rcu_sync_is_idle(&sem->rss)))
> + ret = __percpu_down_read_trylock(sem);
> + preempt_enable();
> +
> + if (ret)
> + rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
> +
> + return ret;
> +}
...
> +bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
> +{
> + smp_mb(); /* A matches D */
> +
> + if (likely(smp_load_acquire(&sem->state) != readers_block))
> + return true;
> +
> + __percpu_up_read(sem);
> +
> + return false;
> +}
Looks like we can slightly refactor this code to avoid the code
duplication. But this is minor too and we can do this later.
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock
2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
@ 2015-06-23 0:19 ` Oleg Nesterov
0 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 0:19 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
Off-topic question,
On 06/22, Peter Zijlstra wrote:
>
> @@ -2650,9 +2660,8 @@ static void *locks_start(struct seq_file
>
> iter->li_pos = *pos + 1;
> percpu_down_write(&file_rwsem);
> - lg_global_lock(&file_lock_lglock);
> spin_lock(&blocked_lock_lock);
> - return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
> + return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
> }
...
> static void locks_stop(struct seq_file *f, void *v)
> __releases(&blocked_lock_lock)
> {
> spin_unlock(&blocked_lock_lock);
With or without this patch, why locks_start/locks_stop need to take/drop
blocked_lock_lock ?
Oleg.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-22 22:57 ` Oleg Nesterov
@ 2015-06-23 7:16 ` Peter Zijlstra
2015-06-23 17:01 ` Oleg Nesterov
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 7:16 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > +
> > + lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > + _percpu_down_read(&cpu_hotplug.rwsem);
> > }
>
> Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> just use percpu_down_read() ?
>
> Yes, percpu_down_read() is not recursive, like the normal down_read().
> But this does not matter because we rely on ->cpuhp_ref anyway?
While we will not call the actual lock, lockdep will still get confused
by the inconsistent locking order observed.
Change it and boot, you'll find lockdep output pretty quickly.
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
> > p->sequential_io_avg = 0;
> > #endif
> >
> > + cpu_hotplug_init_task(p);
>
> This is probably unnecessary, copy_process() should not be called under
> get_online_cpus().
Probably true, in which case we could still use the callback to insert a
WARN_ON_ONCE(p->cpuhp_ref) :-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
2015-06-22 23:02 ` Oleg Nesterov
@ 2015-06-23 7:28 ` Nicholas Mc Guire
2015-06-25 19:08 ` Peter Zijlstra
1 sibling, 1 reply; 106+ messages in thread
From: Nicholas Mc Guire @ 2015-06-23 7:28 UTC (permalink / raw)
To: Peter Zijlstra
Cc: oleg, paulmck, tj, mingo, linux-kernel, dave, riel, viro, torvalds
A bit off-topic probably
but maybe this should not be in kernel/locking/percpu-rwsem.c but in a
generic percpu location as this construct is present in the core a few times
atleast in:
kernel/irq/irqdesc.c:kstat_irqs
kernel/fork.c:nr_processes
mm/memcontrol.c:mem_cgroup_read_events
mm/memcontrol.c:mem_cgroup_read_stat
> +
> +#define per_cpu_sum(var) \
> +({ \
> + typeof(var) __sum = 0; \
> + int cpu; \
> + for_each_possible_cpu(cpu) \
> + __sum += per_cpu(var, cpu); \
> + __sum; \
> +})
> +
so maybe put it into include/linux/percpu.h ?
thx!
hofrat
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at http://www.tux.org/lkml/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-22 19:05 ` Peter Zijlstra
@ 2015-06-23 9:35 ` Daniel Wagner
2015-06-23 10:00 ` Ingo Molnar
2015-06-23 14:34 ` Peter Zijlstra
0 siblings, 2 replies; 106+ messages in thread
From: Daniel Wagner @ 2015-06-23 9:35 UTC (permalink / raw)
To: Peter Zijlstra, Daniel Wagner
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, der.herr
On 06/22/2015 09:05 PM, Peter Zijlstra wrote:
> On Mon, Jun 22, 2015 at 08:11:14PM +0200, Daniel Wagner wrote:
>> On 06/22/2015 02:16 PM, Peter Zijlstra wrote:
>>> Also, since Linus thinks lglocks is a failed locking primitive (which I whole
>>> heartedly agree with, its preempt-disable latencies are an abomination), it
>>> also converts the global part of fs/locks's usage of lglock over to a
>>> percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
>>> another (4th) percpu-rwsem users and removes an lglock user.
>>
>> I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
>> These microbenches execercise the fs' locks a bit.
>>
>> I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
>> kernel boots fine and doesn't explode... so far...
>
> Its against tip/master, although I expect the locking/core bits that
> were sent to Linus earlier today to be the biggest missing piece.
>
> All I really did was build a kernel with lockdep enabled and boot +
> build a kernel to see it didn't go belly up.
>
>> The results aren't looking too bad. Though building a kernel with 'make -j200'
>> was extreme slow. I'll look into it tomorrow.
So this turns out to be a false alarm. I had icecream installed/activated
and that interfered with gcc. Stupid me.
The machine has 0.5TB memory and doesn't seem to be really concerned about
'make -j200'
make clean && time make -j200
mainline 4.1.0
2nd run
real 1m7.595s
user 28m43.125s
sys 3m48.189s
tip v4.1-2756-ge3d06bd
2nd run
real 1m6.871s
user 28m50.803s
sys 3m50.223s
3rd run
real 1m6.974s
user 28m52.093s
sys 3m50.259s
tip v4.1-2769-g6ce2591 (percpu-rwsem)
2nd run
real 1m7.847s
user 29m0.439s
sys 3m51.181s
3rd run
real 1m7.113s
user 29m3.127s
sys 3m51.516s
Compared to 'make -j64' on tip v4.1-2756-ge3d06bd
2nd run
real 1m7.605s
user 28m3.121s
sys 3m52.541s
>> https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary
>
> Sweet, I wasn't aware these existed. I'll go have a play.
>
>> posix01
>> mean variance sigma max min
>> 4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
>> percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
>>
>>
>> posix02
>> mean variance sigma max min
>> 4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
>> percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
>>
>
> These two seem to hurt, lemme go look at what they do.
Now here the same tests with tip and tip+percpu-rwsem. The patches
applied cleanly :)
I put all the raw data here[1] in case someone is interested. Some of the
tests behave a bit strangely, running extremely fast compared to the other runs.
That is probably the result of me trying to reduce the run time to the minimum.
flock01
mean variance sigma max min
4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
4.1.0+percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
tip 6.8390 329.3037 18.1467 81.0373 0.0021
tip+percpu-rwsem 10.0870 546.7435 23.3825 106.2396 0.0026
flock02
mean variance sigma max min
4.1.0 7.0197 1.1812 1.0868 10.6188 5.1706
4.1.0+percpu-rwsem 9.3194 1.3443 1.1594 11.5902 6.6138
tip 7.1057 1.6719 1.2930 11.2362 5.1434
tip+percpu-rwsem 9.0357 1.9874 1.4097 14.0254 6.4380
lease01
mean variance sigma max min
4.1.0 41.8361 23.8462 4.8833 51.3493 28.5859
4.1.0+percpu-rwsem 40.2738 20.8323 4.5642 49.6037 28.0704
tip 30.2617 13.0900 3.6180 36.6398 20.2085
tip+percpu-rwsem 31.2730 17.9787 4.2401 37.8981 19.2944
lease02
mean variance sigma max min
4.1.0 71.2159 12.7763 3.5744 77.8432 58.0390
4.1.0+percpu-rwsem 71.4312 14.7688 3.8430 76.5036 57.8615
tip 20.2019 5.2042 2.2813 23.1071 13.4647
tip+percpu-rwsem 20.8305 6.6631 2.5813 23.8034 11.2815
posix01
mean variance sigma max min
4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
4.1.0+percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
tip 129.2736 23752.7122 154.1191 474.0604 0.0063
tip+percpu-rwsem 142.6474 24732.1571 157.2646 468.7478 0.0072
posix02
mean variance sigma max min
4.1.0 12.7461 3.1802 1.7833 15.5411 8.1018
4.1.0+percpu-rwsem 16.2341 4.3038 2.0746 19.3271 11.1751
tip 13.2810 5.3958 2.3229 20.1243 8.9361
tip+percpu-rwsem 15.6802 4.7514 2.1798 21.5704 9.4074
posix03
mean variance sigma max min
4.1.0 0.9121 0.0000 0.0000 0.9121 0.9121
4.1.0+percpu-rwsem 0.9379 0.0000 0.0000 0.9379 0.9379
tip 0.8647 0.0009 0.0297 0.9274 0.7995
tip+percpu-rwsem 0.8147 0.0003 0.0161 0.8530 0.7824
posix04
mean variance sigma max min
4.1.0 0.0703 0.0044 0.0664 0.6764 0.0437
4.1.0+percpu-rwsem 0.0675 0.0007 0.0267 0.3236 0.0491
tip 0.0618 0.0027 0.0521 0.5642 0.0453
tip+percpu-rwsem 0.0658 0.0003 0.0175 0.1793 0.0493
cheers,
daniel
[1] http://monom.org/percpu-rwsem/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 9:35 ` Daniel Wagner
@ 2015-06-23 10:00 ` Ingo Molnar
2015-06-23 14:34 ` Peter Zijlstra
1 sibling, 0 replies; 106+ messages in thread
From: Ingo Molnar @ 2015-06-23 10:00 UTC (permalink / raw)
To: Daniel Wagner
Cc: Peter Zijlstra, oleg, paulmck, tj, mingo, linux-kernel, der.herr,
dave, riel, viro, torvalds
* Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:
> The machine has 0.5TB memory and doesn't seem to be really concerned about
> 'make -j200'
>
> make clean && time make -j200
>
> mainline 4.1.0
> 2nd run
> real 1m7.595s
> user 28m43.125s
> sys 3m48.189s
>
>
> tip v4.1-2756-ge3d06bd
> 2nd run
> real 1m6.871s
> user 28m50.803s
> sys 3m50.223s
> 3rd run
> real 1m6.974s
> user 28m52.093s
> sys 3m50.259s
>
>
> tip v4.1-2769-g6ce2591 (percpu-rwsem)
> 2nd run
> real 1m7.847s
> user 29m0.439s
> sys 3m51.181s
> 3rd run
> real 1m7.113s
> user 29m3.127s
> sys 3m51.516s
>
>
>
> Compared to 'make -j64' on tip v4.1-2756-ge3d06bd
> 2nd run
> real 1m7.605s
> user 28m3.121s
> sys 3m52.541s
Btw., instead of just listing the raw runs, you can get an automatic average and
stddev numbers with this:
$ perf stat --null --repeat 5 --pre 'make clean' --post 'sync' make -j200
Performance counter stats for 'make -j200' (3 runs):
29.068162979 seconds time elapsed ( +- 0.27% )
Thanks,
Ingo
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-22 22:21 ` Oleg Nesterov
@ 2015-06-23 10:09 ` Peter Zijlstra
2015-06-23 10:55 ` Peter Zijlstra
2015-06-23 16:20 ` Oleg Nesterov
0 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 10:09 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 12:21:52AM +0200, Oleg Nesterov wrote:
> Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
>
> - stop_machine takes the lock on CPU 0, adds the work
> and drops the lock
>
> - cpu_stop_queue_work() queues both works
cpu_stop_queue_work() only ever queues _1_ work.
> - stop_machine takes the lock on CPU 1, etc
>
> In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
> use different multi_stop_data's, so they will wait for each other
> forever?
So what you're saying is:
queue_stop_cpus_work() stop_two_cpus()
cpu_stop_queue_work(0,..);
spin_lock(0);
spin_lock(1);
__cpu_stop_queue_work(0,..);
__cpu_stop_queue_work(1,..);
spin_unlock(1);
spin_unlock(0);
cpu_stop_queue_work(1,..);
Indeed, I don't know what I was thinking...
We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 10:09 ` Peter Zijlstra
@ 2015-06-23 10:55 ` Peter Zijlstra
2015-06-23 11:20 ` Peter Zijlstra
2015-06-23 14:39 ` Paul E. McKenney
2015-06-23 16:20 ` Oleg Nesterov
1 sibling, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 10:55 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 12:09:32PM +0200, Peter Zijlstra wrote:
> We can of course slap a percpu-rwsem in, but I wonder if there's
> anything smarter we can do here.
Urgh, we cannot use percpu-rwsem here, because that would require
percpu_down_write_trylock(), and I'm not sure we can get around the
sync_sched() for that.
Now try_stop_cpus(), which requires the down_write_trylock() is used to
implement synchronize_sched_expedited().
Using sync_sched() to implement sync_sched_expedited would make me
happy, but it does somewhat defeat the purpose.
Also, I think _expedited is used too eagerly, look at this:
+void dm_sync_table(struct mapped_device *md)
+{
+ synchronize_srcu(&md->io_barrier);
+ synchronize_rcu_expedited();
+}
sync_srcu() is slow already, why then bother with an
sync_rcu_expedited() :/
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 10:55 ` Peter Zijlstra
@ 2015-06-23 11:20 ` Peter Zijlstra
2015-06-23 13:08 ` Peter Zijlstra
2015-06-23 14:39 ` Paul E. McKenney
1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 11:20 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 12:55:48PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 12:09:32PM +0200, Peter Zijlstra wrote:
> > We can of course slap a percpu-rwsem in, but I wonder if there's
> > anything smarter we can do here.
>
> Urgh, we cannot use percpu-rwsem here, because that would require
> percpu_down_write_trylock(), and I'm not sure we can get around the
> sync_sched() for that.
>
> Now try_stop_cpus(), which requires the down_write_trylock() is used to
> implement synchronize_sched_expedited().
>
> Using sync_sched() to implement sync_sched_expedited would make me
> happy, but it does somewhat defeat the purpose.
Paul, why does this use stop_machine anyway? I seemed to remember you
sending resched IPIs around.
The rcu_sched_qs() thing would set passed_quiesce, which you can then
collect to gauge progress.
Shooting IPIs around is bad enough, but running a full blown
stop_machine is really blunt and heavy.
Also, OMFG @ 74b51ee152b6 ("ACPI / osl: speedup grace period in
acpi_os_map_cleanup"), that's an expedited use to help the nVidiot
binary blob. WTF!!
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 11:20 ` Peter Zijlstra
@ 2015-06-23 13:08 ` Peter Zijlstra
2015-06-23 16:36 ` Oleg Nesterov
2015-06-23 17:30 ` Paul E. McKenney
0 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 13:08 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 01:20:41PM +0200, Peter Zijlstra wrote:
> Paul, why does this use stop_machine anyway? I seemed to remember you
> sending resched IPIs around.
>
> The rcu_sched_qs() thing would set passed_quiesce, which you can then
> collect to gauge progress.
>
> Shooting IPIs around is bad enough, but running a full blown
> stop_machine is really blunt and heavy.
Is there anything obviously amiss with the below? It does stop_one_cpu()
in a loop instead of the multi cpu stop_machine and is therefore much
friendlier (albeit still heavier than bare resched IPIs) since the CPUs
do not have to go and sync up.
After all, all we're really interested in is that each CPUs has
scheduled at least once, we do not care about the cross cpu syncup.
---
include/linux/stop_machine.h | 7 ----
kernel/rcu/tree.c | 99 +++++---------------------------------------
kernel/stop_machine.c | 30 --------------
3 files changed, 10 insertions(+), 126 deletions(-)
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index d2abbdb8c6aa..f992da7ee492 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -32,7 +32,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
struct cpu_stop_work *work_buf);
int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
#else /* CONFIG_SMP */
@@ -83,12 +82,6 @@ static inline int stop_cpus(const struct cpumask *cpumask,
return -ENOENT;
}
-static inline int try_stop_cpus(const struct cpumask *cpumask,
- cpu_stop_fn_t fn, void *arg)
-{
- return stop_cpus(cpumask, fn, arg);
-}
-
#endif /* CONFIG_SMP */
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..4a8cde155dce 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3257,7 +3257,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
* There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
+ * between the time that stop_one_cpu() is called and the
* time that it returns.
*
* In the current initial implementation of cpu_stop, the
@@ -3291,25 +3291,12 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* grace period. We are then done, so we use atomic_cmpxchg() to
* update sync_sched_expedited_done to match our snapshot -- but
* only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3319,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3327,17 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
/*
* Everyone up to our most recent fetch is covered by our grace
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..b1329a213503 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -371,36 +371,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
return ret;
}
-/**
- * try_stop_cpus - try to stop multiple cpus
- * @cpumask: cpus to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * @fn(@arg) was not executed at all because all cpus in @cpumask were
- * offline; otherwise, 0 if all executions of @fn returned 0, any non
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
- int ret;
-
- /* static works are used, process one request at a time */
- if (!mutex_trylock(&stop_cpus_mutex))
- return -EAGAIN;
- ret = __stop_cpus(cpumask, fn, arg);
- mutex_unlock(&stop_cpus_mutex);
- return ret;
-}
-
static int cpu_stop_should_run(unsigned int cpu)
{
struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
^ permalink raw reply related [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 9:35 ` Daniel Wagner
2015-06-23 10:00 ` Ingo Molnar
@ 2015-06-23 14:34 ` Peter Zijlstra
2015-06-23 14:56 ` Daniel Wagner
1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 14:34 UTC (permalink / raw)
To: Daniel Wagner
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, jlayton
On Tue, Jun 23, 2015 at 11:35:24AM +0200, Daniel Wagner wrote:
> flock01
> mean variance sigma max min
> 4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
> 4.1.0+percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
> tip 6.8390 329.3037 18.1467 81.0373 0.0021
> tip+percpu-rwsem 10.0870 546.7435 23.3825 106.2396 0.0026
> posix01
> mean variance sigma max min
> 4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
> 4.1.0+percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
> tip 129.2736 23752.7122 154.1191 474.0604 0.0063
> tip+percpu-rwsem 142.6474 24732.1571 157.2646 468.7478 0.0072
Both these tests are incredibly unstable for me (as well as for you it
appears). Variance is through the roof on them.
I get runtimes like:
root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a
0.266157011
root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a
139.303399960
That's not really inspiring, if I use bigger loop counts it more or less
settles, but then the EX is unusable because it ends up running 3000
seconds per test.
In any case, on a smaller box (ivb-ep) I got the below results:
posix01
mean variance sigma max min
data-4.1.0-02756-ge3d06bd 250.7032 40.4864 6.3629 263.7736 238.5192
data-4.1.0-02756-ge3d06bd-dirty 252.6847 35.8953 5.9913 270.1679 233.0215
Which looks better, but the difference is still well within the variance
and thus not significant.
Lemme continue playing with this for a bit more.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 10:55 ` Peter Zijlstra
2015-06-23 11:20 ` Peter Zijlstra
@ 2015-06-23 14:39 ` Paul E. McKenney
1 sibling, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 14:39 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 12:55:48PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 12:09:32PM +0200, Peter Zijlstra wrote:
> > We can of course slap a percpu-rwsem in, but I wonder if there's
> > anything smarter we can do here.
>
> Urgh, we cannot use percpu-rwsem here, because that would require
> percpu_down_write_trylock(), and I'm not sure we can get around the
> sync_sched() for that.
>
> Now try_stop_cpus(), which requires the down_write_trylock() is used to
> implement synchronize_sched_expedited().
>
> Using sync_sched() to implement sync_sched_expedited would make me
> happy, but it does somewhat defeat the purpose.
>
>
>
> Also, I think _expedited is used too eagerly, look at this:
>
> +void dm_sync_table(struct mapped_device *md)
> +{
> + synchronize_srcu(&md->io_barrier);
> + synchronize_rcu_expedited();
> +}
>
> sync_srcu() is slow already, why then bother with an
> sync_rcu_expedited() :/
Actually, this code was added in 2013, which was after the new variant of
synchronize_srcu(), which last I checked is reasonably fast in the common
case (no readers and not having tons of concurrent synchronize_srcu()
calls on the same srcu_struct), especially on systems with a small number
of CPUs, courtesy of srcu_read_lock()'s and srcu_read_unlock()'s read-side
memory barriers.
So synchronize_rcu() really would be expected to have quite a bit higher
latency than synchronize_srcu().
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 14:34 ` Peter Zijlstra
@ 2015-06-23 14:56 ` Daniel Wagner
2015-06-23 17:50 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-06-23 14:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, jlayton
On 06/23/2015 04:34 PM, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 11:35:24AM +0200, Daniel Wagner wrote:
>> flock01
>> mean variance sigma max min
>> 4.1.0 11.7075 816.3341 28.5716 125.6552 0.0021
>> 4.1.0+percpu-rwsem 11.4614 760.1345 27.5705 132.5030 0.0026
>> tip 6.8390 329.3037 18.1467 81.0373 0.0021
>> tip+percpu-rwsem 10.0870 546.7435 23.3825 106.2396 0.0026
>
>> posix01
>> mean variance sigma max min
>> 4.1.0 121.9020 27882.5260 166.9806 603.5509 0.0063
>> 4.1.0+percpu-rwsem 185.3981 38474.3836 196.1489 580.6532 0.0073
>> tip 129.2736 23752.7122 154.1191 474.0604 0.0063
>> tip+percpu-rwsem 142.6474 24732.1571 157.2646 468.7478 0.0072
>
> Both these tests are incredibly unstable for me (as well as for you it
> appears). Variance is through the roof on them.
Since on my test machine not all 4 sockets have an interconnect, I pinned the
tests down to one socket to see if that reduces the variance.
Except for flock01 and posix01, the tests now show really low variances (3 runs):
[...]
flock02
mean variance sigma max min
tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
lease01
mean variance sigma max min
tip-1 0.3424 0.0001 0.0110 0.3644 0.3088
tip-2 0.3627 0.0003 0.0185 0.4140 0.3312
tip-3 0.3446 0.0002 0.0125 0.3851 0.3155
tip+percpu-rswem-1 0.3464 0.0001 0.0116 0.3781 0.3113
tip+percpu-rswem-2 0.3597 0.0003 0.0162 0.3978 0.3250
tip+percpu-rswem-3 0.3513 0.0002 0.0151 0.3933 0.3122
[...]
So with this setup we can start to compare the numbers.
> I get runtimes like:
>
> root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a
> 0.266157011
> root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a
> 139.303399960
Same here:
flock01
mean variance sigma max min
tip-1 242.6147 3632.6201 60.2712 313.3081 86.3743
tip-2 233.1934 3850.1995 62.0500 318.2716 101.2738
tip-3 223.0392 3944.5220 62.8054 318.1932 110.8155
tip+percpu-rswem-1 276.5913 2145.0510 46.3147 317.5385 156.1318
tip+percpu-rswem-2 270.7089 2735.7635 52.3045 318.9418 154.5902
tip+percpu-rswem-3 267.8207 3028.3557 55.0305 320.2987 150.9659
posix01
mean variance sigma max min
tip-1 18.8729 151.2810 12.2996 37.3563 0.0060
tip-2 17.6894 140.9982 11.8743 37.2080 0.0060
tip-3 18.7785 145.1217 12.0466 35.5001 0.0060
tip+percpu-rswem-1 18.9970 163.8856 12.8018 35.8795 0.0069
tip+percpu-rswem-2 18.9594 147.3197 12.1375 35.4404 0.0069
tip+percpu-rswem-3 18.8366 126.5831 11.2509 35.9014 0.0069
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
` (15 preceding siblings ...)
2015-06-22 20:06 ` Linus Torvalds
@ 2015-06-23 16:10 ` Davidlohr Bueso
2015-06-23 16:21 ` Peter Zijlstra
16 siblings, 1 reply; 106+ messages in thread
From: Davidlohr Bueso @ 2015-06-23 16:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, riel, viro, torvalds
On Mon, 2015-06-22 at 14:16 +0200, Peter Zijlstra wrote:
> This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
> user.
Curious, why not also mem hotplug? It seems to use the exact same
locking mayhem than cpu.
Thanks,
Davidlohr
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 10:09 ` Peter Zijlstra
2015-06-23 10:55 ` Peter Zijlstra
@ 2015-06-23 16:20 ` Oleg Nesterov
2015-06-23 17:24 ` Oleg Nesterov
1 sibling, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 16:20 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/23, Peter Zijlstra wrote:
>
> On Tue, Jun 23, 2015 at 12:21:52AM +0200, Oleg Nesterov wrote:
>
> > Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
> >
> > - stop_machine takes the lock on CPU 0, adds the work
> > and drops the lock
> >
> > - cpu_stop_queue_work() queues both works
>
> cpu_stop_queue_work() only ever queues _1_ work.
>
> > - stop_machine takes the lock on CPU 1, etc
> >
> > In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
> > use different multi_stop_data's, so they will wait for each other
> > forever?
>
> So what you're saying is:
>
> queue_stop_cpus_work() stop_two_cpus()
>
> cpu_stop_queue_work(0,..);
> spin_lock(0);
> spin_lock(1);
>
> __cpu_stop_queue_work(0,..);
> __cpu_stop_queue_work(1,..);
>
> spin_unlock(1);
> spin_unlock(0);
> cpu_stop_queue_work(1,..);
Yes, sorry for confusion.
> We can of course slap a percpu-rwsem in, but I wonder if there's
> anything smarter we can do here.
I am wondering too if we can make this multi_cpu_stop() more clever.
Or at least add some deadlock detection...
Until then you can probably just uglify queue_stop_cpus_work() and
avoid the race,
static void queue_stop_cpus_work(const struct cpumask *cpumask,
cpu_stop_fn_t fn, void *arg,
struct cpu_stop_done *done)
{
struct cpu_stopper *stopper;
struct cpu_stop_work *work;
unsigned long flags;
unsigned int cpu;
local_irq_save(flags);
for_each_cpu(cpu, cpumask) {
stopper = &per_cpu(cpu_stopper, cpu);
spin_lock(&stopper->lock);
work = &per_cpu(stop_cpus_work, cpu);
work->fn = fn;
work->arg = arg;
work->done = done;
}
for_each_cpu(cpu, cpumask)
__cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
for_each_cpu(cpu, cpumask) {
stopper = &per_cpu(cpu_stopper, cpu);
spin_unlock(&stopper->lock);
}
local_irq_restore(flags);
}
ignoring lockdep problems.
It would be nice to remove stop_cpus_mutex, it actually protects
stop_cpus_work... Then probably stop_two_cpus() can just use
stop_cpus(). We could simply make stop_cpus_mutex per-cpu too,
but this doesn't look nice.
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 16:10 ` Davidlohr Bueso
@ 2015-06-23 16:21 ` Peter Zijlstra
0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 16:21 UTC (permalink / raw)
To: Davidlohr Bueso
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, riel, viro, torvalds
On Tue, Jun 23, 2015 at 09:10:03AM -0700, Davidlohr Bueso wrote:
> On Mon, 2015-06-22 at 14:16 +0200, Peter Zijlstra wrote:
> > This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
> > user.
>
> Curious, why not also mem hotplug? It seems to use the exact same
> locking mayhem than cpu.
Because it looks like they 'forgot' to copy the notifiers and therefore
I suspect we could simplify things. We might not need the recursive
nonsense.
But I've not yet actually looked at it much.
I was indeed greatly saddened that these people copied cpu hotplug;
clearly they had not gotten the memo that cpu hotplug is a trainwreck.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 13:08 ` Peter Zijlstra
@ 2015-06-23 16:36 ` Oleg Nesterov
2015-06-23 17:30 ` Paul E. McKenney
1 sibling, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 16:36 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/23, Peter Zijlstra wrote:
>
> void synchronize_sched_expedited(void)
> {
...
> - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> - synchronize_sched_expedited_cpu_stop,
> - NULL) == -EAGAIN) {
> - put_online_cpus();
> - atomic_long_inc(&rsp->expedited_tryfail);
> -
> - /* Check to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone1);
> - free_cpumask_var(cm);
> - return;
> - }
...
> + for_each_online_cpu(cpu) {
> + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
...
> + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
I too thought about something like this change ;)
Not sure I read this patch correctly, but it seems that then you can
remove all rsp->expedited_* members/code ?
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-23 7:16 ` Peter Zijlstra
@ 2015-06-23 17:01 ` Oleg Nesterov
2015-06-23 17:53 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 17:01 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/23, Peter Zijlstra wrote:
>
> On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > > +
> > > + lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > > + _percpu_down_read(&cpu_hotplug.rwsem);
> > > }
> >
> > Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> > just use percpu_down_read() ?
> >
> > Yes, percpu_down_read() is not recursive, like the normal down_read().
> > But this does not matter because we rely on ->cpuhp_ref anyway?
>
> While we will not call the actual lock, lockdep will still get confused
> by the inconsistent locking order observed.
>
> Change it and boot, you'll find lockdep output pretty quickly.
Hmm. and I simply can't understand why...
>
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
> > > p->sequential_io_avg = 0;
> > > #endif
> > >
> > > + cpu_hotplug_init_task(p);
> >
> > This is probably unnecessary, copy_process() should not be called under
> > get_online_cpus().
>
> Probably true, in which case we could still use the callback to insert a
> WARN_ON_ONCE(p->cpuhp_ref) :-)
Yes, agreed.
And, perhaps, WARN_ON_ONCE(in_irq) in try_get_online_cpus() makes sense...
percpu_down_read_trylock() from irq is fine, but try_get_online_cpus()
can come right after get/put_online_cpus() updates ->cpuhp_ref.
And I forgot to say,
> void get_online_cpus(void)
> {
> might_sleep();
> - if (cpu_hotplug.active_writer == current)
> +
> + /* read in write recursion */
> + if (cpu_hotplug.writer == current)
> + return;
...
> void put_online_cpus(void)
> {
> - int refcount;
> -
> - if (cpu_hotplug.active_writer == current)
> + if (cpu_hotplug.writer == current)
> return;
We do not need to check cpu_hotplug.writer in get/put_online_cpus().
cpu_hotplug_begin/end can just inc/dec current->cpuhp_ref.
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 16:20 ` Oleg Nesterov
@ 2015-06-23 17:24 ` Oleg Nesterov
2015-06-25 19:18 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 17:24 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/23, Oleg Nesterov wrote:
>
>
> It would be nice to remove stop_cpus_mutex, it actually protects
> stop_cpus_work... Then probably stop_two_cpus() can just use
> stop_cpus(). We could simply make stop_cpus_mutex per-cpu too,
> but this doesn't look nice.
IOW. Suppose we add ->work_mutex into struct cpu_stopper. Btw,
I think we should move all per-cpu variables there...
Now,
lock_stop_cpus_works(cpumask)
{
for_each_cpu(cpu, cpumask)
mutex_lock(per_cpu(cpu_stopper_task, cpu).work_mutex);
}
unlock_stop_cpus_works(cpumask)
{
for_each_cpu(cpu, cpumask)
mutex_unlock(...);
}
which should be used instead of stop_cpus_mutex. After this change
stop_two_cpus() can just use stop_cpus().
Off-topic. Can't we make __stop_machine() static? The only caller,
_cpu_down() can safely call stop_machine(), get_online_cpus() is
fine under cpu_hotplug_begin().
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 13:08 ` Peter Zijlstra
2015-06-23 16:36 ` Oleg Nesterov
@ 2015-06-23 17:30 ` Paul E. McKenney
2015-06-23 18:04 ` Peter Zijlstra
1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 17:30 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 03:08:26PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 01:20:41PM +0200, Peter Zijlstra wrote:
> > Paul, why does this use stop_machine anyway? I seemed to remember you
> > sending resched IPIs around.
It used to, but someone submitted a patch long ago that switched it
to try_stop_cpus(). At that time, RCU didn't unconditionally do the
dyntick-idle thing for CONFIG_NO_HZ=n kernel, so try_stop_cpus() was
quite a bit simpler.
That said, I do use your new-age resched-IPI API in other cases.
> > The rcu_sched_qs() thing would set passed_quiesce, which you can then
> > collect to gauge progress.
> >
> > Shooting IPIs around is bad enough, but running a full blown
> > stop_machine is really blunt and heavy.
>
> Is there anything obviously amiss with the below? It does stop_one_cpu()
> in a loop instead of the multi cpu stop_machine and is therefore much
> friendlier (albeit still heavier than bare resched IPIs) since the CPUs
do not have to go and sync up.
>
> After all, all we're really interested in is that each CPUs has
> scheduled at least once, we do not care about the cross cpu syncup.
This was on my list. I was thinking of using smp_call_function_single()
combined with polling in order to avoid the double context switch, but
the approach below is of course simpler. I was intending to fix
up the rest of RCU's relationship with CPU hotplug first, as this would
allow fully covering the incoming and outgoing code paths.
But perhaps a bit too simple. A few comments below...
Thanx, Paul
> ---
> include/linux/stop_machine.h | 7 ----
> kernel/rcu/tree.c | 99 +++++---------------------------------------
> kernel/stop_machine.c | 30 --------------
> 3 files changed, 10 insertions(+), 126 deletions(-)
>
> diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
> index d2abbdb8c6aa..f992da7ee492 100644
> --- a/include/linux/stop_machine.h
> +++ b/include/linux/stop_machine.h
> @@ -32,7 +32,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
> void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
> struct cpu_stop_work *work_buf);
> int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
> -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
>
> #else /* CONFIG_SMP */
>
> @@ -83,12 +82,6 @@ static inline int stop_cpus(const struct cpumask *cpumask,
> return -ENOENT;
> }
>
> -static inline int try_stop_cpus(const struct cpumask *cpumask,
> - cpu_stop_fn_t fn, void *arg)
> -{
> - return stop_cpus(cpumask, fn, arg);
> -}
> -
> #endif /* CONFIG_SMP */
>
> /*
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index add042926a66..4a8cde155dce 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3257,7 +3257,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
> {
> /*
> * There must be a full memory barrier on each affected CPU
> - * between the time that try_stop_cpus() is called and the
> + * between the time that stop_one_cpu() is called and the
> * time that it returns.
> *
> * In the current initial implementation of cpu_stop, the
> @@ -3291,25 +3291,12 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
> * grace period. We are then done, so we use atomic_cmpxchg() to
> * update sync_sched_expedited_done to match our snapshot -- but
> * only if someone else has not already advanced past our snapshot.
> - *
> - * On the other hand, if try_stop_cpus() fails, we check the value
> - * of sync_sched_expedited_done. If it has advanced past our
> - * initial snapshot, then someone else must have forced a grace period
> - * some time after we took our snapshot. In this case, our work is
> - * done for us, and we can simply return. Otherwise, we try again,
> - * but keep our initial snapshot for purposes of checking for someone
> - * doing our work for us.
> - *
> - * If we fail too many times in a row, we fall back to synchronize_sched().
> */
> void synchronize_sched_expedited(void)
> {
> - cpumask_var_t cm;
> - bool cma = false;
> - int cpu;
> - long firstsnap, s, snap;
> - int trycount = 0;
> struct rcu_state *rsp = &rcu_sched_state;
> + long s, snap;
> + int cpu;
>
> /*
> * If we are in danger of counter wrap, just do synchronize_sched().
> @@ -3332,7 +3319,6 @@ void synchronize_sched_expedited(void)
> * full memory barrier.
> */
> snap = atomic_long_inc_return(&rsp->expedited_start);
> - firstsnap = snap;
Hmmm...
> if (!try_get_online_cpus()) {
> /* CPU hotplug operation in flight, fall back to normal GP. */
> wait_rcu_gp(call_rcu_sched);
> @@ -3341,82 +3327,17 @@ void synchronize_sched_expedited(void)
> }
> WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
>
> - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> - cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> - if (cma) {
> - cpumask_copy(cm, cpu_online_mask);
> - cpumask_clear_cpu(raw_smp_processor_id(), cm);
> - for_each_cpu(cpu, cm) {
> - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> -
> - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> - cpumask_clear_cpu(cpu, cm);
> - }
> - if (cpumask_weight(cm) == 0)
> - goto all_cpus_idle;
> - }
Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.
If you had lots of CPUs coming and going, you could argue that tracking
them would help, but synchronize_sched_expedited() should run fast enough
that there isn't time for CPUs to come or go, at least in the common case.
> - /*
> - * Each pass through the following loop attempts to force a
> - * context switch on each CPU.
> - */
> - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> - synchronize_sched_expedited_cpu_stop,
> - NULL) == -EAGAIN) {
> - put_online_cpus();
> - atomic_long_inc(&rsp->expedited_tryfail);
> -
> - /* Check to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone1);
> - free_cpumask_var(cm);
> - return;
Here you lose batching. Yeah, I know that synchronize_sched_expedited()
is -supposed- to be used sparingly, but it is not cool for the kernel
to melt down just because some creative user found a way to heat up a
code path. Need a mutex_trylock() with a counter and checking for
others having already done the needed work.
> - }
> -
> - /* No joy, try again later. Or just synchronize_sched(). */
> - if (trycount++ < 10) {
> - udelay(trycount * num_online_cpus());
> - } else {
> - wait_rcu_gp(call_rcu_sched);
> - atomic_long_inc(&rsp->expedited_normal);
> - free_cpumask_var(cm);
> - return;
> - }
And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.
> + for_each_online_cpu(cpu) {
> + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
>
> - /* Recheck to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone2);
> - free_cpumask_var(cm);
> - return;
> - }
> + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> + continue;
Let's see... This does work for idle CPUs and for nohz_full CPUs running
in userspace.
It does not work for the current CPU, so the check needs an additional
check against raw_smp_processor_id(), which is easy enough to add.
There always has been a race window involving CPU hotplug. My recent
CPU_DYING_IDLE change allows things to be exact on the outgoing side,
and I need to make a similar change on the incoming side. There will
continue to be a window where RCU needs to pay attention to the CPU,
but neither IPIs nor scheduling works, and I guess I just do a timed
wait in that case. Rare race anyway, so should be fine.
> - /*
> - * Refetching sync_sched_expedited_started allows later
> - * callers to piggyback on our grace period. We retry
> - * after they started, so our grace period works for them,
> - * and they started after our first try, so their grace
> - * period works for us.
> - */
> - if (!try_get_online_cpus()) {
> - /* CPU hotplug operation in flight, use normal GP. */
> - wait_rcu_gp(call_rcu_sched);
> - atomic_long_inc(&rsp->expedited_normal);
> - free_cpumask_var(cm);
> - return;
> - }
> - snap = atomic_long_read(&rsp->expedited_start);
> - smp_mb(); /* ensure read is before try_stop_cpus(). */
> + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so. This would result in a single pass through schedule() instead
of stop_one_cpu()'s double context switch. It would likely also require
some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
the need for.
> }
> - atomic_long_inc(&rsp->expedited_stoppedcpus);
>
> -all_cpus_idle:
> - free_cpumask_var(cm);
> + atomic_long_inc(&rsp->expedited_stoppedcpus);
>
> /*
> * Everyone up to our most recent fetch is covered by our grace
> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
> index fd643d8c4b42..b1329a213503 100644
> --- a/kernel/stop_machine.c
> +++ b/kernel/stop_machine.c
> @@ -371,36 +371,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
> return ret;
> }
>
> -/**
> - * try_stop_cpus - try to stop multiple cpus
> - * @cpumask: cpus to stop
> - * @fn: function to execute
> - * @arg: argument to @fn
> - *
> - * Identical to stop_cpus() except that it fails with -EAGAIN if
> - * someone else is already using the facility.
> - *
> - * CONTEXT:
> - * Might sleep.
> - *
> - * RETURNS:
> - * -EAGAIN if someone else is already stopping cpus, -ENOENT if
> - * @fn(@arg) was not executed at all because all cpus in @cpumask were
> - * offline; otherwise, 0 if all executions of @fn returned 0, any non
> - * zero return value if any returned non zero.
> - */
> -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
> -{
> - int ret;
> -
> - /* static works are used, process one request at a time */
> - if (!mutex_trylock(&stop_cpus_mutex))
> - return -EAGAIN;
> - ret = __stop_cpus(cpumask, fn, arg);
> - mutex_unlock(&stop_cpus_mutex);
> - return ret;
> -}
> -
> static int cpu_stop_should_run(unsigned int cpu)
> {
> struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
>
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 14:56 ` Daniel Wagner
@ 2015-06-23 17:50 ` Peter Zijlstra
2015-06-23 19:36 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 17:50 UTC (permalink / raw)
To: Daniel Wagner
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, jlayton
On Tue, Jun 23, 2015 at 04:56:39PM +0200, Daniel Wagner wrote:
> flock02
> mean variance sigma max min
> tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
> tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
> tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
> tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
> tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
> tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
I did indeed manage to get flock02 down to a usable level and found:
3.20 : ffffffff811ecbdf: incl %gs:0x7ee1de72(%rip) # aa58 <__preempt_count>
0.27 : ffffffff811ecbe6: mov 0xa98553(%rip),%rax # ffffffff81c85140 <file_rwsem>
10.78 : ffffffff811ecbed: incl %gs:(%rax)
0.19 : ffffffff811ecbf0: mov 0xa9855a(%rip),%edx # ffffffff81c85150 <file_rwsem+0x10>
0.00 : ffffffff811ecbf6: test %edx,%edx
0.00 : ffffffff811ecbf8: jne ffffffff811ecdd1 <flock_lock_file+0x261>
3.47 : ffffffff811ecbfe: decl %gs:0x7ee1de53(%rip) # aa58 <__preempt_count>
0.00 : ffffffff811ecc05: je ffffffff811eccec <flock_lock_file+0x17c>
Which is percpu_down_read(). Now aside from the fact that I run a
PREEMPT=y kernel, it looks like that sem->refcount increment stalls
because of the dependent load.
Manually hoisting the load very slightly improves things:
0.24 : ffffffff811ecbdf: mov 0xa9855a(%rip),%rax # ffffffff81c85140 <file_rwsem>
5.88 : ffffffff811ecbe6: incl %gs:0x7ee1de6b(%rip) # aa58 <__preempt_count>
7.94 : ffffffff811ecbed: incl %gs:(%rax)
0.30 : ffffffff811ecbf0: mov 0xa9855a(%rip),%edx # ffffffff81c85150 <file_rwsem+0x10>
0.00 : ffffffff811ecbf6: test %edx,%edx
0.00 : ffffffff811ecbf8: jne ffffffff811ecdd1 <flock_lock_file+0x261>
3.70 : ffffffff811ecbfe: decl %gs:0x7ee1de53(%rip) # aa58 <__preempt_count>
0.00 : ffffffff811ecc05: je ffffffff811eccec <flock_lock_file+0x17c>
But its not much :/
Using DEFINE_STATIC_PERCPU_RWSEM(file_rwsem) would allow GCC to omit the
sem->refcount load entirely, but its not smart enough to see that it can
(tested 4.9 and 5.1).
---
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -35,6 +35,8 @@ extern void __percpu_up_read(struct perc
static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
{
+ unsigned int __percpu *refcount = sem->refcount;
+
might_sleep();
preempt_disable();
@@ -47,7 +49,7 @@ static inline void _percpu_down_read(str
* writer will see anything we did within this RCU-sched read-side
* critical section.
*/
- __this_cpu_inc(*sem->refcount);
+ __this_cpu_inc(*refcount);
if (unlikely(!rcu_sync_is_idle(&sem->rss)))
__percpu_down_read(sem); /* Unconditional memory barrier. */
preempt_enable();
@@ -81,6 +83,8 @@ static inline bool percpu_down_read_tryl
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
+ unsigned int __percpu *refcount = sem->refcount;
+
/*
* The barrier() in preempt_disable() prevents the compiler from
* bleeding the critical section out.
@@ -90,7 +94,7 @@ static inline void percpu_up_read(struct
* Same as in percpu_down_read().
*/
if (likely(rcu_sync_is_idle(&sem->rss)))
- __this_cpu_dec(*sem->refcount);
+ __this_cpu_dec(*refcount);
else
__percpu_up_read(sem); /* Unconditional memory barrier. */
preempt_enable();
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-23 17:01 ` Oleg Nesterov
@ 2015-06-23 17:53 ` Peter Zijlstra
2015-06-24 13:50 ` Oleg Nesterov
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 17:53 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 07:01:22PM +0200, Oleg Nesterov wrote:
> On 06/23, Peter Zijlstra wrote:
> >
> > On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > > > +
> > > > + lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > > > + _percpu_down_read(&cpu_hotplug.rwsem);
> > > > }
> > >
> > > Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> > > just use percpu_down_read() ?
> > >
> > > Yes, percpu_down_read() is not recursive, like the normal down_read().
> > > But this does not matter because we rely on ->cpuhp_ref anyway?
> >
> > While we will not call the actual lock, lockdep will still get confused
> > by the inconsistent locking order observed.
> >
> > Change it and boot, you'll find lockdep output pretty quickly.
>
> Hmm. and I simply can't understand why...
If in one callchain we do:
get_online_cpus();
lock(A);
in another we do:
lock(A);
get_online_cpus();
lockdep will complain about the inverted lock order, however this is not
a problem at all for recursive locks.
I think the example you get on boot is slightly more complicated, but
ends up like the above iirc.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 17:30 ` Paul E. McKenney
@ 2015-06-23 18:04 ` Peter Zijlstra
2015-06-23 18:26 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 18:04 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> Good, you don't need this because you can check for dynticks later.
> You will need to check for offline CPUs.
get_online_cpus()
for_each_online_cpus() {
...
}
is what the new code does.
> > - /*
> > - * Each pass through the following loop attempts to force a
> > - * context switch on each CPU.
> > - */
> > - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > - synchronize_sched_expedited_cpu_stop,
> > - NULL) == -EAGAIN) {
> > - put_online_cpus();
> > - atomic_long_inc(&rsp->expedited_tryfail);
> > -
> > - /* Check to see if someone else did our work for us. */
> > - s = atomic_long_read(&rsp->expedited_done);
> > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > - /* ensure test happens before caller kfree */
> > - smp_mb__before_atomic(); /* ^^^ */
> > - atomic_long_inc(&rsp->expedited_workdone1);
> > - free_cpumask_var(cm);
> > - return;
>
> Here you lose batching. Yeah, I know that synchronize_sched_expedited()
> is -supposed- to be used sparingly, but it is not cool for the kernel
> to melt down just because some creative user found a way to heat up a
> code path. Need a mutex_trylock() with a counter and checking for
> others having already done the needed work.
I really think you're making that expedited nonsense far too accessible.
But it was exactly that trylock I was trying to get rid of.
> And we still need to be able to drop back to synchronize_sched()
> (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> creative user and a long-running RCU-sched read-side critical section.
No, a long-running RCU-sched read-side is a bug and we should fix that,
its called a preemption-latency, we don't like those.
> > + for_each_online_cpu(cpu) {
> > + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> >
> > - /* Recheck to see if someone else did our work for us. */
> > - s = atomic_long_read(&rsp->expedited_done);
> > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > - /* ensure test happens before caller kfree */
> > - smp_mb__before_atomic(); /* ^^^ */
> > - atomic_long_inc(&rsp->expedited_workdone2);
> > - free_cpumask_var(cm);
> > - return;
> > - }
> > + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > + continue;
>
> Let's see... This does work for idle CPUs and for nohz_full CPUs running
> in userspace.
>
> It does not work for the current CPU, so the check needs an additional
> check against raw_smp_processor_id(), which is easy enough to add.
Right, realized after I send it out, but it _should_ work for the
current cpu too. Just pointless doing it.
> There always has been a race window involving CPU hotplug.
There is no hotplug race, the entire thing has get_online_cpus() held
across it.
> > + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
>
> My thought was to use smp_call_function_single(), and to have the function
> called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> if so.
set_tsk_need_resched() is buggy and should not be used.
> This would result in a single pass through schedule() instead
> of stop_one_cpu()'s double context switch. It would likely also require
> some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
> the need for.
_IF_ you're going to touch rcu_note_context_switch(), you might as well
use a completion, set it for the number of CPUs that need a resched,
spray resched-IPI and have rcu_note_context_switch() do a complete().
But I would really like to avoid adding code to
rcu_note_context_switch(), because we run that on _every_ single context
switch.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 18:04 ` Peter Zijlstra
@ 2015-06-23 18:26 ` Paul E. McKenney
2015-06-23 19:05 ` Paul E. McKenney
2015-06-24 7:35 ` Peter Zijlstra
0 siblings, 2 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 18:26 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 08:04:11PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> > Good, you don't need this because you can check for dynticks later.
> > You will need to check for offline CPUs.
>
> get_online_cpus()
> for_each_online_cpus() {
> ...
> }
>
> is what the new code does.
Ah, I missed that this was not deleted.
> > > - /*
> > > - * Each pass through the following loop attempts to force a
> > > - * context switch on each CPU.
> > > - */
> > > - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > > - synchronize_sched_expedited_cpu_stop,
> > > - NULL) == -EAGAIN) {
> > > - put_online_cpus();
> > > - atomic_long_inc(&rsp->expedited_tryfail);
> > > -
> > > - /* Check to see if someone else did our work for us. */
> > > - s = atomic_long_read(&rsp->expedited_done);
> > > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > - /* ensure test happens before caller kfree */
> > > - smp_mb__before_atomic(); /* ^^^ */
> > > - atomic_long_inc(&rsp->expedited_workdone1);
> > > - free_cpumask_var(cm);
> > > - return;
> >
> > Here you lose batching. Yeah, I know that synchronize_sched_expedited()
> > is -supposed- to be used sparingly, but it is not cool for the kernel
> > to melt down just because some creative user found a way to heat up a
> > code path. Need a mutex_trylock() with a counter and checking for
> > others having already done the needed work.
>
> I really think you're making that expedited nonsense far too accessible.
This has nothing to do with accessibility and everything to do with
robustness. And with me not becoming the triage center for too many
non-RCU bugs.
> But it was exactly that trylock I was trying to get rid of.
OK. Why, exactly?
> > And we still need to be able to drop back to synchronize_sched()
> > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > creative user and a long-running RCU-sched read-side critical section.
>
> No, a long-running RCU-sched read-side is a bug and we should fix that,
> its called a preemption-latency, we don't like those.
Yes, we should fix them. No, they absolutely must not result in a
meltdown of some unrelated portion of the kernel (like RCU), particularly
if this situation occurs on some system running a production workload
that doesn't happen to care about preemption latency.
> > > + for_each_online_cpu(cpu) {
> > > + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > >
> > > - /* Recheck to see if someone else did our work for us. */
> > > - s = atomic_long_read(&rsp->expedited_done);
> > > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > - /* ensure test happens before caller kfree */
> > > - smp_mb__before_atomic(); /* ^^^ */
> > > - atomic_long_inc(&rsp->expedited_workdone2);
> > > - free_cpumask_var(cm);
> > > - return;
> > > - }
> > > + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > > + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > > + continue;
> >
> > Let's see... This does work for idle CPUs and for nohz_full CPUs running
> > in userspace.
> >
> > It does not work for the current CPU, so the check needs an additional
> > check against raw_smp_processor_id(), which is easy enough to add.
>
> Right, realized after I send it out, but it _should_ work for the
> current cpu too. Just pointless doing it.
OK, and easily fixed up in any case.
> > There always has been a race window involving CPU hotplug.
>
> There is no hotplug race, the entire thing has get_online_cpus() held
> across it.
Which I would like to get rid of, but not urgent.
> > > + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> >
> > My thought was to use smp_call_function_single(), and to have the function
> > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > if so.
>
> set_tsk_need_resched() is buggy and should not be used.
OK, what API is used for this purpose?
> > This would result in a single pass through schedule() instead
> > of stop_one_cpu()'s double context switch. It would likely also require
> > some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
> > the need for.
>
> _IF_ you're going to touch rcu_note_context_switch(), you might as well
> use a completion, set it for the number of CPUs that need a resched,
> spray resched-IPI and have rcu_note_context_switch() do a complete().
>
> But I would really like to avoid adding code to
> rcu_note_context_switch(), because we run that on _every_ single context
> switch.
I believe that I can rework the current code to get the effect without
increased overhead, given that I have no intention of adding the
complete(). Adding the complete -would- add overhead to that fastpath.
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 18:26 ` Paul E. McKenney
@ 2015-06-23 19:05 ` Paul E. McKenney
2015-06-24 2:23 ` Paul E. McKenney
2015-06-24 7:35 ` Peter Zijlstra
1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 19:05 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> On Tue, Jun 23, 2015 at 08:04:11PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> > > Good, you don't need this because you can check for dynticks later.
> > > You will need to check for offline CPUs.
> >
> > get_online_cpus()
> > for_each_online_cpus() {
> > ...
> > }
> >
> > is what the new code does.
>
> Ah, I missed that this was not deleted.
But get_online_cpus() will re-introduce a deadlock.
Thanx, Paul
> > > > - /*
> > > > - * Each pass through the following loop attempts to force a
> > > > - * context switch on each CPU.
> > > > - */
> > > > - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > > > - synchronize_sched_expedited_cpu_stop,
> > > > - NULL) == -EAGAIN) {
> > > > - put_online_cpus();
> > > > - atomic_long_inc(&rsp->expedited_tryfail);
> > > > -
> > > > - /* Check to see if someone else did our work for us. */
> > > > - s = atomic_long_read(&rsp->expedited_done);
> > > > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > > - /* ensure test happens before caller kfree */
> > > > - smp_mb__before_atomic(); /* ^^^ */
> > > > - atomic_long_inc(&rsp->expedited_workdone1);
> > > > - free_cpumask_var(cm);
> > > > - return;
> > >
> > > Here you lose batching. Yeah, I know that synchronize_sched_expedited()
> > > is -supposed- to be used sparingly, but it is not cool for the kernel
> > > to melt down just because some creative user found a way to heat up a
> > > code path. Need a mutex_trylock() with a counter and checking for
> > > others having already done the needed work.
> >
> > I really think you're making that expedited nonsense far too accessible.
>
> This has nothing to do with accessibility and everything to do with
> robustness. And with me not becoming the triage center for too many
> non-RCU bugs.
>
> > But it was exactly that trylock I was trying to get rid of.
>
> OK. Why, exactly?
>
> > > And we still need to be able to drop back to synchronize_sched()
> > > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > > creative user and a long-running RCU-sched read-side critical section.
> >
> > No, a long-running RCU-sched read-side is a bug and we should fix that,
> > it's called a preemption-latency, we don't like those.
>
> Yes, we should fix them. No, they absolutely must not result in a
> meltdown of some unrelated portion of the kernel (like RCU), particularly
> if this situation occurs on some system running a production workload
> that doesn't happen to care about preemption latency.
>
> > > > + for_each_online_cpu(cpu) {
> > > > + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > > >
> > > > - /* Recheck to see if someone else did our work for us. */
> > > > - s = atomic_long_read(&rsp->expedited_done);
> > > > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > > - /* ensure test happens before caller kfree */
> > > > - smp_mb__before_atomic(); /* ^^^ */
> > > > - atomic_long_inc(&rsp->expedited_workdone2);
> > > > - free_cpumask_var(cm);
> > > > - return;
> > > > - }
> > > > + /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > > > + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > > > + continue;
> > >
> > > Let's see... This does work for idle CPUs and for nohz_full CPUs running
> > > in userspace.
> > >
> > > It does not work for the current CPU, so the check needs an additional
> > > check against raw_smp_processor_id(), which is easy enough to add.
> >
> > Right, realized after I sent it out, but it _should_ work for the
> > current cpu too. Just pointless doing it.
>
> OK, and easily fixed up in any case.
>
> > > There always has been a race window involving CPU hotplug.
> >
> > There is no hotplug race, the entire thing has get_online_cpus() held
> > across it.
>
> Which I would like to get rid of, but not urgent.
>
> > > > + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > >
> > > My thought was to use smp_call_function_single(), and to have the function
> > > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > > if so.
> >
> > set_tsk_need_resched() is buggy and should not be used.
>
> OK, what API is used for this purpose?
>
> > > This would result in a single pass through schedule() instead
> > > of stop_one_cpu()'s double context switch. It would likely also require
> > > some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
> > > the need for.
> >
> > _IF_ you're going to touch rcu_note_context_switch(), you might as well
> > use a completion, set it for the number of CPUs that need a resched,
> > spray resched-IPI and have rcu_note_context_switch() do a complete().
> >
> > But I would really like to avoid adding code to
> > rcu_note_context_switch(), because we run that on _every_ single context
> > switch.
>
> I believe that I can rework the current code to get the effect without
> increased overhead, given that I have no intention of adding the
> complete(). Adding the complete -would- add overhead to that fastpath.
>
> Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 17:50 ` Peter Zijlstra
@ 2015-06-23 19:36 ` Peter Zijlstra
2015-06-24 8:46 ` Ingo Molnar
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 19:36 UTC (permalink / raw)
To: Daniel Wagner
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, jlayton
On Tue, Jun 23, 2015 at 07:50:12PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 04:56:39PM +0200, Daniel Wagner wrote:
> > flock02
> > mean variance sigma max min
> > tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
> > tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
> > tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
> > tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
> > tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
> > tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
>
> I did indeed manage to get flock02 down to a usable level and found:
Aside from the flock_lock_file function moving up, we also get an
increase in _raw_spin_lock.
Before:
5.17% 5.17% flock02 [kernel.vmlinux] [k] _raw_spin_lock
|
---_raw_spin_lock
|
|--99.75%-- flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
--0.25%-- [...]
After:
7.20% 7.20% flock02 [kernel.vmlinux] [k] _raw_spin_lock
|
---_raw_spin_lock
|
|--52.23%-- flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
|
|--25.92%-- flock_lock_file
| flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
|
|--21.42%-- locks_delete_lock_ctx
| flock_lock_file
| flock_lock_file_wait
| sys_flock
| entry_SYSCALL_64_fastpath
| flock
--0.43%-- [...]
And it's not at all clear to me why this would be. It looks like
FILE_LOCK_DEFERRED is happening, but I've not yet figured out why that
would be.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 19:05 ` Paul E. McKenney
@ 2015-06-24 2:23 ` Paul E. McKenney
2015-06-24 8:32 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 2:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 12:05:06PM -0700, Paul E. McKenney wrote:
> On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > On Tue, Jun 23, 2015 at 08:04:11PM +0200, Peter Zijlstra wrote:
> > > On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> > > > Good, you don't need this because you can check for dynticks later.
> > > > You will need to check for offline CPUs.
> > >
> > > get_online_cpus()
> > > for_each_online_cpus() {
> > > ...
> > > }
> > >
> > > is what the new code does.
> >
> > Ah, I missed that this was not deleted.
>
> But get_online_cpus() will re-introduce a deadlock.
And here is an untested patch that applies the gist of your approach,
the series of stop_one_cpu() calls, but without undoing the rest.
I forged your Signed-off-by, please let me know if that doesn't work
for you. There are a number of simplifications that can be made, but
the basic approach gets a good testing first.
And I just noticed that I forgot to get rid of try_stop_cpus().
Well, there will probably be a test failure or two to handle, so
I can add that in the next version. ;-)
Thanx, Paul
------------------------------------------------------------------------
commit 1de96c34b39d840c5fe2689640345ed26f78b8f8
Author: Peter Zijlstra <peterz@infradead.org>
Date: Tue Jun 23 19:03:45 2015 -0700
rcu: Switch synchronize_sched_expedited() to stop_one_cpu()
The synchronize_sched_expedited() currently invokes try_stop_cpus(),
which schedules the stopper kthreads on each online non-idle CPU,
and waits until all those kthreads are running before letting any
of them stop. This is disastrous for real-time workloads, which
get hit with a preemption that is as long as the longest scheduling
latency on any CPU, including any non-realtime housekeeping CPUs.
This commit therefore switches to using stop_one_cpu() on each CPU
in turn. This avoids inflicting the worst-case scheduling latency
on the worst-case CPU onto all other CPUs, and also simplifies the
code a little bit.
Follow-up commits will simplify the counter-snapshotting algorithm
and convert a number of the counters that are now protected by the
new ->expedited_mutex to non-atomic.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
[ paulmck: Kept stop_one_cpu(), dropped disabling of "guardrails". ]
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78d0a87ff354..a30971474134 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3357,8 +3358,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
long firstsnap, s, snap;
int trycount = 0;
@@ -3394,28 +3393,11 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
+ while (!mutex_trylock(&rsp->expedited_mutex)) {
put_online_cpus();
atomic_long_inc(&rsp->expedited_tryfail);
@@ -3425,7 +3407,6 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
return;
}
@@ -3435,7 +3416,6 @@ void synchronize_sched_expedited(void)
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
return;
}
@@ -3445,7 +3425,6 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
return;
}
@@ -3460,16 +3439,23 @@ void synchronize_sched_expedited(void)
/* CPU hotplug operation in flight, use normal GP. */
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
return;
}
snap = atomic_long_read(&rsp->expedited_start);
smp_mb(); /* ensure read is before try_stop_cpus(). */
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
+ }
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
/*
* Everyone up to our most recent fetch is covered by our grace
@@ -3488,6 +3474,7 @@ all_cpus_idle:
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+ mutex_unlock(&rsp->expedited_mutex);
put_online_cpus();
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de22d6d06bf9..b04ffa0dea58 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -478,6 +478,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
^ permalink raw reply related [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 18:26 ` Paul E. McKenney
2015-06-23 19:05 ` Paul E. McKenney
@ 2015-06-24 7:35 ` Peter Zijlstra
2015-06-24 8:42 ` Ingo Molnar
2015-06-24 14:50 ` Paul E. McKenney
1 sibling, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 7:35 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > I really think you're making that expedited nonsense far too accessible.
>
> This has nothing to do with accessibility and everything to do with
> robustness. And with me not becoming the triage center for too many
> non-RCU bugs.
But by making it so you're rewarding abuse instead of flagging it :-(
> > > And we still need to be able to drop back to synchronize_sched()
> > > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > > creative user and a long-running RCU-sched read-side critical section.
> >
> > No, a long-running RCU-sched read-side is a bug and we should fix that,
> > it's called a preemption-latency, we don't like those.
>
> Yes, we should fix them. No, they absolutely must not result in a
> meltdown of some unrelated portion of the kernel (like RCU), particularly
> if this situation occurs on some system running a production workload
> that doesn't happen to care about preemption latency.
I still don't see a problem here though; the stop_one_cpu() invocation
for the CPU that's suffering its preemption latency will take longer,
but so what?
How does polling and dropping back to sync_rcu() generate better
behaviour than simply waiting for the completion?
> > > > + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > >
> > > My thought was to use smp_call_function_single(), and to have the function
> > > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > > if so.
> >
> > set_tsk_need_resched() is buggy and should not be used.
>
> OK, what API is used for this purpose?
As per exception you (rcu) already have access to resched_cpu(), use
that -- if it doesn't do what you need it to, we'll fix it, you're the
only consumer of it.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 2:23 ` Paul E. McKenney
@ 2015-06-24 8:32 ` Peter Zijlstra
2015-06-24 9:31 ` Peter Zijlstra
2015-06-24 15:01 ` Paul E. McKenney
0 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 8:32 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 23, 2015 at 07:23:44PM -0700, Paul E. McKenney wrote:
> And here is an untested patch that applies the gist of your approach,
> the series of stop_one_cpu() calls, but without undoing the rest.
> I forged your Signed-off-by, please let me know if that doesn't work
> for you. There are a number of simplifications that can be made, but
> the basic approach gets a good testing first.
So I really do not get the point of the trylock. It doesn't make sense.
Why would you poll the mutex instead of just wait for it and then
recheck if someone did the work while you were waiting for it?
What's wrong with the below?
---
kernel/rcu/tree.c | 100 +++++++++++++++---------------------------------------
kernel/rcu/tree.h | 1 +
2 files changed, 29 insertions(+), 72 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..b39a5672a7ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3304,12 +3305,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3330,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,83 +3338,40 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
-
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3435,6 +3389,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ mutex_unlock(&rsp->expedited_mutex);
put_online_cpus();
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..10348c081e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
^ permalink raw reply related [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 7:35 ` Peter Zijlstra
@ 2015-06-24 8:42 ` Ingo Molnar
2015-06-24 13:39 ` Paul E. McKenney
2015-06-24 14:50 ` Paul E. McKenney
1 sibling, 1 reply; 106+ messages in thread
From: Ingo Molnar @ 2015-06-24 8:42 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Paul E. McKenney, Oleg Nesterov, tj, mingo, linux-kernel,
der.herr, dave, riel, viro, torvalds
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > >
> > > I really think you're making that expedited nonsense far too accessible.
> >
> > This has nothing to do with accessibility and everything to do with
> > robustness. And with me not becoming the triage center for too many non-RCU
> > bugs.
>
> But by making it so you're rewarding abuse instead of flagging it :-(
Btw., being a 'triage center' is the bane of APIs that are overly successful,
so we should take that burden with pride! :-)
Lockdep (and the scheduler APIs as well) frequently got into such situations as
well, and we mostly solved it by being more informative with debug splats.
I don't think a kernel API should (ever!) stay artificially silent, just for fear
of flagging too many problems in other code.
Thanks,
Ingo
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-23 19:36 ` Peter Zijlstra
@ 2015-06-24 8:46 ` Ingo Molnar
2015-06-24 9:01 ` Peter Zijlstra
2015-06-24 9:18 ` Daniel Wagner
0 siblings, 2 replies; 106+ messages in thread
From: Ingo Molnar @ 2015-06-24 8:46 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Daniel Wagner, oleg, paulmck, tj, mingo, linux-kernel, der.herr,
dave, riel, viro, torvalds, jlayton
* Peter Zijlstra <peterz@infradead.org> wrote:
> On Tue, Jun 23, 2015 at 07:50:12PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 23, 2015 at 04:56:39PM +0200, Daniel Wagner wrote:
> > > flock02
> > > mean variance sigma max min
> > > tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
> > > tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
> > > tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
> > > tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
> > > tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
> > > tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
> >
> > I did indeed manage to get flock02 down to a usable level and found:
>
> Aside from the flock_lock_file function moving up, we also get an
> increase in _raw_spin_lock.
>
> Before:
>
> 5.17% 5.17% flock02 [kernel.vmlinux] [k] _raw_spin_lock
> |
> ---_raw_spin_lock
> |
> |--99.75%-- flock_lock_file_wait
> | sys_flock
> | entry_SYSCALL_64_fastpath
> | flock
> --0.25%-- [...]
>
>
> After:
>
> 7.20% 7.20% flock02 [kernel.vmlinux] [k] _raw_spin_lock
> |
> ---_raw_spin_lock
> |
> |--52.23%-- flock_lock_file_wait
> | sys_flock
> | entry_SYSCALL_64_fastpath
> | flock
> |
> |--25.92%-- flock_lock_file
> | flock_lock_file_wait
> | sys_flock
> | entry_SYSCALL_64_fastpath
> | flock
> |
> |--21.42%-- locks_delete_lock_ctx
> | flock_lock_file
> | flock_lock_file_wait
> | sys_flock
> | entry_SYSCALL_64_fastpath
> | flock
> --0.43%-- [...]
>
>
> And it's not at all clear to me why this would be. It looks like
> FILE_LOCK_DEFERRED is happening, but I've not yet figured out why that
> would be.
So I'd suggest to first compare preemption behavior: does the workload
context-switch heavily, and is it the exact same context switching rate and are
the points of preemption the same as well between the two kernels?
[ Such high variance is often caused by (dynamically) unstable load balancing and
the workload never finding a good equilibrium. Any observable locking overhead
is usually just a second order concern or a symptom. Assuming the workload
context switches heavily. ]
Thanks,
Ingo
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-24 8:46 ` Ingo Molnar
@ 2015-06-24 9:01 ` Peter Zijlstra
2015-06-24 9:18 ` Daniel Wagner
1 sibling, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 9:01 UTC (permalink / raw)
To: Ingo Molnar
Cc: Daniel Wagner, oleg, paulmck, tj, mingo, linux-kernel, der.herr,
dave, riel, viro, torvalds, jlayton
On Wed, Jun 24, 2015 at 10:46:48AM +0200, Ingo Molnar wrote:
> > > > flock02
> > > > mean variance sigma max min
> > > > tip-1 11.8994 0.5874 0.7664 13.2022 8.6324
> > > > tip-2 11.7394 0.5252 0.7247 13.2540 9.7513
> > > > tip-3 11.8155 0.5288 0.7272 13.2700 9.9480
> > > > tip+percpu-rswem-1 15.3601 0.8981 0.9477 16.8116 12.6910
> > > > tip+percpu-rswem-2 15.2558 0.8442 0.9188 17.0199 12.9586
> > > > tip+percpu-rswem-3 15.5297 0.6386 0.7991 17.4392 12.7992
> [ Such high variance is often caused by (dynamically) unstable load balancing and
> the workload never finding a good equilibrium. Any observable locking overhead
> is usually just a second order concern or a symptom. Assuming the workload
> context switches heavily. ]
flock02 is a relatively stable benchmark -- unlike some of the others
where the variance is orders of magnitude higher than the avg.
But yes, I'll go poke at it more. I just need to hunt down unrelated
fail before continuing with this.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-24 8:46 ` Ingo Molnar
2015-06-24 9:01 ` Peter Zijlstra
@ 2015-06-24 9:18 ` Daniel Wagner
2015-07-01 5:57 ` Daniel Wagner
1 sibling, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-06-24 9:18 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, jlayton
On 06/24/2015 10:46 AM, Ingo Molnar wrote:
> So I'd suggest to first compare preemption behavior: does the workload
> context-switch heavily, and is it the exact same context switching rate and are
> the points of preemption the same as well between the two kernels?
If I read this correctly, the answer is yes.
First the 'stable' flock02 test:
perf stat --repeat 5 --pre 'rm -rf /tmp/a' ~/src/lockperf/flock02 -n 128 -l 64 /tmp/a
0.008793148
0.008784990
0.008587804
0.008693641
0.008776946
Performance counter stats for '/home/wagi/src/lockperf/flock02 -n 128 -l 64 /tmp/a' (5 runs):
76.509634 task-clock (msec) # 3.312 CPUs utilized ( +- 0.67% )
2 context-switches # 0.029 K/sec ( +- 26.50% )
128 cpu-migrations # 0.002 M/sec ( +- 0.31% )
5,295 page-faults # 0.069 M/sec ( +- 0.49% )
89,944,154 cycles # 1.176 GHz ( +- 0.66% )
58,670,259 stalled-cycles-frontend # 65.23% frontend cycles idle ( +- 0.88% )
0 stalled-cycles-backend # 0.00% backend cycles idle
76,991,414 instructions # 0.86 insns per cycle
# 0.76 stalled cycles per insn ( +- 0.19% )
15,239,720 branches # 199.187 M/sec ( +- 0.20% )
103,418 branch-misses # 0.68% of all branches ( +- 6.68% )
0.023102895 seconds time elapsed ( +- 1.09% )
And here posix01 which shows high variance:
perf stat --repeat 5 --pre 'rm -rf /tmp/a' ~/src/lockperf/posix01 -n 128 -l 64 /tmp/a
0.006020402
32.510838421
55.516466069
46.794470223
5.097701438
Performance counter stats for '/home/wagi/src/lockperf/posix01 -n 128 -l 64 /tmp/a' (5 runs):
4177.932106 task-clock (msec) # 14.162 CPUs utilized ( +- 34.59% )
70,646 context-switches # 0.017 M/sec ( +- 31.56% )
28,009 cpu-migrations # 0.007 M/sec ( +- 33.55% )
4,834 page-faults # 0.001 M/sec ( +- 0.98% )
7,291,160,968 cycles # 1.745 GHz ( +- 32.17% )
5,216,204,262 stalled-cycles-frontend # 71.54% frontend cycles idle ( +- 32.13% )
0 stalled-cycles-backend # 0.00% backend cycles idle
1,901,289,780 instructions # 0.26 insns per cycle
# 2.74 stalled cycles per insn ( +- 30.80% )
440,415,914 branches # 105.415 M/sec ( +- 31.06% )
1,347,021 branch-misses # 0.31% of all branches ( +- 29.17% )
0.295016987 seconds time elapsed ( +- 32.01% )
BTW, thanks for the perf stat tip. Really handy!
cheers,
daniel
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 8:32 ` Peter Zijlstra
@ 2015-06-24 9:31 ` Peter Zijlstra
2015-06-24 13:48 ` Paul E. McKenney
2015-06-24 15:01 ` Paul E. McKenney
1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 9:31 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> + s = atomic_long_read(&rsp->expedited_done);
> + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> + /* ensure test happens before caller kfree */
> + smp_mb__before_atomic(); /* ^^^ */
FWIW isn't that guaranteed by the control dep?
> + atomic_long_inc(&rsp->expedited_workdone1);
> + goto unlock;
> + }
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 8:42 ` Ingo Molnar
@ 2015-06-24 13:39 ` Paul E. McKenney
2015-06-24 13:43 ` Ingo Molnar
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 13:39 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Oleg Nesterov, tj, mingo, linux-kernel, der.herr,
dave, riel, viro, torvalds
On Wed, Jun 24, 2015 at 10:42:48AM +0200, Ingo Molnar wrote:
>
> * Peter Zijlstra <peterz@infradead.org> wrote:
>
> > On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > >
> > > > I really think you're making that expedited nonsense far too accessible.
> > >
> > > This has nothing to do with accessibility and everything to do with
> > > robustness. And with me not becoming the triage center for too many non-RCU
> > > bugs.
> >
> > But by making it so you're rewarding abuse instead of flagging it :-(
>
> Btw., being a 'triage center' is the bane of APIs that are overly successful,
> so we should take that burden with pride! :-)
I will gladly accept that compliment.
And the burden. But, lazy as I am, I intend to automate it. ;-)
> Lockdep (and the scheduler APIs as well) frequently got into such situations as
> well, and we mostly solved it by being more informative with debug splats.
>
> I don't think a kernel API should (ever!) stay artificially silent, just for fear
> of flagging too many problems in other code.
I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
RCU checks, and the object-debug-based checks for double call_rcu().
That said, in all of these cases, including your example of lockdep,
the diagnostic is a debug splat rather than a mutex-contention meltdown.
And it is the mutex-contention meltdown that I will continue making
synchronize_sched_expedited() avoid.
But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
IPIs, it would not be hard to splat if a given CPU didn't come back fast
enough. The latency tracer would of course provide better information,
but synchronize_sched_expedited() could do a coarse-grained job with
less setup required.
My first guess for the timeout would be something like 500 milliseconds.
Thoughts?
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 13:39 ` Paul E. McKenney
@ 2015-06-24 13:43 ` Ingo Molnar
2015-06-24 14:03 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Ingo Molnar @ 2015-06-24 13:43 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Peter Zijlstra, Oleg Nesterov, tj, mingo, linux-kernel, der.herr,
dave, riel, viro, torvalds
* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> On Wed, Jun 24, 2015 at 10:42:48AM +0200, Ingo Molnar wrote:
> >
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > > On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > > >
> > > > > I really think you're making that expedited nonsense far too accessible.
> > > >
> > > > This has nothing to do with accessibility and everything to do with
> > > > robustness. And with me not becoming the triage center for too many non-RCU
> > > > bugs.
> > >
> > > But by making it so you're rewarding abuse instead of flagging it :-(
> >
> > Btw., being a 'triage center' is the bane of APIs that are overly successful,
> > so we should take that burden with pride! :-)
>
> I will gladly accept that compliment.
>
> And the burden. But, lazy as I am, I intend to automate it. ;-)
lol :)
> > Lockdep (and the scheduler APIs as well) frequently got into such situations as
> > well, and we mostly solved it by being more informative with debug splats.
> >
> > I don't think a kernel API should (ever!) stay artificially silent, just for fear
> > of flagging too many problems in other code.
>
> I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
> RCU checks, and the object-debug-based checks for double call_rcu().
> That said, in all of these cases, including your example of lockdep,
> the diagnostic is a debug splat rather than a mutex-contention meltdown.
> And it is the mutex-contention meltdown that I will continue making
> synchronize_sched_expedited() avoid.
>
> But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
> IPIs, it would not be hard to splat if a given CPU didn't come back fast
> enough. The latency tracer would of course provide better information,
> but synchronize_sched_expedited() could do a coarse-grained job with
> less setup required.
>
> My first guess for the timeout would be something like 500 milliseconds.
> Thoughts?
So I'd start with 5,000 milliseconds and observe the results first ...
Thanks,
Ingo
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 9:31 ` Peter Zijlstra
@ 2015-06-24 13:48 ` Paul E. McKenney
0 siblings, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 13:48 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 11:31:02AM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> > + s = atomic_long_read(&rsp->expedited_done);
> > + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> > + /* ensure test happens before caller kfree */
> > + smp_mb__before_atomic(); /* ^^^ */
>
> FWIW isn't that guaranteed by the control dep?
For trailing stores, yes, but not for trailing loads. Of course,
trailing loads don't matter in the pure kfree case, but do matter in
other situations. And this isn't anywhere near a fastpath, so I
am not all that worried about the extra memory barrier.
Thanx, Paul
> > + atomic_long_inc(&rsp->expedited_workdone1);
> > + goto unlock;
> > + }
>
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-23 17:53 ` Peter Zijlstra
@ 2015-06-24 13:50 ` Oleg Nesterov
2015-06-24 14:13 ` Peter Zijlstra
2015-06-28 23:56 ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
0 siblings, 2 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-24 13:50 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/23, Peter Zijlstra wrote:
>
> On Tue, Jun 23, 2015 at 07:01:22PM +0200, Oleg Nesterov wrote:
> > On 06/23, Peter Zijlstra wrote:
> > >
> > > On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > > > > +
> > > > > + lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > > > > + _percpu_down_read(&cpu_hotplug.rwsem);
> > > > > }
> > > >
> > > > Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> > > > just use percpu_down_read() ?
> > > >
> > > > Yes, percpu_down_read() is not recursive, like the normal down_read().
> > > > But this does not matter because we rely on ->cpuhp_ref anyway?
> > >
> > > While we will not call the actual lock, lockdep will still get confused
> > > by the inconsistent locking order observed.
> > >
> > > Change it and boot, you'll find lockdep output pretty quickly.
> >
> > Hmm. and I simply can't understand why...
>
> If in one callchain we do:
>
> get_online_cpus();
> lock(A);
>
> in another we do:
>
> lock(A);
> get_online_cpus();
>
> lockdep will complain about the inverted lock order, however this is not
> a problem at all for recursive locks.
Ah, but in this case lockdep is right. This is deadlockable because
with the new implementation percpu_down_write() blocks the new readers.
So this change just hides the valid warning.
Just suppose that the 3rd CPU does percpu_down_write()->down_write()
right after the 2nd CPU (above) takes lock(A).
I have to admit that I didn't realize that the code above is currently
correct... but it is.
So we need percpu_down_write_dont_block_readers(). I already thought
about this before, I'll try to make the patch tomorrow on top of your
changes.
This means that we do not need task_struct->cpuhp_ref, but we can't
avoid livelock we currently have: cpu_hotplug_begin() can never succeed
if the new readers come fast enough.
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 13:43 ` Ingo Molnar
@ 2015-06-24 14:03 ` Paul E. McKenney
0 siblings, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 14:03 UTC (permalink / raw)
To: Ingo Molnar
Cc: Peter Zijlstra, Oleg Nesterov, tj, mingo, linux-kernel, der.herr,
dave, riel, viro, torvalds
On Wed, Jun 24, 2015 at 03:43:37PM +0200, Ingo Molnar wrote:
>
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
>
> > On Wed, Jun 24, 2015 at 10:42:48AM +0200, Ingo Molnar wrote:
> > >
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > > On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > > > >
> > > > > > I really think you're making that expedited nonsense far too accessible.
> > > > >
> > > > > This has nothing to do with accessibility and everything to do with
> > > > > robustness. And with me not becoming the triage center for too many non-RCU
> > > > > bugs.
> > > >
> > > > But by making it so you're rewarding abuse instead of flagging it :-(
> > >
> > > Btw., being a 'triage center' is the bane of APIs that are overly successful,
> > > so we should take that burden with pride! :-)
> >
> > I will gladly accept that compliment.
> >
> > And the burden. But, lazy as I am, I intend to automate it. ;-)
>
> lol :)
>
> > > Lockdep (and the scheduler APIs as well) frequently got into such situations as
> > > well, and we mostly solved it by being more informative with debug splats.
> > >
> > > I don't think a kernel API should (ever!) stay artificially silent, just for fear
> > > of flagging too many problems in other code.
> >
> > I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
> > RCU checks, and the object-debug-based checks for double call_rcu().
> > That said, in all of these cases, including your example of lockdep,
> > the diagnostic is a debug splat rather than a mutex-contention meltdown.
> > And it is the mutex-contention meltdown that I will continue making
> > synchronize_sched_expedited() avoid.
> >
> > But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
> > IPIs, it would not be hard to splat if a given CPU didn't come back fast
> > enough. The latency tracer would of course provide better information,
> > but synchronize_sched_expedited() could do a coarse-grained job with
> > less setup required.
> >
> > My first guess for the timeout would be something like 500 milliseconds.
> > Thoughts?
>
> So I'd start with 5,000 milliseconds and observe the results first ...
Sounds good, especially when I recall that the default RCU CPU stall
warning timeout is 21,000 milliseconds... ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-24 13:50 ` Oleg Nesterov
@ 2015-06-24 14:13 ` Peter Zijlstra
2015-06-24 15:12 ` Oleg Nesterov
2015-06-28 23:56 ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 14:13 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Wed, Jun 24, 2015 at 03:50:49PM +0200, Oleg Nesterov wrote:
> On 06/23, Peter Zijlstra wrote:
> > If in one callchain we do:
> >
> > get_online_cpus();
> > lock(A);
> >
> > in another we do:
> >
> > lock(A);
> > get_online_cpus();
> >
> > lockdep will complain about the inverted lock order, however this is not
> > a problem at all for recursive locks.
>
> Ah, but in this case lockdep is right. This is deadlockable because
> with the new implementation percpu_down_write() blocks the new readers.
> So this change just hides the valid warning.
>
> Just suppose that the 3rd CPU does percpu_down_write()->down_write()
> right after the 2nd CPU (above) takes lock(A).
>
> I have to admit that I didn't realize that the code above is currently
> correct... but it is.
>
> So we need percpu_down_write_dont_block_readers(). I already thought
> about this before, I'll try to make the patch tomorrow on top of your
> changes.
>
> This means that we do not need task_struct->cpuhp_ref, but we can't
> avoid livelock we currently have: cpu_hotplug_begin() can never succeed
> if the new readers come fast enough.
I'm confused.. why isn't the read-in-read recursion good enough?
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 7:35 ` Peter Zijlstra
2015-06-24 8:42 ` Ingo Molnar
@ 2015-06-24 14:50 ` Paul E. McKenney
2015-06-24 15:01 ` Peter Zijlstra
1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 14:50 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 09:35:03AM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > I really think you're making that expedited nonsense far too accessible.
> >
> > This has nothing to do with accessibility and everything to do with
> > robustness. And with me not becoming the triage center for too many
> > non-RCU bugs.
>
> But by making it so you're rewarding abuse instead of flagging it :-(
As discussed in the thread with Ingo, I will do both.
Alternatively, RCU -is- abuse. Anyone who tries to tell you
otherwise simply lacks proper respect for and adoration of traditional
synchronization mechanisms. ;-)
> > > > And we still need to be able to drop back to synchronize_sched()
> > > > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > > > creative user and a long-running RCU-sched read-side critical section.
> > >
> > > No, a long-running RCU-sched read-side is a bug and we should fix that,
> > > its called a preemption-latency, we don't like those.
> >
> > Yes, we should fix them. No, they absolutely must not result in a
> > meltdown of some unrelated portion of the kernel (like RCU), particularly
> > if this situation occurs on some system running a production workload
> > that doesn't happen to care about preemption latency.
>
> I still don't see a problem here though; the stop_one_cpu() invocation
> for the CPU that's suffering its preemption latency will take longer,
> but so what?
>
> How does polling and dropping back to sync_rcu() generate better
> behaviour than simply waiting for the completion?
Because if there is too much delay, synchronize_rcu() is no slower
than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
more efficient.
That said, it appears that I have not given any particular thought to the
polling code since about 2008 or so, and it could use quite an upgrade...
> > > > > + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > > >
> > > > My thought was to use smp_call_function_single(), and to have the function
> > > > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > > > if so.
> > >
> > > set_tsk_need_resched() is buggy and should not be used.
> >
> > OK, what API is used for this purpose?
>
> As per exception you (rcu) already have access to resched_cpu(), use
> that -- if it doesn't do what you need it to, we'll fix it, you're the
> only consumer of it.
Color me slow and stupid!
And it looks like resched_cpu() does just fine on the local CPU, so it
should be just fine as is. Thank you for the reminder.
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 8:32 ` Peter Zijlstra
2015-06-24 9:31 ` Peter Zijlstra
@ 2015-06-24 15:01 ` Paul E. McKenney
2015-06-24 15:34 ` Peter Zijlstra
1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 15:01 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 07:23:44PM -0700, Paul E. McKenney wrote:
> > And here is an untested patch that applies the gist of your approach,
> > the series of stop_one_cpu() calls, but without undoing the rest.
> > I forged your Signed-off-by, please let me know if that doesn't work
> > for you. There are a number of simplifications that can be made, but
> > the basic approach gets a good testing first.
>
> So I really do not get the point of the trylock. It doesn't make sense.
>
> Why would you poll the mutex instead of just wait for it and then
> recheck if someone did the work while you were waiting for it?
>
> What's wrong with the below?
Various delays can cause tasks to queue on the mutex out of order.
This can cause a given task not only to have been delayed between
sampling ->expedited_start and the mutex_lock(), but be further delayed
because tasks granted the mutex earlier will wait on grace periods that
the delayed task doesn't need to wait on. These extra waits are simply
not consistent with the "expedited" in synchronize_sched_expedited().
That said, my polling code can most definitely be improved -- as I
mentioned earlier, it is from 2008 or so, back when a lot of things
worked differently. My first thought is to apply something sort of
like force_quiescent_state()'s funnel locking, but with unconditional
mutex_lock() instead of the raw_spin_trylock(). That way, when a given
task is awakened, there is a high probability that a grace period it
can use has already elapsed, allowing it to break out of the loop and go
on its way. This can probably be further improved, but it is a decent
place for me to start.
Thanx, Paul
> ---
> kernel/rcu/tree.c | 100 +++++++++++++++---------------------------------------
> kernel/rcu/tree.h | 1 +
> 2 files changed, 29 insertions(+), 72 deletions(-)
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index add042926a66..b39a5672a7ac 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
> .orphan_nxttail = &sname##_state.orphan_nxtlist, \
> .orphan_donetail = &sname##_state.orphan_donelist, \
> .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
> + .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
> .name = RCU_STATE_NAME(sname), \
> .abbr = sabbr, \
> }
> @@ -3304,12 +3305,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
> */
> void synchronize_sched_expedited(void)
> {
> - cpumask_var_t cm;
> - bool cma = false;
> - int cpu;
> - long firstsnap, s, snap;
> - int trycount = 0;
> struct rcu_state *rsp = &rcu_sched_state;
> + long s, snap;
> + int cpu;
>
> /*
> * If we are in danger of counter wrap, just do synchronize_sched().
> @@ -3332,7 +3330,6 @@ void synchronize_sched_expedited(void)
> * full memory barrier.
> */
> snap = atomic_long_inc_return(&rsp->expedited_start);
> - firstsnap = snap;
> if (!try_get_online_cpus()) {
> /* CPU hotplug operation in flight, fall back to normal GP. */
> wait_rcu_gp(call_rcu_sched);
> @@ -3341,83 +3338,40 @@ void synchronize_sched_expedited(void)
> }
> WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
>
> - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> - cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> - if (cma) {
> - cpumask_copy(cm, cpu_online_mask);
> - cpumask_clear_cpu(raw_smp_processor_id(), cm);
> - for_each_cpu(cpu, cm) {
> - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> -
> - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> - cpumask_clear_cpu(cpu, cm);
> - }
> - if (cpumask_weight(cm) == 0)
> - goto all_cpus_idle;
> - }
> -
> /*
> * Each pass through the following loop attempts to force a
> * context switch on each CPU.
> */
> - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> - synchronize_sched_expedited_cpu_stop,
> - NULL) == -EAGAIN) {
> - put_online_cpus();
> - atomic_long_inc(&rsp->expedited_tryfail);
> + mutex_lock(&rsp->expedited_mutex);
>
> - /* Check to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone1);
> - free_cpumask_var(cm);
> - return;
> - }
> + /*
> + * Check to see if someone else did our work for us, while we were
> + * waiting for the mutex.
> + */
> + s = atomic_long_read(&rsp->expedited_done);
> + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> + /* ensure test happens before caller kfree */
> + smp_mb__before_atomic(); /* ^^^ */
> + atomic_long_inc(&rsp->expedited_workdone1);
> + goto unlock;
> + }
>
> - /* No joy, try again later. Or just synchronize_sched(). */
> - if (trycount++ < 10) {
> - udelay(trycount * num_online_cpus());
> - } else {
> - wait_rcu_gp(call_rcu_sched);
> - atomic_long_inc(&rsp->expedited_normal);
> - free_cpumask_var(cm);
> - return;
> - }
> + /* Stop each CPU that is online, non-idle, and not us. */
> + for_each_online_cpu(cpu) {
> + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
>
> - /* Recheck to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone2);
> - free_cpumask_var(cm);
> - return;
> - }
> + /* Skip our CPU, */
> + if (raw_smp_processor_id() == cpu)
> + continue;
>
> - /*
> - * Refetching sync_sched_expedited_started allows later
> - * callers to piggyback on our grace period. We retry
> - * after they started, so our grace period works for them,
> - * and they started after our first try, so their grace
> - * period works for us.
> - */
> - if (!try_get_online_cpus()) {
> - /* CPU hotplug operation in flight, use normal GP. */
> - wait_rcu_gp(call_rcu_sched);
> - atomic_long_inc(&rsp->expedited_normal);
> - free_cpumask_var(cm);
> - return;
> - }
> - snap = atomic_long_read(&rsp->expedited_start);
> - smp_mb(); /* ensure read is before try_stop_cpus(). */
> + /* and any idle CPUs. */
> + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> + continue;
> +
> + stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> }
> atomic_long_inc(&rsp->expedited_stoppedcpus);
>
> -all_cpus_idle:
> - free_cpumask_var(cm);
> -
> /*
> * Everyone up to our most recent fetch is covered by our grace
> * period. Update the counter, but only if our work is still
> @@ -3435,6 +3389,8 @@ void synchronize_sched_expedited(void)
> }
> } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
> atomic_long_inc(&rsp->expedited_done_exit);
> +unlock:
> + mutex_unlock(&rsp->expedited_mutex);
>
> put_online_cpus();
> }
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 4adb7ca0bf47..10348c081e8e 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -483,6 +483,7 @@ struct rcu_state {
> /* _rcu_barrier(). */
> /* End of fields guarded by barrier_mutex. */
>
> + struct mutex expedited_mutex; /* Serializes expediting. */
> atomic_long_t expedited_start; /* Starting ticket. */
> atomic_long_t expedited_done; /* Done ticket. */
> atomic_long_t expedited_wrap; /* # near-wrap incidents. */
>
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 14:50 ` Paul E. McKenney
@ 2015-06-24 15:01 ` Peter Zijlstra
2015-06-24 15:27 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 15:01 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 07:50:42AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 24, 2015 at 09:35:03AM +0200, Peter Zijlstra wrote:
> > I still don't see a problem here though; the stop_one_cpu() invocation
> > for the CPU that's suffering its preemption latency will take longer,
> > but so what?
> >
> > How does polling and dropping back to sync_rcu() generate better
> > behaviour than simply waiting for the completion?
>
> Because if there is too much delay, synchronize_rcu() is no slower
> than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
> more efficient.
Still confused.. How is polling and then blocking more efficient than
just blocking in the first place? I'm seeing the polling as a waste of
cpu time.
The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
is equally stalled. The sync_rcu() cannot wait more efficient than we're
already waiting either.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-24 14:13 ` Peter Zijlstra
@ 2015-06-24 15:12 ` Oleg Nesterov
2015-06-24 16:15 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-24 15:12 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/24, Peter Zijlstra wrote:
>
> On Wed, Jun 24, 2015 at 03:50:49PM +0200, Oleg Nesterov wrote:
> > On 06/23, Peter Zijlstra wrote:
> > > If in one callchain we do:
> > >
> > > get_online_cpus();
> > > lock(A);
> > >
> > > in another we do:
> > >
> > > lock(A);
> > > get_online_cpus();
> > >
> > > lockdep will complain about the inverted lock order, however this is not
> > > a problem at all for recursive locks.
> >
> > Ah, but in this case lockdep is right. This is deadlockable because
> > with the new implementation percpu_down_write() blocks the new readers.
> > So this change just hides the valid warning.
> >
> > Just suppose that the 3rd CPU does percpu_down_write()->down_write()
> > right after the 2nd CPU (above) takes lock(A).
> >
> > I have to admit that I didn't realize that the code above is currently
> > correct... but it is.
> >
> > So we need percpu_down_write_dont_block_readers(). I already thought
> > about this before, I'll try to make the patch tomorrow on top of your
> > changes.
> >
> > This means that we do not need task_struct->cpuhp_ref, but we can't
> > avoid livelock we currently have: cpu_hotplug_begin() can never succeed
> > if the new readers come fast enough.
>
> I'm confused.. why isn't the read-in-read recursion good enough?
Because the code above can actually deadlock if 2 CPU's do this at
the same time?
task_struct->cpuhp_ref only makes read-in-read work, but
percpu_down_write() blocks the new readers.
Suppose that ->cpuhp_ref == 0 on CPU's 0 and 1, suppose that CPU 2
does percpu_down_write() and "sem->state = readers_block" is already
visible to CPU 1 when it calls get_online_cpus().
CPU_0 CPU_1 CPU_2
get_online_cpus(); lock(A);
// waits for CPU_1
lock(A)
// waits for CPU_0
percpu_down_write();
// waits for CPU_2
get_online_cpus();
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 15:01 ` Peter Zijlstra
@ 2015-06-24 15:27 ` Paul E. McKenney
2015-06-24 15:40 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 15:27 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 05:01:51PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 07:50:42AM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 24, 2015 at 09:35:03AM +0200, Peter Zijlstra wrote:
>
> > > I still don't see a problem here though; the stop_one_cpu() invocation
> > > for the CPU that's suffering its preemption latency will take longer,
> > > but so what?
> > >
> > > How does polling and dropping back to sync_rcu() generate better
> > > behaviour than simply waiting for the completion?
> >
> > Because if there is too much delay, synchronize_rcu() is no slower
> > than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
> > more efficient.
>
> Still confused.. How is polling and then blocking more efficient than
> just blocking in the first place? I'm seeing the polling as a waste of
> cpu time.
As I said, the current code is quite old and will get a facelift.
> The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
> is equally stalled. The sync_rcu() cannot wait more efficient than we're
> already waiting either.
Ah, but synchronize_rcu() doesn't force waiting on more than one extra
grace period. With strictly queued mutex, you can end up waiting on
several.
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 15:01 ` Paul E. McKenney
@ 2015-06-24 15:34 ` Peter Zijlstra
0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 15:34 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 08:01:29AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 23, 2015 at 07:23:44PM -0700, Paul E. McKenney wrote:
> > > And here is an untested patch that applies the gist of your approach,
> > > the series of stop_one_cpu() calls, but without undoing the rest.
> > > I forged your Signed-off-by, please let me know if that doesn't work
> > > for you. There are a number of simplifications that can be made, but
> > > the basic approach gets a good testing first.
> >
> > So I really do not get the point of the trylock. It doesn't make sense.
> >
> > Why would you poll the mutex instead of just wait for it and then
> > recheck if someone did the work while you were waiting for it?
> >
> > What's wrong with the below?
>
> Various delays can cause tasks to queue on the mutex out of order.
If the mutex owner sleeps, mutexes are FIFO, otherwise things can get
iffy indeed.
> This can cause a given task not only to have been delayed between
> sampling ->expedited_start and the mutex_lock(), but be further delayed
> because tasks granted the mutex earlier will wait on grace periods that
> the delayed task doesn't need to wait on. These extra waits are simply
> not consistent with the "expedited" in synchronize_sched_expedited().
Feh, I really do not know if its worth optimizing the concurrent
expedited case, but we could just make it an open-coded mutex that's
strictly FIFO. A waitqueue on the done variable might be sufficient.
That's still tons better than polling.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 15:27 ` Paul E. McKenney
@ 2015-06-24 15:40 ` Peter Zijlstra
2015-06-24 16:09 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 15:40 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 08:27:19AM -0700, Paul E. McKenney wrote:
> > The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
> > is equally stalled. The sync_rcu() cannot wait more efficient than we're
> > already waiting either.
>
> Ah, but synchronize_rcu() doesn't force waiting on more than one extra
> grace period. With strictly queued mutex, you can end up waiting on
> several.
But you could fix that by replacing/augmenting the expedited ticket with
gpnum/completed as used in get_state_synchronize_rcu()/cond_synchronize_rcu().
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 15:40 ` Peter Zijlstra
@ 2015-06-24 16:09 ` Paul E. McKenney
2015-06-24 16:42 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 16:09 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 05:40:10PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 08:27:19AM -0700, Paul E. McKenney wrote:
> > > The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
> > > is equally stalled. The sync_rcu() cannot wait more efficient than we're
> > > already waiting either.
> >
> > Ah, but synchronize_rcu() doesn't force waiting on more than one extra
> > grace period. With strictly queued mutex, you can end up waiting on
> > several.
>
> But you could fix that by replacing/augmenting the expedited ticket with
> gpnum/completed as used in get_state_synchronize_rcu()/cond_synchronize_rcu().
Yes, good point, that would be a way of speeding the existing polling
loop up in the case where the polling loop took longer than a normal
grace period. Might also be a way to speed up the new "polling" regime,
but I am still beating up the counters. ;-)
But if the mutex serializes everything unconditionally, then you have
already potentially waited for several grace periods worth of time
before you get a chance to check the ticket, so the check doesn't help.
Or am I missing something subtle here?
It looks like I do need to use smp_call_function_single() and your
resched_cpu() because calling stop_one_cpu() sequentially is about
twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
But either way, your point about not stopping all the CPUs does hold.
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
2015-06-24 15:12 ` Oleg Nesterov
@ 2015-06-24 16:15 ` Peter Zijlstra
0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 16:15 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Wed, Jun 24, 2015 at 05:12:12PM +0200, Oleg Nesterov wrote:
> On 06/24, Peter Zijlstra wrote:
> > I'm confused.. why isn't the read-in-read recursion good enough?
>
> Because the code above can actually deadlock if 2 CPU's do this at
> the same time?
Hmm yes.. this makes the hotplug locking worse than I feared it was, but
alas.
FYI, the actual splat.
---
[ 7.399737] ======================================================
[ 7.406640] [ INFO: possible circular locking dependency detected ]
[ 7.413643] 4.1.0-02756-ge3d06bd-dirty #185 Not tainted
[ 7.419481] -------------------------------------------------------
[ 7.426483] kworker/0:1/215 is trying to acquire lock:
[ 7.432221] (&cpu_hotplug.rwsem){++++++}, at: [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[ 7.442564]
[ 7.442564] but task is already holding lock:
[ 7.449079] (&item->mutex){+.+.+.}, at: [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[ 7.458455]
[ 7.458455] which lock already depends on the new lock.
[ 7.458455]
[ 7.467591]
[ 7.467591] the existing dependency chain (in reverse order) is:
[ 7.475949]
-> #3 (&item->mutex){+.+.+.}:
[ 7.480662] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.487280] [<ffffffff818ea777>] mutex_lock_nested+0x47/0x3c0
[ 7.494390] [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[ 7.501596] [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[ 7.508514] [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[ 7.515916] [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[ 7.522922] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 7.529840] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 7.536463] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 7.543283] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 7.550106] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 7.557214] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 7.564029] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 7.570166] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.576792]
-> #2 (drm_global_mutex){+.+.+.}:
[ 7.581891] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.588514] [<ffffffff818ea777>] mutex_lock_nested+0x47/0x3c0
[ 7.595622] [<ffffffff815b1406>] drm_dev_register+0x26/0x100
[ 7.602632] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 7.609547] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 7.616170] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 7.622987] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 7.629806] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 7.636913] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 7.643727] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 7.649866] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.656490]
-> #1 ((&wfc.work)){+.+.+.}:
[ 7.661104] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.667727] [<ffffffff810e737d>] flush_work+0x3d/0x260
[ 7.674155] [<ffffffff810e9822>] work_on_cpu+0x82/0x90
[ 7.680584] [<ffffffff814bf2a2>] pci_device_probe+0x112/0x120
[ 7.687692] [<ffffffff815e685f>] driver_probe_device+0x17f/0x2e0
[ 7.695094] [<ffffffff815e6a94>] __driver_attach+0x94/0xa0
[ 7.701910] [<ffffffff815e4786>] bus_for_each_dev+0x66/0xa0
[ 7.708824] [<ffffffff815e626e>] driver_attach+0x1e/0x20
[ 7.715447] [<ffffffff815e5ed8>] bus_add_driver+0x168/0x210
[ 7.722361] [<ffffffff815e7880>] driver_register+0x60/0xe0
[ 7.729180] [<ffffffff814bd754>] __pci_register_driver+0x64/0x70
[ 7.736580] [<ffffffff81f9a10d>] pcie_portdrv_init+0x66/0x79
[ 7.743593] [<ffffffff810002c8>] do_one_initcall+0x88/0x1c0
[ 7.750508] [<ffffffff81f5f169>] kernel_init_freeable+0x1f5/0x282
[ 7.758005] [<ffffffff818da36e>] kernel_init+0xe/0xe0
[ 7.764338] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.770961]
-> #0 (&cpu_hotplug.rwsem){++++++}:
[ 7.776255] [<ffffffff81122817>] __lock_acquire+0x2207/0x2240
[ 7.783363] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 7.789986] [<ffffffff810cb6e2>] get_online_cpus+0x62/0xb0
[ 7.796805] [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[ 7.804398] [<ffffffff810ed7bc>] __alloc_workqueue_key+0x2ec/0x560
[ 7.811992] [<ffffffff815cbefa>] ttm_mem_global_init+0x5a/0x310
[ 7.819295] [<ffffffff815dcbb2>] mgag200_ttm_mem_global_init+0x12/0x20
[ 7.827277] [<ffffffff815c4df5>] drm_global_item_ref+0x65/0xe0
[ 7.834481] [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[ 7.841395] [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[ 7.848793] [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[ 7.855804] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 7.862715] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 7.869338] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 7.876159] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 7.882979] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 7.890087] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 7.896907] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 7.903043] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 7.909673]
[ 7.909673] other info that might help us debug this:
[ 7.909673]
[ 7.918616] Chain exists of:
&cpu_hotplug.rwsem --> drm_global_mutex --> &item->mutex
[ 7.927907] Possible unsafe locking scenario:
[ 7.927907]
[ 7.934521] CPU0 CPU1
[ 7.939580] ---- ----
[ 7.944639] lock(&item->mutex);
[ 7.948359] lock(drm_global_mutex);
[ 7.955292] lock(&item->mutex);
[ 7.961855] lock(&cpu_hotplug.rwsem);
[ 7.966158]
[ 7.966158] *** DEADLOCK ***
[ 7.966158]
[ 7.972771] 4 locks held by kworker/0:1/215:
[ 7.977539] #0: ("events"){.+.+.+}, at: [<ffffffff810e9cc6>] process_one_work+0x156/0x7e0
[ 7.986929] #1: ((&wfc.work)){+.+.+.}, at: [<ffffffff810e9cc6>] process_one_work+0x156/0x7e0
[ 7.996600] #2: (drm_global_mutex){+.+.+.}, at: [<ffffffff815b1406>] drm_dev_register+0x26/0x100
[ 8.006690] #3: (&item->mutex){+.+.+.}, at: [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[ 8.016559]
[ 8.016559] stack backtrace:
[ 8.021427] CPU: 0 PID: 215 Comm: kworker/0:1 Not tainted 4.1.0-02756-ge3d06bd-dirty #185
[ 8.030565] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[ 8.042034] Workqueue: events work_for_cpu_fn
[ 8.046909] ffffffff82857e30 ffff88042b3437c8 ffffffff818e5189 0000000000000011
[ 8.055216] ffffffff8282aa40 ffff88042b343818 ffffffff8111ee76 0000000000000004
[ 8.063522] ffff88042b343888 ffff88042b33f040 0000000000000004 ffff88042b33f040
[ 8.071827] Call Trace:
[ 8.074559] [<ffffffff818e5189>] dump_stack+0x4c/0x6e
[ 8.080300] [<ffffffff8111ee76>] print_circular_bug+0x1c6/0x220
[ 8.087011] [<ffffffff81122817>] __lock_acquire+0x2207/0x2240
[ 8.093528] [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[ 8.099559] [<ffffffff810ebd63>] ? apply_workqueue_attrs+0x183/0x4b0
[ 8.106755] [<ffffffff810cb6e2>] get_online_cpus+0x62/0xb0
[ 8.112981] [<ffffffff810ebd63>] ? apply_workqueue_attrs+0x183/0x4b0
[ 8.120176] [<ffffffff810ead27>] ? alloc_workqueue_attrs+0x27/0x80
[ 8.127178] [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[ 8.134182] [<ffffffff8111cc21>] ? debug_mutex_init+0x31/0x40
[ 8.140690] [<ffffffff810ed7bc>] __alloc_workqueue_key+0x2ec/0x560
[ 8.147691] [<ffffffff815cbefa>] ttm_mem_global_init+0x5a/0x310
[ 8.154405] [<ffffffff8122b050>] ? __kmalloc+0x5e0/0x630
[ 8.160435] [<ffffffff815c4de2>] ? drm_global_item_ref+0x52/0xe0
[ 8.167243] [<ffffffff815dcbb2>] mgag200_ttm_mem_global_init+0x12/0x20
[ 8.174631] [<ffffffff815c4df5>] drm_global_item_ref+0x65/0xe0
[ 8.181245] [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[ 8.187570] [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[ 8.194383] [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[ 8.200802] [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[ 8.207125] [<ffffffff818ebf9e>] ? mutex_unlock+0xe/0x10
[ 8.213156] [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[ 8.219187] [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[ 8.225412] [<ffffffff8111db81>] ? __lock_is_held+0x51/0x80
[ 8.231736] [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[ 8.237962] [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[ 8.244477] [<ffffffff810e9cc6>] ? process_one_work+0x156/0x7e0
[ 8.251187] [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[ 8.257410] [<ffffffff810ea350>] ? process_one_work+0x7e0/0x7e0
[ 8.264120] [<ffffffff810ea350>] ? process_one_work+0x7e0/0x7e0
[ 8.270829] [<ffffffff810f05b6>] kthread+0xf6/0x110
[ 8.276375] [<ffffffff818ee230>] ? _raw_spin_unlock_irq+0x30/0x60
[ 8.283282] [<ffffffff810f04c0>] ? kthread_create_on_node+0x220/0x220
[ 8.290566] [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[ 8.296597] [<ffffffff810f04c0>] ? kthread_create_on_node+0x220/0x220
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 16:09 ` Paul E. McKenney
@ 2015-06-24 16:42 ` Peter Zijlstra
2015-06-24 17:10 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 16:42 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 09:09:04AM -0700, Paul E. McKenney wrote:
> Yes, good point, that would be a way of speeding the existing polling
> loop up in the case where the polling loop took longer than a normal
> grace period. Might also be a way to speed up the new "polling" regime,
> but I am still beating up the counters. ;-)
>
> But if the mutex serializes everything unconditionally, then you have
> already potentially waited for several grace periods worth of time
> before you get a chance to check the ticket, so the check doesn't help.
> Or am I missing something subtle here?
Observe gpnum before you acquire the mutex, once you get it, check it
against completed, if you've waited long enough, bail.
The thing is, once you start bailing on this condition your 'queue'
drains very fast and this is around the same time sync_rcu() would've
released the waiters too.
Furthermore, until this point we can have 'slow' progress by kicking the
CPUs.
That said, the all cpus concurrent sync_rcu_expedited scenario is
absolutely horrid, it's everyone spraying everyone else.
> It looks like I do need to use smp_call_function_single() and your
> resched_cpu() because calling stop_one_cpu() sequentially is about
> twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
> But either way, your point about not stopping all the CPUs does hold.
Bah, I was afraid of that, the problem is that we wait for the
individual stop_work to complete before sending another.
The below is getting a little out of hand, but should avoid the problem
and might be easier than getting the IPI thing going, but who knows.
---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+ .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}
@@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+struct exp_stop_state {
+ wait_queue_head_t *wq;
+ atomic_t count;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct exp_stop_state *ess = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ess->count))
+ wake_up(ess->wq);
+
return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+ struct exp_stop_state ess = { .wq = &stop_wait, };
struct rcu_state *rsp = &rcu_sched_state;
+ long s, snap;
+ int cpu;
/*
* If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
* full memory barrier.
*/
snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
+ mutex_lock(&rsp->expedited_mutex);
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ /*
+ * Check to see if someone else did our work for us, while we were
+ * waiting for the mutex.
+ */
+ s = atomic_long_read(&rsp->expedited_done);
+ if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+ /* ensure test happens before caller kfree */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(&rsp->expedited_workdone1);
+ goto unlock;
+ }
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&ess.count);
+	stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
+			    &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ wait_event(ess.wq, !atomic_read(&ess.count));
+
+ atomic_long_inc(&rsp->expedited_stoppedcpus);
/*
* Everyone up to our most recent fetch is covered by our grace
@@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
}
} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ mutex_unlock(&rsp->expedited_mutex);
put_online_cpus();
}
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ struct mutex expedited_mutex; /* Serializes expediting. */
atomic_long_t expedited_start; /* Starting ticket. */
atomic_long_t expedited_done; /* Done ticket. */
atomic_long_t expedited_wrap; /* # near-wrap incidents. */
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 16:42 ` Peter Zijlstra
@ 2015-06-24 17:10 ` Paul E. McKenney
2015-06-24 17:20 ` Paul E. McKenney
` (2 more replies)
0 siblings, 3 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 17:10 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 06:42:00PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 09:09:04AM -0700, Paul E. McKenney wrote:
> > Yes, good point, that would be a way of speeding the existing polling
> > loop up in the case where the polling loop took longer than a normal
> > grace period. Might also be a way to speed up the new "polling" regime,
> > but I am still beating up the counters. ;-)
> >
> > But if the mutex serializes everything unconditionally, then you have
> > already potentially waited for several grace periods worth of time
> > before you get a chance to check the ticket, so the check doesn't help.
> > Or am I missing something subtle here?
>
> Observe gpnum before you acquire the mutex, once you get it, check it
> against completed, if you've waited long enough, bail.
>
> The thing is, once you start bailing on this condition your 'queue'
> drains very fast and this is around the same time sync_rcu() would've
> released the waiters too.
In my experience, this sort of thing simply melts down on large systems.
I am reworking this with multiple locks so as to keep the large-system
contention down to a dull roar.
> Furthermore, until this point we can have 'slow' progress by kicking the
> CPUs.
>
> That said, the all cpus concurrent sync_rcu_expedited scenario is
> absolutely horrid, it's everyone spraying everyone else.
Agreed, but we really need a system in this state to remain responsive
enough to allow reasonable debugging to proceed rather than just silently
hanging. Ergo, I will be providing multiple locks to keep contention
within the realm of reason. It really isn't complex enough to be worth
arguing about. Maybe 20 lines of straightforward code. (Yeah, yeah,
Murphy says otherwise, but he will have to prove it.)
> > It looks like I do need to use smp_call_function_single() and your
> > resched_cpu() because calling stop_one_cpu() sequentially is about
> > twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
> > But either way, your point about not stopping all the CPUs does hold.
>
> Bah, I was afraid of that, the problem is that we wait for the
> individual stop_work to complete before sending another.
>
> The below is getting a little out of hand, but should avoid the problem
> and might be easier than getting the IPI thing going, but who knows.
OK, I will give this a try. Of course, the counter needs to be
initialized to 1 rather than zero, and it needs to be atomically
decremented after all stop_one_cpu_nowait() invocations, otherwise you
can get an early wakeup due to the usual race conditions.
Thanx, Paul
> ---
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
> .orphan_nxttail = &sname##_state.orphan_nxtlist, \
> .orphan_donetail = &sname##_state.orphan_donelist, \
> .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
> + .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
> .name = RCU_STATE_NAME(sname), \
> .abbr = sabbr, \
> }
> @@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
> }
> EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
>
> +struct exp_stop_state {
> + wait_queue_head_t *wq;
> + atomic_t count;
> +};
> +
> static int synchronize_sched_expedited_cpu_stop(void *data)
> {
> + struct exp_stop_state *ess = data;
> +
> /*
> * There must be a full memory barrier on each affected CPU
> * between the time that try_stop_cpus() is called and the
> * time that it returns.
> - *
> - * In the current initial implementation of cpu_stop, the
> - * above condition is already met when the control reaches
> - * this point and the following smp_mb() is not strictly
> - * necessary. Do smp_mb() anyway for documentation and
> - * robustness against future implementation changes.
> */
> - smp_mb(); /* See above comment block. */
> + if (atomic_dec_and_test(&ess->count))
> + wake_up(ess->wq);
> +
> return 0;
> }
>
> +static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
> +
> /**
> * synchronize_sched_expedited - Brute-force RCU-sched grace period
> *
> @@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
> */
> void synchronize_sched_expedited(void)
> {
> - cpumask_var_t cm;
> - bool cma = false;
> - int cpu;
> - long firstsnap, s, snap;
> - int trycount = 0;
> + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
> + struct exp_stop_state ess = { .wq = &stop_wait, };
> struct rcu_state *rsp = &rcu_sched_state;
> + long s, snap;
> + int cpu;
>
> /*
> * If we are in danger of counter wrap, just do synchronize_sched().
> @@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
> * full memory barrier.
> */
> snap = atomic_long_inc_return(&rsp->expedited_start);
> - firstsnap = snap;
> if (!try_get_online_cpus()) {
> /* CPU hotplug operation in flight, fall back to normal GP. */
> wait_rcu_gp(call_rcu_sched);
> @@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
> }
> WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
>
> - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> - cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> - if (cma) {
> - cpumask_copy(cm, cpu_online_mask);
> - cpumask_clear_cpu(raw_smp_processor_id(), cm);
> - for_each_cpu(cpu, cm) {
> - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> -
> - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> - cpumask_clear_cpu(cpu, cm);
> - }
> - if (cpumask_weight(cm) == 0)
> - goto all_cpus_idle;
> - }
> -
> /*
> * Each pass through the following loop attempts to force a
> * context switch on each CPU.
> */
> - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> - synchronize_sched_expedited_cpu_stop,
> - NULL) == -EAGAIN) {
> - put_online_cpus();
> - atomic_long_inc(&rsp->expedited_tryfail);
> + mutex_lock(&rsp->expedited_mutex);
>
> - /* Check to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone1);
> - free_cpumask_var(cm);
> - return;
> - }
> + /*
> + * Check to see if someone else did our work for us, while we were
> + * waiting for the mutex.
> + */
> + s = atomic_long_read(&rsp->expedited_done);
> + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> + /* ensure test happens before caller kfree */
> + smp_mb__before_atomic(); /* ^^^ */
> + atomic_long_inc(&rsp->expedited_workdone1);
> + goto unlock;
> + }
>
> - /* No joy, try again later. Or just synchronize_sched(). */
> - if (trycount++ < 10) {
> - udelay(trycount * num_online_cpus());
> - } else {
> - wait_rcu_gp(call_rcu_sched);
> - atomic_long_inc(&rsp->expedited_normal);
> - free_cpumask_var(cm);
> - return;
> - }
> + /* Stop each CPU that is online, non-idle, and not us. */
> + for_each_online_cpu(cpu) {
> + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
>
> - /* Recheck to see if someone else did our work for us. */
> - s = atomic_long_read(&rsp->expedited_done);
> - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> - /* ensure test happens before caller kfree */
> - smp_mb__before_atomic(); /* ^^^ */
> - atomic_long_inc(&rsp->expedited_workdone2);
> - free_cpumask_var(cm);
> - return;
> - }
> + /* Skip our CPU, */
> + if (raw_smp_processor_id() == cpu)
> + continue;
>
> - /*
> - * Refetching sync_sched_expedited_started allows later
> - * callers to piggyback on our grace period. We retry
> - * after they started, so our grace period works for them,
> - * and they started after our first try, so their grace
> - * period works for us.
> - */
> - if (!try_get_online_cpus()) {
> - /* CPU hotplug operation in flight, use normal GP. */
> - wait_rcu_gp(call_rcu_sched);
> - atomic_long_inc(&rsp->expedited_normal);
> - free_cpumask_var(cm);
> - return;
> - }
> - snap = atomic_long_read(&rsp->expedited_start);
> - smp_mb(); /* ensure read is before try_stop_cpus(). */
> + /* and any idle CPUs. */
> + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> + continue;
> +
> +	atomic_inc(&ess.count);
> +	stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
> +			    &per_cpu(exp_stop_work, cpu));
> }
> - atomic_long_inc(&rsp->expedited_stoppedcpus);
>
> -all_cpus_idle:
> - free_cpumask_var(cm);
> + wait_event(ess.wq, !atomic_read(&ess.count));
> +
> + atomic_long_inc(&rsp->expedited_stoppedcpus);
>
> /*
> * Everyone up to our most recent fetch is covered by our grace
> @@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
> }
> } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
> atomic_long_inc(&rsp->expedited_done_exit);
> +unlock:
> + mutex_unlock(&rsp->expedited_mutex);
>
> put_online_cpus();
> }
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -483,6 +483,7 @@ struct rcu_state {
> /* _rcu_barrier(). */
> /* End of fields guarded by barrier_mutex. */
>
> + struct mutex expedited_mutex; /* Serializes expediting. */
> atomic_long_t expedited_start; /* Starting ticket. */
> atomic_long_t expedited_done; /* Done ticket. */
> atomic_long_t expedited_wrap; /* # near-wrap incidents. */
>
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:10 ` Paul E. McKenney
@ 2015-06-24 17:20 ` Paul E. McKenney
2015-06-24 17:29 ` Peter Zijlstra
2015-06-24 17:28 ` Peter Zijlstra
2015-06-24 17:58 ` Peter Zijlstra
2 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 17:20 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 10:10:04AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 24, 2015 at 06:42:00PM +0200, Peter Zijlstra wrote:
> > On Wed, Jun 24, 2015 at 09:09:04AM -0700, Paul E. McKenney wrote:
[ . . . ]
> > > It looks like I do need to use smp_call_function_single() and your
> > > resched_cpu() because calling stop_one_cpu() sequentially is about
> > > twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
> > > But either way, your point about not stopping all the CPUs does hold.
> >
> > Bah, I was afraid of that, the problem is that we wait for the
> > individual stop_work to complete before sending another.
> >
> > The below is getting a little out of hand, but should avoid the problem
> > and might be easier than getting the IPI thing going, but who knows.
>
> OK, I will give this a try. Of course, the counter needs to be
> initialized to 1 rather than zero, and it needs to be atomically
> decremented after all stop_one_cpu_nowait() invocations, otherwise you
> can get an early wakeup due to the usual race conditions.
Except that I promised Ingo I would check for CPUs failing to schedule
quickly enough, which means that I must track them individually rather
than via a single counter...
You did have me going for a bit, though! ;-)
Thanx, Paul
> > ---
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
> > .orphan_nxttail = &sname##_state.orphan_nxtlist, \
> > .orphan_donetail = &sname##_state.orphan_donelist, \
> > .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
> > + .expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
> > .name = RCU_STATE_NAME(sname), \
> > .abbr = sabbr, \
> > }
> > @@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
> > }
> > EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
> >
> > +struct exp_stop_state {
> > + wait_queue_head_t *wq;
> > + atomic_t count;
> > +};
> > +
> > static int synchronize_sched_expedited_cpu_stop(void *data)
> > {
> > + struct exp_stop_state *ess = data;
> > +
> > /*
> > * There must be a full memory barrier on each affected CPU
> > * between the time that try_stop_cpus() is called and the
> > * time that it returns.
> > - *
> > - * In the current initial implementation of cpu_stop, the
> > - * above condition is already met when the control reaches
> > - * this point and the following smp_mb() is not strictly
> > - * necessary. Do smp_mb() anyway for documentation and
> > - * robustness against future implementation changes.
> > */
> > - smp_mb(); /* See above comment block. */
> > + if (atomic_dec_and_test(&ess->count))
> > + wake_up(ess->wq);
> > +
> > return 0;
> > }
> >
> > +static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
> > +
> > /**
> > * synchronize_sched_expedited - Brute-force RCU-sched grace period
> > *
> > @@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
> > */
> > void synchronize_sched_expedited(void)
> > {
> > - cpumask_var_t cm;
> > - bool cma = false;
> > - int cpu;
> > - long firstsnap, s, snap;
> > - int trycount = 0;
> > + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
> > + struct exp_stop_state ess = { .wq = &stop_wait, };
> > struct rcu_state *rsp = &rcu_sched_state;
> > + long s, snap;
> > + int cpu;
> >
> > /*
> > * If we are in danger of counter wrap, just do synchronize_sched().
> > @@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
> > * full memory barrier.
> > */
> > snap = atomic_long_inc_return(&rsp->expedited_start);
> > - firstsnap = snap;
> > if (!try_get_online_cpus()) {
> > /* CPU hotplug operation in flight, fall back to normal GP. */
> > wait_rcu_gp(call_rcu_sched);
> > @@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
> > }
> > WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
> >
> > - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > - cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> > - if (cma) {
> > - cpumask_copy(cm, cpu_online_mask);
> > - cpumask_clear_cpu(raw_smp_processor_id(), cm);
> > - for_each_cpu(cpu, cm) {
> > - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > -
> > - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > - cpumask_clear_cpu(cpu, cm);
> > - }
> > - if (cpumask_weight(cm) == 0)
> > - goto all_cpus_idle;
> > - }
> > -
> > /*
> > * Each pass through the following loop attempts to force a
> > * context switch on each CPU.
> > */
> > - while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > - synchronize_sched_expedited_cpu_stop,
> > - NULL) == -EAGAIN) {
> > - put_online_cpus();
> > - atomic_long_inc(&rsp->expedited_tryfail);
> > + mutex_lock(&rsp->expedited_mutex);
> >
> > - /* Check to see if someone else did our work for us. */
> > - s = atomic_long_read(&rsp->expedited_done);
> > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > - /* ensure test happens before caller kfree */
> > - smp_mb__before_atomic(); /* ^^^ */
> > - atomic_long_inc(&rsp->expedited_workdone1);
> > - free_cpumask_var(cm);
> > - return;
> > - }
> > + /*
> > + * Check to see if someone else did our work for us, while we were
> > + * waiting for the mutex.
> > + */
> > + s = atomic_long_read(&rsp->expedited_done);
> > + if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> > + /* ensure test happens before caller kfree */
> > + smp_mb__before_atomic(); /* ^^^ */
> > + atomic_long_inc(&rsp->expedited_workdone1);
> > + goto unlock;
> > + }
> >
> > - /* No joy, try again later. Or just synchronize_sched(). */
> > - if (trycount++ < 10) {
> > - udelay(trycount * num_online_cpus());
> > - } else {
> > - wait_rcu_gp(call_rcu_sched);
> > - atomic_long_inc(&rsp->expedited_normal);
> > - free_cpumask_var(cm);
> > - return;
> > - }
> > + /* Stop each CPU that is online, non-idle, and not us. */
> > + for_each_online_cpu(cpu) {
> > + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> >
> > - /* Recheck to see if someone else did our work for us. */
> > - s = atomic_long_read(&rsp->expedited_done);
> > - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > - /* ensure test happens before caller kfree */
> > - smp_mb__before_atomic(); /* ^^^ */
> > - atomic_long_inc(&rsp->expedited_workdone2);
> > - free_cpumask_var(cm);
> > - return;
> > - }
> > + /* Skip our CPU, */
> > + if (raw_smp_processor_id() == cpu)
> > + continue;
> >
> > - /*
> > - * Refetching sync_sched_expedited_started allows later
> > - * callers to piggyback on our grace period. We retry
> > - * after they started, so our grace period works for them,
> > - * and they started after our first try, so their grace
> > - * period works for us.
> > - */
> > - if (!try_get_online_cpus()) {
> > - /* CPU hotplug operation in flight, use normal GP. */
> > - wait_rcu_gp(call_rcu_sched);
> > - atomic_long_inc(&rsp->expedited_normal);
> > - free_cpumask_var(cm);
> > - return;
> > - }
> > - snap = atomic_long_read(&rsp->expedited_start);
> > - smp_mb(); /* ensure read is before try_stop_cpus(). */
> > + /* and any idle CPUs. */
> > + if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > + continue;
> > +
> > + atomic_inc(&ess.count);
> > + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
> > + &per_cpu(exp_stop_work, cpu));
> > }
> > - atomic_long_inc(&rsp->expedited_stoppedcpus);
> >
> > -all_cpus_idle:
> > - free_cpumask_var(cm);
> > + wait_event(ess.wq, !atomic_read(&ess.count));
> > +
> > + atomic_long_inc(&rsp->expedited_stoppedcpus);
> >
> > /*
> > * Everyone up to our most recent fetch is covered by our grace
> > @@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
> > }
> > } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
> > atomic_long_inc(&rsp->expedited_done_exit);
> > +unlock:
> > + mutex_unlock(&rsp->expedited_mutex);
> >
> > put_online_cpus();
> > }
> > --- a/kernel/rcu/tree.h
> > +++ b/kernel/rcu/tree.h
> > @@ -483,6 +483,7 @@ struct rcu_state {
> > /* _rcu_barrier(). */
> > /* End of fields guarded by barrier_mutex. */
> >
> > + struct mutex expedited_mutex; /* Serializes expediting. */
> > atomic_long_t expedited_start; /* Starting ticket. */
> > atomic_long_t expedited_done; /* Done ticket. */
> > atomic_long_t expedited_wrap; /* # near-wrap incidents. */
> >
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:10 ` Paul E. McKenney
2015-06-24 17:20 ` Paul E. McKenney
@ 2015-06-24 17:28 ` Peter Zijlstra
2015-06-24 17:32 ` Peter Zijlstra
2015-06-24 18:14 ` Peter Zijlstra
2015-06-24 17:58 ` Peter Zijlstra
2 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:28 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 10:10:17AM -0700, Paul E. McKenney wrote:
> OK, I will give this a try. Of course, the counter needs to be
> initialized to 1 rather than zero, and it needs to be atomically
> decremented after all stop_one_cpu_nowait() invocations, otherwise you
> can get an early wakeup due to the usual race conditions.
Clever that.
How about something like this: it replaces the mutex and start/done ticket
thing with an MCS style lockless FIFO queue.
It further uses the gpnum/completed thing to short-circuit things if
we've waited long enough.
---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,23 +3253,28 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+struct exp_stop_state {
+ wait_queue_head_t *wq;
+ atomic_t count;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct exp_stop_state *ess = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ess->count))
+ wake_up(ess->wq);
+
return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,138 +3309,84 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+ struct exp_stop_state ess = {
+ .wq = &stop_wait,
+ .count = ATOMIC_INIT(1),
+ };
struct rcu_state *rsp = &rcu_sched_state;
+ struct expedited_queue_task {
+ struct expedited_queue_task *next;
+ struct task_struct *task;
+ int done;
+ } *prev, *next, entry = {
+ .task = current,
+ };
+ long gpnum;
+ int cpu;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
-
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
return;
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ smp_mb();
+ gpnum = smp_load_acquire(&rsp->gpnum);
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
+ /* MCS style queue 'lock' */
+ prev = xchg(&rsp->expedited_queue, &entry);
+ if (prev) {
+ WRITE_ONCE(prev->next, &entry);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (smp_load_acquire(&entry.done))
+ break;
+ schedule();
}
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ __set_current_state(TASK_RUNNING);
}
/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * Check to see if someone else did our work for us, while we were
+ * waiting on the queue.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
-
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ if (ULONG_CMP_LT(gpnum, smp_load_acquire(&rsp->completed)))
+ goto unlock;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
+
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&ess.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ &ess, &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
+ atomic_dec(&ess.count);
-all_cpus_idle:
- free_cpumask_var(cm);
+ wait_event(stop_wait, !atomic_read(&ess.count));
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ /* MCS style queue 'unlock' */
+ next = READ_ONCE(entry.next);
+ if (!next) {
+ if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+ goto done;
+ while (!(next = READ_ONCE(entry.next)))
+ cpu_relax();
+ }
+ smp_store_release(&next->done, 1);
+done:
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,17 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ void *expedited_queue;
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:20 ` Paul E. McKenney
@ 2015-06-24 17:29 ` Peter Zijlstra
0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:29 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 10:20:18AM -0700, Paul E. McKenney wrote:
> Except that I promised Ingo I would check for CPUs failing to schedule
> quickly enough, which means that I must track them individually rather
> than via a single counter...
You can track individual CPUs timestamps by extending the per-cpu
storage we use for the exp_stop_work.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:28 ` Peter Zijlstra
@ 2015-06-24 17:32 ` Peter Zijlstra
2015-06-24 18:14 ` Peter Zijlstra
1 sibling, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:32 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 07:28:18PM +0200, Peter Zijlstra wrote:
> +unlock:
> + /* MCS style queue 'unlock' */
> + next = READ_ONCE(entry.next);
> + if (!next) {
> + if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
> + goto done;
> + while (!(next = READ_ONCE(entry.next)))
> + cpu_relax();
> + }
> + smp_store_release(&next->done, 1);
Do you suppose:
wake_up_process(next->task);
would help? :-)
>
> +done:
> put_online_cpus();
> }
> EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:10 ` Paul E. McKenney
2015-06-24 17:20 ` Paul E. McKenney
2015-06-24 17:28 ` Peter Zijlstra
@ 2015-06-24 17:58 ` Peter Zijlstra
2015-06-25 3:23 ` Paul E. McKenney
2 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:58 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 10:10:17AM -0700, Paul E. McKenney wrote:
> > The thing is, once you start bailing on this condition your 'queue'
> > drains very fast and this is around the same time sync_rcu() would've
> > released the waiters too.
>
> In my experience, this sort of thing simply melts down on large systems.
> I am reworking this with multiple locks so as to keep the large-system
> contention down to a dull roar.
So with the MCS queue we've got less global thrashing than you had with
the start/done tickets. Only the queue head on enqueue.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:28 ` Peter Zijlstra
2015-06-24 17:32 ` Peter Zijlstra
@ 2015-06-24 18:14 ` Peter Zijlstra
1 sibling, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 18:14 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 07:28:18PM +0200, Peter Zijlstra wrote:
> How about something like this: it replaces the mutex and start/done ticket
> thing with an MCS style lockless FIFO queue.
>
> It further uses the gpnum/completed thing to short-circuit things if
> we've waited long enough.
Prettier version
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,23 +3253,41 @@ void cond_synchronize_rcu(unsigned long
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+struct expedited_task_state {
+ struct expedited_task_state *next;
+ struct task_struct *task;
+ atomic_t count;
+ int done;
+};
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
+ struct expedited_task_state *ets = data;
+
/*
* There must be a full memory barrier on each affected CPU
* between the time that try_stop_cpus() is called and the
* time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
*/
- smp_mb(); /* See above comment block. */
+ if (atomic_dec_and_test(&ets->count))
+ wake_up_process(ets->task);
+
return 0;
}
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
+#define current_wait(cond) \
+do { \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ schedule(); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3304,138 +3322,71 @@ static int synchronize_sched_expedited_c
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
- int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
+ struct expedited_task_state *prev, *next, entry = {
+ .task = current,
+ .count = ATOMIC_INIT(1), /* avoid spurious wakeups */
+ };
+ long gpnum;
+ int cpu;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
-
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
return;
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ smp_mb();
+ gpnum = smp_load_acquire(&rsp->gpnum);
+
+ /* MCS style queue 'lock' */
+ prev = xchg(&rsp->expedited_queue, &entry);
+ if (prev) {
+ WRITE_ONCE(prev->next, &entry);
+ current_wait(smp_load_acquire(&entry.done));
}
/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * Check to see if someone else did our work for us, while we were
+ * waiting on the queue.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
-
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
-
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
-
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ if (ULONG_CMP_LT(gpnum, smp_load_acquire(&rsp->completed)))
+ goto unlock;
+
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ /* Skip our CPU, */
+ if (raw_smp_processor_id() == cpu)
+ continue;
+
+ /* and any idle CPUs. */
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+
+ atomic_inc(&entry.count);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ &entry, &per_cpu(exp_stop_work, cpu));
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ atomic_dec(&entry.count); /* let the wakeups in */
+ current_wait(!atomic_read(&entry.count));
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+ /* MCS style queue 'unlock' */
+ next = READ_ONCE(entry.next);
+ if (!next) {
+ if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+ goto done;
+ while (!(next = READ_ONCE(entry.next)))
+ cpu_relax();
+ }
+ smp_store_release(&next->done, 1);
+ wake_up_process(next->task);
+done:
put_online_cpus();
}
EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,17 +483,7 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
- atomic_long_t expedited_tryfail; /* # acquisition failures. */
- atomic_long_t expedited_workdone1; /* # done by others #1. */
- atomic_long_t expedited_workdone2; /* # done by others #2. */
- atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
+ void *expedited_queue;
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-24 17:58 ` Peter Zijlstra
@ 2015-06-25 3:23 ` Paul E. McKenney
2015-06-25 11:07 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-25 3:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 07:58:30PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 10:10:17AM -0700, Paul E. McKenney wrote:
> > > The thing is, once you start bailing on this condition your 'queue'
> > > drains very fast and this is around the same time sync_rcu() would've
> > > released the waiters too.
> >
> > In my experience, this sort of thing simply melts down on large systems.
> > I am reworking this with multiple locks so as to keep the large-system
> > contention down to a dull roar.
>
> So with the MCS queue we've got less global thrashing than you had with
> the start/done tickets. Only the queue head on enqueue.
Here is what I had in mind, where you don't have any global trashing
except when the ->expedited_sequence gets updated. Passes mild rcutorture
testing.
Still needs asynchronous CPU stoppage and stall warnings and trace
documentation updates. Plus fixes for whatever bugs show up.
Thanx, Paul
------------------------------------------------------------------------
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78d0a87ff354..887370b7e52a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
/*
* In order to export the rcu_state name to the tracing tools, it
@@ -3323,6 +3324,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
return 0;
}
+/* Common code for synchronize_sched_expedited() work-done checking. */
+static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp,
+ atomic_long_t *stat, unsigned long s)
+{
+ if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ put_online_cpus();
+ return true;
+ }
+ return false;
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3334,58 +3351,24 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ long s;
struct rcu_state *rsp = &rcu_sched_state;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
+ /* Take a snapshot of the sequence number. */
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3394,100 +3377,47 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
- }
-
/*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
*/
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
+ rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
return;
- }
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
+ if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
+ return;
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
+ smp_mb(); /* Ensure expedited GP seen after counter increment. */
+ WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1));
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ for_each_online_cpu(cpu) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-
-all_cpus_idle:
- free_cpumask_var(cm);
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ smp_mb(); /* Ensure expedited GP seen before counter increment. */
+ WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
+ WARN_ON_ONCE(rsp->expedited_sequence & 0x1);
+ mutex_unlock(&rnp0->exp_funnel_mutex);
+ smp_mb(); /* ensure subsequent action seen after grace period. */
put_online_cpus();
}
@@ -4043,6 +3973,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
{
static const char * const buf[] = RCU_NODE_NAME_INIT;
static const char * const fqs[] = RCU_FQS_NAME_INIT;
+ static const char * const exp[] = RCU_EXP_NAME_INIT;
static u8 fl_mask = 0x1;
int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
@@ -4101,6 +4032,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
rcu_init_one_nocb(rnp);
+ mutex_init(&rnp->exp_funnel_mutex);
+ lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+ &rcu_exp_class[i], exp[i]);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de22d6d06bf9..f0f4dd96dd73 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -68,6 +68,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
# define RCU_NODE_NAME_INIT { "rcu_node_0" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
#elif NR_CPUS <= RCU_FANOUT_2
# define RCU_NUM_LVLS 2
# define NUM_RCU_LVL_0 1
@@ -76,6 +77,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
#elif NR_CPUS <= RCU_FANOUT_3
# define RCU_NUM_LVLS 3
# define NUM_RCU_LVL_0 1
@@ -85,6 +87,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
#elif NR_CPUS <= RCU_FANOUT_4
# define RCU_NUM_LVLS 4
# define NUM_RCU_LVL_0 1
@@ -95,6 +98,7 @@
# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
#else
# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -237,6 +241,8 @@ struct rcu_node {
int need_future_gp[2];
/* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+ struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -478,17 +484,11 @@ struct rcu_state {
/* _rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
- atomic_long_t expedited_start; /* Starting ticket. */
- atomic_long_t expedited_done; /* Done ticket. */
- atomic_long_t expedited_wrap; /* # near-wrap incidents. */
+ unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_tryfail; /* # acquisition failures. */
atomic_long_t expedited_workdone1; /* # done by others #1. */
atomic_long_t expedited_workdone2; /* # done by others #2. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
- atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */
- atomic_long_t expedited_done_tries; /* # tries to update _done. */
- atomic_long_t expedited_done_lost; /* # times beaten to _done. */
- atomic_long_t expedited_done_exit; /* # times exited _done loop. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..d2aab8dcd58e 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,18 +185,13 @@ static int show_rcuexp(struct seq_file *m, void *v)
{
struct rcu_state *rsp = (struct rcu_state *)m->private;
- seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
- atomic_long_read(&rsp->expedited_start),
- atomic_long_read(&rsp->expedited_done),
- atomic_long_read(&rsp->expedited_wrap),
+ seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
+ rsp->expedited_sequence,
atomic_long_read(&rsp->expedited_tryfail),
atomic_long_read(&rsp->expedited_workdone1),
atomic_long_read(&rsp->expedited_workdone2),
atomic_long_read(&rsp->expedited_normal),
- atomic_long_read(&rsp->expedited_stoppedcpus),
- atomic_long_read(&rsp->expedited_done_tries),
- atomic_long_read(&rsp->expedited_done_lost),
- atomic_long_read(&rsp->expedited_done_exit));
+ rsp->expedited_sequence / 2);
return 0;
}
^ permalink raw reply related [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-25 3:23 ` Paul E. McKenney
@ 2015-06-25 11:07 ` Peter Zijlstra
2015-06-25 13:47 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 11:07 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jun 24, 2015 at 08:23:17PM -0700, Paul E. McKenney wrote:
> Here is what I had in mind, where you don't have any global trashing
> except when the ->expedited_sequence gets updated. Passes mild rcutorture
> testing.
> /*
> + * Each pass through the following loop works its way
> + * up the rcu_node tree, returning if others have done the
> + * work or otherwise falls through holding the root rnp's
> + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
> + * can be inexact, as it is just promoting locality and is not
> + * strictly needed for correctness.
> */
> + rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
> + for (; rnp0 != NULL; rnp0 = rnp0->parent) {
> + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
> return;
> + mutex_lock(&rnp0->exp_funnel_mutex);
> + if (rnp1)
> + mutex_unlock(&rnp1->exp_funnel_mutex);
> + rnp1 = rnp0;
> + }
> + rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
> + if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
> + return;
I'm still somewhat confused by the whole strict order sequence vs this
non ordered 'polling' of global state.
This funnel thing basically waits random times depending on the
contention of these mutexes and tries again. Ultimately serializing on
the root funnel thing.
So on the one hand you have to strictly order these expedited callers,
but then you don't want to actually process them in order. If 'by magic'
you manage to process the 3rd in queue, you can drop the 2nd because it
will have waited long enough. OTOH the 2nd will have waited too long.
You also do not take the actual RCU state machine into account -- this
is a parallel state.
Can't we integrate the force quiescent state machinery with the
expedited machinery -- that is instead of building a parallel state, use
the expedited thing to push the regular machine forward?
We can use the stop_machine calls to force the local RCU state forward,
after all, we _know_ we just made a context switch into the stopper
thread. All we need to do is disable interrupts to hold off the tick
(which normally drives the state machine) and just unconditionally
advance our state.
If we use the regular GP machinery, you also don't have to strongly
order the callers, just stick them on whatever GP was active when they
came in and let them roll, this allows much better (and more natural)
concurrent processing.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-25 11:07 ` Peter Zijlstra
@ 2015-06-25 13:47 ` Paul E. McKenney
2015-06-25 14:20 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-25 13:47 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Thu, Jun 25, 2015 at 01:07:34PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 08:23:17PM -0700, Paul E. McKenney wrote:
> > Here is what I had in mind, where you don't have any global trashing
> > except when the ->expedited_sequence gets updated. Passes mild rcutorture
> > testing.
>
> > /*
> > + * Each pass through the following loop works its way
> > + * up the rcu_node tree, returning if others have done the
> > + * work or otherwise falls through holding the root rnp's
> > + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
> > + * can be inexact, as it is just promoting locality and is not
> > + * strictly needed for correctness.
> > */
> > + rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
> > + for (; rnp0 != NULL; rnp0 = rnp0->parent) {
> > + if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
> > return;
> > + mutex_lock(&rnp0->exp_funnel_mutex);
> > + if (rnp1)
> > + mutex_unlock(&rnp1->exp_funnel_mutex);
> > + rnp1 = rnp0;
> > + }
> > + rnp0 = rnp1; /* rcu_get_root(rsp), AKA root rcu_node structure. */
> > + if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
> > + return;
>
> I'm still somewhat confused by the whole strict order sequence vs this
> non ordered 'polling' of global state.
>
> This funnel thing basically waits random times depending on the
> contention of these mutexes and tries again. Ultimately serializing on
> the root funnel thing.
Not random at all!
The whole funnel is controlled by the root ->exp_funnel_mutex holder,
who is going to hold the lock for a single expedited grace period, then
release it. This means that any time a task acquires a lock, there is
very likely to have been a recent state change. Hence the checks after
each lock acquisition.
So in the heavy-use case, what tends to happen is that there are one
or two expedited grace periods, and then the entire queue of waiters
acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
the expedited grace period whose completion resulted in their acquisition
completing and thus them being awakened. No fuss, no muss, no unnecessary
contention or cache thrashing.
> So on the one hand you have to strictly order these expedited caller,
> but then you don't want to actually process them in order. If 'by magic'
> you manage to process the 3rd in queue, you can drop the 2nd because it
> will have waited long enough. OTOH the 2nd will have waited too long.
Let's take the example of a 4096-CPU system with default configuration of
CONFIG_RCU_FANOUT=64 and CONFIG_RCU_FANOUT_LEAF=16. There will then be
256 leaf rcu_node structures, each of which is subordinate to one of four
internal rcu_node structures, each of which is subordinate to the root
rcu_node structure. There can then be up to 260 tasks waiting on non-root
rcu_node ->exp_funnel_mutex, with an additional task holding the root
rcu_node ->exp_funnel_mutex and carrying out an expedited grace period.
Once that grace period completes, one of the tasks holding an internal
->exp_funnel_mutex acquires the root ->exp_funnel_mutex. If it can use
the just-completed grace period, it releases its ->exp_funnel_mutex,
and the cycle repeats, until the queue drains. If not, then it will
carry out another grace period, perhaps making some of the queue wait
unnecessarily -- but that can happen in the strictly queued case as well,
due to delays between snapshotting the counter and getting on the queue.
The key advantage of the funnel approach is that many tasks can be
concurrently discovering that the grace period they need has already
happened.
Of course, if there are more than 260 tasks queued, the excess tasks will
queue on the leaf ->exp_funnel_mutex mutexes. But they will eventually
start draining 256 at a time, in parallel.
And nothing comes for free. In an idle system, the single task wanting
an expedited grace period must work its way up the rcu_node tree. In
the 4096-CPU case with default configuration, it must acquire three
uncontended mutexes. But this is way down in the noise compared to
the 4095 cache misses required to determine that all the rest of the
CPUs are idle. So the funnel approach is a good tradeoff.
> You also do not take the actual RCU state machine into account -- this
> is a parallel state.
>
> Can't we integrate the force quiescent state machinery with the
> expedited machinery -- that is instead of building a parallel state, use
> the expedited thing to push the regular machine forward?
>
> We can use the stop_machine calls to force the local RCU state forward,
> after all, we _know_ we just made a context switch into the stopper
> thread. All we need to do is disable interrupts to hold off the tick
> (which normally drives the state machine) and just unconditionally
> advance our state.
>
> If we use the regular GP machinery, you also don't have to strongly
> order the callers, just stick them on whatever GP was active when they
> came in and let them roll, this allows much better (and more natural)
> concurrent processing.
That gets quite complex, actually. Lots of races with the normal grace
periods doing one thing or another.
However, it should be quite easy to go the other way and make the normal
grace-period processing take advantage of expedited grace periods that
happened to occur at the right time. I will look into this, thank you
for the nudge!
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-25 13:47 ` Paul E. McKenney
@ 2015-06-25 14:20 ` Peter Zijlstra
2015-06-25 14:51 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 14:20 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Thu, Jun 25, 2015 at 06:47:55AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 25, 2015 at 01:07:34PM +0200, Peter Zijlstra wrote:
> > I'm still somewhat confused by the whole strict order sequence vs this
> > non ordered 'polling' of global state.
> >
> > This funnel thing basically waits random times depending on the
> > contention of these mutexes and tries again. Ultimately serializing on
> > the root funnel thing.
>
> Not random at all!
No, they are random per definition; it depends on the amount of
contention, and since that's random, the rest is too.
> The whole funnel is controlled by the root ->exp_funnel_mutex holder,
> who is going to hold the lock for a single expedited grace period, then
> release it. This means that any time a task acquires a lock, there is
> very likely to have been a recent state change. Hence the checks after
> each lock acquisition.
>
> So in the heavy-use case, what tends to happen is that there are one
> or two expedited grace periods, and then the entire queue of waiters
> acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
> the expedited grace period whose completion resulted in their acquisition
> completing and thus them being awakened. No fuss, no muss, no unnecessary
> contention or cache thrashing.
Plenty of cache thrashing, since your 'tree' is not at all cache aligned
or even remotely coherent with the actual machine topology -- I'll keep
reminding you :-)
But I must admit that the workings of the sequence thing eluded me this
morning. Yes that's much better than the strict ticket order of before.
> > You also do not take the actual RCU state machine into account -- this
> > is a parallel state.
> >
> > Can't we integrate the force quiescent state machinery with the
> > expedited machinery -- that is instead of building a parallel state, use
> > the expedited thing to push the regular machine forward?
> >
> > We can use the stop_machine calls to force the local RCU state forward,
> > after all, we _know_ we just made a context switch into the stopper
> > thread. All we need to do is disable interrupts to hold off the tick
> > (which normally drives the state machine) and just unconditionally
> > advance our state.
> >
> > If we use the regular GP machinery, you also don't have to strongly
> > order the callers, just stick them on whatever GP was active when they
> > came in and let them roll, this allows much better (and more natural)
> > concurrent processing.
>
> That gets quite complex, actually. Lots of races with the normal grace
> periods doing one thing or another.
How so? I'm probably missing several years of RCU trickery and detail
again, but since we can advance from the tick, we should be able to
advance from the stop work with IRQs disabled with equal ease.
And since the stop work and the tick are fully serialized, there cannot
be any races there.
And the stop work against other CPUs is the exact same races you already
had with tick vs tick.
So please humour me and explain how all this is far more complicated ;-)
> However, it should be quite easy to go the other way and make the normal
> grace-period processing take advantage of expedited grace periods that
> happened to occur at the right time. I will look into this, thank you
> for the nudge!
That should already be happening, right? Since we force context
switches, the tick driven RCU state machine will observe those and make
progress -- assuming it was trying to make progress at all of course.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-25 14:20 ` Peter Zijlstra
@ 2015-06-25 14:51 ` Paul E. McKenney
2015-06-26 12:32 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-25 14:51 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Thu, Jun 25, 2015 at 04:20:11PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 25, 2015 at 06:47:55AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 25, 2015 at 01:07:34PM +0200, Peter Zijlstra wrote:
> > > I'm still somewhat confused by the whole strict order sequence vs this
> > > non ordered 'polling' of global state.
> > >
> > > This funnel thing basically waits random times depending on the
> > > contention of these mutexes and tries again. Ultimately serializing on
> > > the root funnel thing.
> >
> > Not random at all!
>
> No, they are random per, definition it depends on the amount of
> contention and since that's random, the rest it too.
Not sure how to parse this one. ;-)
> > The whole funnel is controlled by the root ->exp_funnel_mutex holder,
> > who is going to hold the lock for a single expedited grace period, then
> > release it. This means that any time a task acquires a lock, there is
> > very likely to have been a recent state change. Hence the checks after
> > each lock acquisition.
> >
> > So in the heavy-use case, what tends to happen is that there are one
> > or two expedited grace periods, and then the entire queue of waiters
> > acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
> > the expedited grace period whose completion resulted in their acquisition
> > completing and thus them being awakened. No fuss, no muss, no unnecessary
> > contention or cache thrashing.
>
> Plenty of cache trashing, since your 'tree' is not at all cache aligned
> or even remotely coherent with the actual machine topology -- I'll keep
> reminding you :-)
And, as I keep reminding you, if you actually show me system-level data
demonstrating that this is a real problem, I might consider taking some
action. And also reminding you that in the meantime, you can experiment
by setting the fanout sizes to match a given system and see if it makes
any visible difference. (Yes, I do understand the odd numbering of
hyperthreads, but you can still run a reasonable experiment.)
> But I must admit that the workings of the sequence thing elided me this
> morning. Yes that's much better than the strict ticket order of before.
OK, good!
> > > You also do not take the actual RCU state machine into account -- this
> > > is a parallel state.
> > >
> > > Can't we integrate the force quiescent state machinery with the
> > > expedited machinery -- that is instead of building a parallel state, use
> > > the expedited thing to push the regular machine forward?
> > >
> > > We can use the stop_machine calls to force the local RCU state forward,
> > > after all, we _know_ we just made a context switch into the stopper
> > > thread. All we need to do is disable interrupts to hold off the tick
> > > (which normally drives the state machine) and just unconditionally
> > > advance our state.
> > >
> > > If we use the regular GP machinery, you also don't have to strongly
> > > order the callers, just stick them on whatever GP was active when they
> > > came in and let them roll, this allows much better (and more natural)
> > > concurrent processing.
> >
> > That gets quite complex, actually. Lots of races with the normal grace
> > periods doing one thing or another.
>
> How so? I'm probably missing several years of RCU trickery and detail
> again, but since we can advance from the tick, we should be able to
> advance from the stop work with IRQs disabled with equal ease.
>
> And since the stop work and the tick are fully serialized, there cannot
> be any races there.
>
> And the stop work against other CPUs is the exact same races you already
> had with tick vs tick.
>
> So please humour me and explain how all this is far more complicated ;-)
Yeah, I do need to get RCU design/implementation documentation put together.
In the meantime, RCU's normal grace-period machinery is designed to be
quite loosely coupled. The idea is that almost all actions occur locally,
reducing contention and cache thrashing. But an expedited grace period
needs tight coupling in order to be able to complete quickly. Making
something that switches between loose and tight coupling in short order
is not at all simple.
> > However, it should be quite easy to go the other way and make the normal
> > grace-period processing take advantage of expedited grace periods that
> > happened to occur at the right time. I will look into this, thank you
> > for the nudge!
>
> That should already be happening, right? Since we force context
> switches, the tick driven RCU state machine will observe those and make
> progress -- assuming it was trying to make progress at all of course.
It is to an extent, but I believe that I can do better. On the other hand,
it is quite possible that this is a 6AM delusion on my part. ;-)
If it is not a delusion, the eventual solution will likely be a much more
satisfying answer to your "why not merge into the normal RCU grace period
machinery" question. But I need to complete reworking the expedited
machinery first!
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-23 7:28 ` Nicholas Mc Guire
@ 2015-06-25 19:08 ` Peter Zijlstra
2015-06-25 19:17 ` Tejun Heo
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 19:08 UTC (permalink / raw)
To: Nicholas Mc Guire
Cc: oleg, paulmck, tj, mingo, linux-kernel, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 09:28:11AM +0200, Nicholas Mc Guire wrote:
>
> A bit off-topic probably
> but maybe this should not be in kernel/locking/percpu-rwsem.c but in a
> generic percpu location as this construct is present in the core a few times
> atleast in:
> kernel/irq/irqdesc.c:kstat_irqs
> kernel/fork.c:nr_processes
That has an odd unsigned long vs int fail, but yes.
> mm/memcontrol.c:mem_cgroup_read_events
> mm/memcontrol.c:mem_cgroup_read_stat
Those seem to be hotplug challenged. I'm thinking dropping that
nocpu_base.count[] crap and just iterating all possible CPUs would've
been much easier.
> > +#define per_cpu_sum(var) \
> > +({ \
> > + typeof(var) __sum = 0; \
> > + int cpu; \
> > + for_each_possible_cpu(cpu) \
> > + __sum += per_cpu(var, cpu); \
> > + __sum; \
> > +})
> > +
>
> so maybe put it into include/linux/percpu.h ?
Yes I can do that.
We can try and use it more after that, there seems to be loads of places
that could use this fs/namespace.c fs/inode.c etc..
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-25 19:08 ` Peter Zijlstra
@ 2015-06-25 19:17 ` Tejun Heo
2015-06-29 9:32 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Tejun Heo @ 2015-06-25 19:17 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
riel, viro, torvalds
Hello,
On Thu, Jun 25, 2015 at 09:08:00PM +0200, Peter Zijlstra wrote:
> > mm/memcontrol.c:mem_cgroup_read_events
> > mm/memcontrol.c:mem_cgroup_read_stat
>
> Those seem to be hotplug challenged. I'm thinking dropping that
> nocpu_base.count[] crap and just iterating all possible CPUs would've
> been much easier.
A patch doing that is already queued for this merge window. IIRC,
it's included as part of cgroup writeback updates.
> > > +#define per_cpu_sum(var) \
> > > +({ \
> > > + typeof(var) __sum = 0; \
> > > + int cpu; \
> > > + for_each_possible_cpu(cpu) \
> > > + __sum += per_cpu(var, cpu); \
> > > + __sum; \
> > > +})
> > > +
> >
> > so maybe put it into include/linux/percpu.h ?
percpu-defs.h would be the better place for it.
> Yes I can do that.
>
> We can try and use it more after that, there seems to be loads of places
> that could use this fs/namespace.c fs/inode.c etc..
Hmmm... the only worry I have about this is people using it on u64 on
32bit machines. CPU local ops can do split updates on lower and upper
halves and the remotely-read value will be surprising. We have the
same issues w/ regular per_cpu accesses too but the summing function /
macro is better at giving the false sense of security. Prolly
limiting it up to ulong size is a good idea?
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-23 17:24 ` Oleg Nesterov
@ 2015-06-25 19:18 ` Peter Zijlstra
0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 19:18 UTC (permalink / raw)
To: Oleg Nesterov
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On Tue, Jun 23, 2015 at 07:24:16PM +0200, Oleg Nesterov wrote:
> IOW. Suppose we add ->work_mutex into struct cpu_stopper. Btw,
> I think we should move all per-cpu variables there...
>
> Now,
>
> lock_stop_cpus_works(cpumask)
> {
> for_each_cpu(cpu, cpumask)
> mutex_lock(per_cpu(cpu_stopper_task, cpu).work_mutex);
> }
>
> unlock_stop_cpus_works(cpumask)
> {
> for_each_cpu(cpu, cpumask)
> mutex_lock(...);
> }
>
> which should be used instead of stop_cpus_mutex. After this change
> stop_two_cpus() can just use stop_cpus().
Right, lockdep annotating that will be 'interesting' though. And
stop_two_cpus() then has the problem of allocating a cpumask. Simpler to
let it keep 'abuse' the queueing spinlock in there.
> Off-topic. Can't we make __stop_machine() static? The only caller,
> _cpu_down() can safely call stop_machine(), get_online_cpus() is
> fine under cpu_hotplug_begin().
Can do I think.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-25 14:51 ` Paul E. McKenney
@ 2015-06-26 12:32 ` Peter Zijlstra
2015-06-26 16:14 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-26 12:32 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Thu, Jun 25, 2015 at 07:51:46AM -0700, Paul E. McKenney wrote:
> > So please humour me and explain how all this is far more complicated ;-)
>
> Yeah, I do need to get RCU design/implementation documentation put together.
>
> In the meantime, RCU's normal grace-period machinery is designed to be
> quite loosely coupled. The idea is that almost all actions occur locally,
> reducing contention and cache thrashing. But an expedited grace period
> needs tight coupling in order to be able to complete quickly. Making
> something that switches between loose and tight coupling in short order
> is not at all simple.
But expedited just means faster, we never promised that
sync_rcu_expedited is the absolute fastest primitive ever.
So I really should go read the RCU code I suppose, but I don't get
what's wrong with starting a forced quiescent state, then doing the
stop_work spray, where each work will run the regular RCU tick thing to
push it forwards.
From my feeble memories, what I remember is that the last cpu to
complete a GP on a leaf node will push the completion up to the next
level, until at last we've reached the root of your tree and we can
complete the GP globally.
To me it just makes more sense to have a single RCU state machine. With
expedited we'll push it as fast as we can, but no faster.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-26 12:32 ` Peter Zijlstra
@ 2015-06-26 16:14 ` Paul E. McKenney
2015-06-29 7:56 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-26 16:14 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Fri, Jun 26, 2015 at 02:32:07PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 25, 2015 at 07:51:46AM -0700, Paul E. McKenney wrote:
> > > So please humour me and explain how all this is far more complicated ;-)
> >
> > Yeah, I do need to get RCU design/implementation documentation put together.
> >
> > In the meantime, RCU's normal grace-period machinery is designed to be
> > quite loosely coupled. The idea is that almost all actions occur locally,
> > reducing contention and cache thrashing. But an expedited grace period
> > needs tight coupling in order to be able to complete quickly. Making
> > something that switches between loose and tight coupling in short order
> > is not at all simple.
>
> But expedited just means faster, we never promised that
> sync_rcu_expedited is the absolute fastest primitive ever.
Which is good, because given that it is doing something to each and
every CPU, it most assuredly won't in any way resemble the absolute
fastest primitive ever. ;-)
> So I really should go read the RCU code I suppose, but I don't get
> what's wrong with starting a forced quiescent state, then doing the
> stop_work spray, where each work will run the regular RCU tick thing to
> push it forwards.
>
> From my feeble memories, what I remember is that the last cpu to
> complete a GP on a leaf node will push the completion up to the next
> level, until at last we've reached the root of your tree and we can
> complete the GP globally.
That is true, the task that notices the last required quiescent state
will push up the tree and notice that the grace period has ended.
If that task is not the grace-period kthread, it will then awaken
the grace-period kthread.
> To me it just makes more sense to have a single RCU state machine. With
> expedited we'll push it as fast as we can, but no faster.
Suppose that someone invokes synchronize_sched_expedited(), but there
is no normal grace period in flight. Then each CPU will note its own
quiescent state, but when it later might have tried to push it up the
tree, it will see that there is no grace period in effect, and will
therefore not bother.
OK, we could have synchronize_sched_expedited() tell the grace-period
kthread to start a grace period if one was not already in progress.
But that still isn't good enough, because the grace-period kthread will
take some time to initialize the new grace period, and if we hammer all
the CPUs before the initialization is complete, the resulting quiescent
states cannot be counted against the new grace period. (The reason for
this is that there is some delay between the actual quiescent state
and the time that it is reported, so we have to be very careful not
to incorrectly report a quiescent state from an earlier grace period
against the current grace period.)
OK, the grace-period kthread could tell synchronize_sched_expedited()
when it has finished initializing the grace period, though this is
starting to get a bit on the Rube Goldberg side. But this -still- is
not good enough, because even though the grace-period kthread has fully
initialized the new grace period, the individual CPUs are unaware of it.
And they will therefore continue to ignore any quiescent state that they
encounter, because they cannot prove that it actually happened after
the start of the current grace period.
OK, we could have some sort of indication when all CPUs become aware
of the new grace period by having them atomically manipulate a global
counter. Presumably we have some flag indicating when this is and is
not needed so that we avoid the killer memory contention in the common
case where it is not needed. But this -still- isn't good enough, because
idle CPUs never will become aware of the new grace period -- by design,
as they are supposed to be able to sleep through an arbitrary number of
grace periods.
OK, so we could have some sort of indication when all non-idle CPUs
become aware of the new grace period. But there could be races where
an idle CPU suddenly becomes non-idle just after it was reported that
the all non-idle CPUs were aware of the grace period. This would result
in a hang, because this newly non-idle CPU might not have noticed
the new grace period at the time that synchronize_sched_expedited()
hammers it, which would mean that this newly non-idle CPU would refuse
to report the resulting quiescent state.
OK, so the grace-period kthread could track and report the set of CPUs
that had ever been idle since synchronize_sched_expedited() contacted it.
But holy overhead Batman!!!
And that is just one of the possible interactions with the grace-period
kthread. It might be in the middle of setting up a new grace period.
It might be in the middle of cleaning up after the last grace period.
It might be waiting for a grace period to complete, and the last quiescent
state was just reported, but hasn't propagated all the way up yet. All
of these would need to be handled correctly, and a number of them would
be as messy as the above scenario. Some might be even more messy.
I feel like there is a much easier way, but cannot yet articulate it.
I came across a couple of complications and a blind alley with it thus
far, but it still looks promising. I expect to be able to generate
actual code for it within a few days, but right now it is just weird
abstract shapes in my head. (Sorry, if I knew how to describe them,
I could just write the code! When I do write the code, it will probably
seem obvious and trivial, that being the usual outcome...)
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode
2015-06-24 13:50 ` Oleg Nesterov
2015-06-24 14:13 ` Peter Zijlstra
@ 2015-06-28 23:56 ` Oleg Nesterov
2015-06-28 23:56 ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
` (2 more replies)
1 sibling, 3 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds
On 06/24, Oleg Nesterov wrote:
>
> So we need percpu_down_write_dont_block_readers(). I already thought
> about this before, I'll try to make the patch tomorrow on top of your
> changes.
Never say tomorrow...
> This means that we do not need task_struct->cpuhp_ref, but we can't
> avoid livelock we currently have: cpu_hotplug_begin() can never succeed
> if the new readers come fast enough.
Like with any other "recursive" lock.
Peter, I know you don't like the 1st patch. And yes, we could add another
mutex into percpu_rw_semaphore instead. But I think it would be better
to rely on rcu_sync_enter(). As for completion, we can remove it later.
Nevermind, the actual change is 3/3 and it looks simple.
Oleg.
^ permalink raw reply [flat|nested] 106+ messages in thread
* [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode
2015-06-28 23:56 ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
@ 2015-06-28 23:56 ` Oleg Nesterov
2015-06-28 23:56 ` [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers Oleg Nesterov
2015-06-28 23:56 ` [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, der.herr, dave, riel, viro, torvalds, linux-kernel
Add rcu_sync_struct->exclusive boolean set by rcu_sync_init(), it
obviously controls the exclusiveness of rcu_sync_enter(). This is
what percpu_down_write() actually wants.
We turn ->gp_wait into "struct completion gp_comp", it is used as
a resource counter in "exclusive" mode. Otherwise we only use its
completion->wait member for wait_event/wake_up_all. We never mix
the completion/wait_queue_head_t operations.
TODO: we can cleanup this logic and avoid "struct completion", but
this needs a bit more changes.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
include/linux/percpu-rwsem.h | 2 +-
include/linux/rcusync.h | 29 ++++++++++++++++-------------
kernel/locking/percpu-rwsem.c | 2 +-
kernel/rcu/sync.c | 25 ++++++++++++++++++++-----
4 files changed, 38 insertions(+), 20 deletions(-)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index e12ce86..9202e73 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -21,7 +21,7 @@ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name); \
static struct percpu_rw_semaphore name = { \
.refcount = &__percpu_rwsem_refcount_##name, \
.state = 0, \
- .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC), \
+ .rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC, 1), \
.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
}
diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
index 0135838..aaea86a 100644
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -1,7 +1,7 @@
#ifndef _LINUX_RCUSYNC_H_
#define _LINUX_RCUSYNC_H_
-#include <linux/wait.h>
+#include <linux/completion.h>
#include <linux/rcupdate.h>
enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
@@ -9,11 +9,12 @@ enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
struct rcu_sync_struct {
int gp_state;
int gp_count;
- wait_queue_head_t gp_wait;
+ struct completion gp_comp;
int cb_state;
struct rcu_head cb_head;
+ bool exclusive;
enum rcu_sync_type gp_type;
};
@@ -28,30 +29,32 @@ static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
#endif
}
-extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_init(struct rcu_sync_struct *,
+ enum rcu_sync_type, bool excl);
extern void rcu_sync_enter(struct rcu_sync_struct *);
extern void rcu_sync_exit(struct rcu_sync_struct *);
extern void rcu_sync_dtor(struct rcu_sync_struct *);
-#define __RCU_SYNC_INITIALIZER(name, type) { \
+#define __RCU_SYNC_INITIALIZER(name, type, excl) { \
.gp_state = 0, \
.gp_count = 0, \
- .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \
+ .gp_comp = COMPLETION_INITIALIZER(name.gp_comp), \
.cb_state = 0, \
+ .exclusive = excl, \
.gp_type = type, \
}
-#define __DEFINE_RCU_SYNC(name, type) \
- struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+#define __DEFINE_RCU_SYNC(name, type, excl) \
+ struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type, excl)
-#define DEFINE_RCU_SYNC(name) \
- __DEFINE_RCU_SYNC(name, RCU_SYNC)
+#define DEFINE_RCU_SYNC(name, excl) \
+ __DEFINE_RCU_SYNC(name, RCU_SYNC, excl)
-#define DEFINE_RCU_SCHED_SYNC(name) \
- __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+#define DEFINE_RCU_SCHED_SYNC(name, excl) \
+ __DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC, excl)
-#define DEFINE_RCU_BH_SYNC(name) \
- __DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+#define DEFINE_RCU_BH_SYNC(name, excl) \
+ __DEFINE_RCU_SYNC(name, RCU_BH_SYNC, excl)
#endif /* _LINUX_RCUSYNC_H_ */
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 915646c..014d2f4 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
return -ENOMEM;
sem->state = readers_slow;
- rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC, true);
init_waitqueue_head(&sem->writer);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 8835ad1..03ddc61 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -38,7 +38,8 @@ static const struct {
enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
-#define rss_lock gp_wait.lock
+#define rss_lock gp_comp.wait.lock
+#define gp_wait gp_comp.wait
#ifdef CONFIG_PROVE_RCU
bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
@@ -49,10 +50,12 @@ bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
#endif
-void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+void rcu_sync_init(struct rcu_sync_struct *rss,
+ enum rcu_sync_type type, bool excl)
{
memset(rss, 0, sizeof(*rss));
- init_waitqueue_head(&rss->gp_wait);
+ init_completion(&rss->gp_comp);
+ rss->exclusive = excl;
rss->gp_type = type;
}
@@ -72,9 +75,13 @@ void rcu_sync_enter(struct rcu_sync_struct *rss)
if (need_sync) {
gp_ops[rss->gp_type].sync();
rss->gp_state = GP_PASSED;
- wake_up_all(&rss->gp_wait);
+ if (!rss->exclusive)
+ wake_up_all(&rss->gp_wait);
} else if (need_wait) {
- wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ if (!rss->exclusive)
+ wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+ else
+ wait_for_completion(&rss->gp_comp);
} else {
/*
* Possible when there's a pending CB from a rcu_sync_exit().
@@ -119,6 +126,12 @@ static void rcu_sync_func(struct rcu_head *rcu)
spin_unlock_irqrestore(&rss->rss_lock, flags);
}
+static inline void __complete_locked(struct completion *x)
+{
+ x->done++;
+ __wake_up_locked(&x->wait, TASK_NORMAL, 1);
+}
+
void rcu_sync_exit(struct rcu_sync_struct *rss)
{
spin_lock_irq(&rss->rss_lock);
@@ -129,6 +142,8 @@ void rcu_sync_exit(struct rcu_sync_struct *rss)
} else if (rss->cb_state == CB_PENDING) {
rss->cb_state = CB_REPLAY;
}
+ } else if (rss->exclusive) {
+ __complete_locked(&rss->gp_comp);
}
spin_unlock_irq(&rss->rss_lock);
}
--
1.5.5.1
^ permalink raw reply related [flat|nested] 106+ messages in thread
* [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers
2015-06-28 23:56 ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2015-06-28 23:56 ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
@ 2015-06-28 23:56 ` Oleg Nesterov
2015-06-28 23:56 ` [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, der.herr, dave, riel, viro, torvalds, linux-kernel
percpu_down_write() does down_write() to exclude both the readers and
other writers. We can rely on rcu_sync_enter() in exclusive mode and
take ->rw_sem right before wait_event().
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
kernel/locking/percpu-rwsem.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 014d2f4..609c13b 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -139,8 +139,6 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
- down_write(&sem->rw_sem);
-
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -158,6 +156,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* therefore will wait for them.
*/
+ down_write(&sem->rw_sem);
/* Wait for all now active readers to complete. */
wait_event(sem->writer, readers_active_check(sem));
}
--
1.5.5.1
^ permalink raw reply related [flat|nested] 106+ messages in thread
* [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode
2015-06-28 23:56 ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2015-06-28 23:56 ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
2015-06-28 23:56 ` [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers Oleg Nesterov
@ 2015-06-28 23:56 ` Oleg Nesterov
2 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: paulmck, tj, mingo, der.herr, dave, riel, viro, torvalds, linux-kernel
Add percpu_rw_semaphore->recursive boolean. If it is true then the
recursive percpu_down_read() is safe, percpu_down_write() doesn't
exclude the new readers, like cpu_hotplug_begin().
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
include/linux/percpu-rwsem.h | 15 ++++++++++-----
kernel/events/uprobes.c | 2 +-
kernel/locking/percpu-rwsem.c | 15 +++++++++++----
3 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 9202e73..9441abd 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,16 +13,18 @@ struct percpu_rw_semaphore {
int state;
struct rcu_sync_struct rss;
wait_queue_head_t writer;
+ bool recursive;
struct rw_semaphore rw_sem;
};
-#define DEFINE_STATIC_PERCPU_RWSEM(name) \
+#define DEFINE_STATIC_PERCPU_RWSEM(name, rec) \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name); \
static struct percpu_rw_semaphore name = { \
.refcount = &__percpu_rwsem_refcount_##name, \
.state = 0, \
.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC, 1), \
.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer), \
+ .recursive = rec, \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
}
@@ -37,7 +39,10 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
might_sleep();
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ if (sem->recursive)
+ rwlock_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ else
+ rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
preempt_disable();
/*
@@ -97,14 +102,14 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);
-extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
+extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, bool,
const char *, struct lock_class_key *);
extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
-#define percpu_init_rwsem(sem) \
+#define percpu_init_rwsem(sem, recursive) \
({ \
static struct lock_class_key rwsem_key; \
- __percpu_init_rwsem(sem, #sem, &rwsem_key); \
+ __percpu_init_rwsem(sem, recursive, #sem, &rwsem_key); \
})
#endif
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f2..a4813a1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1985,7 +1985,7 @@ static int __init init_uprobes(void)
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- if (percpu_init_rwsem(&dup_mmap_sem))
+ if (percpu_init_rwsem(&dup_mmap_sem, false))
return -ENOMEM;
return register_die_notifier(&uprobe_exception_nb);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 609c13b..3db7c45 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -10,7 +10,7 @@
enum { readers_slow, readers_block };
-int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, bool recursive,
const char *name, struct lock_class_key *rwsem_key)
{
sem->refcount = alloc_percpu(unsigned int);
@@ -20,6 +20,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
sem->state = readers_slow;
rcu_sync_init(&sem->rss, RCU_SCHED_SYNC, true);
init_waitqueue_head(&sem->writer);
+ sem->recursive = recursive;
__init_rwsem(&sem->rw_sem, name, rwsem_key);
return 0;
@@ -124,9 +125,15 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
- if (per_cpu_sum(*sem->refcount) != 0)
+ if (sem->recursive && !down_write_trylock(&sem->rw_sem))
return false;
+ if (per_cpu_sum(*sem->refcount) != 0) {
+ if (sem->recursive)
+ up_write(&sem->rw_sem);
+ return false;
+ }
+
/*
* If we observed the decrement; ensure we see the entire critical
* section.
@@ -155,8 +162,8 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
* then we are guaranteed to see their sem->refcount increment, and
* therefore will wait for them.
*/
-
- down_write(&sem->rw_sem);
+ if (!sem->recursive)
+ down_write(&sem->rw_sem);
/* Wait for all now active readers to complete. */
wait_event(sem->writer, readers_active_check(sem));
}
--
1.5.5.1
^ permalink raw reply related [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-26 16:14 ` Paul E. McKenney
@ 2015-06-29 7:56 ` Peter Zijlstra
2015-06-30 21:32 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-29 7:56 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Fri, Jun 26, 2015 at 09:14:28AM -0700, Paul E. McKenney wrote:
> > To me it just makes more sense to have a single RCU state machine. With
> > expedited we'll push it as fast as we can, but no faster.
>
> Suppose that someone invokes synchronize_sched_expedited(), but there
> is no normal grace period in flight. Then each CPU will note its own
> quiescent state, but when it later might have tried to push it up the
> tree, it will see that there is no grace period in effect, and will
> therefore not bother.
Right, I did mention the force grace period machinery to make sure we
start one before poking :-)
> OK, we could have synchronize_sched_expedited() tell the grace-period
> kthread to start a grace period if one was not already in progress.
I had indeed forgotten that got farmed out to the kthread; on which, my
poor desktop seems to have spend ~140 minutes of its (most recent)
existence poking RCU things.
7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
Which is almost as much time as my konsole:
2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
Which seems somewhat excessive. But who knows.
> OK, the grace-period kthread could tell synchronize_sched_expedited()
> when it has finished initializing the grace period, though this is
> starting to get a bit on the Rube Goldberg side. But this -still- is
> not good enough, because even though the grace-period kthread has fully
> initialized the new grace period, the individual CPUs are unaware of it.
Right, so over the weekend -- I had postponed reading this rather long
email for I was knackered -- I had figured that because we trickle the
GP completion up, you probably equally trickle the GP start down of
sorts and there might be 'interesting' things there.
> And they will therefore continue to ignore any quiescent state that they
> encounter, because they cannot prove that it actually happened after
> the start of the current grace period.
Right, badness :-)
Although here I'll once again go ahead and say something ignorant; how
come that's a problem? Surely if we know the kthread thing has finished
starting a GP, any one CPU issuing a full memory barrier (as would be
implied by switching to the stop worker) must then indeed observe that
global state? due to that transitivity thing.
That is, I'm having a wee bit of bother for seeing how you'd need
manipulation of global variables as you allude to below.
> But this -still- isn't good enough, because
> idle CPUs never will become aware of the new grace period -- by design,
> as they are supposed to be able to sleep through an arbitrary number of
> grace periods.
Yes, I'm sure. Waking up seems like a serializing experience though; but
I suppose that's not good enough if we wake up right before we force
start the GP.
> I feel like there is a much easier way, but cannot yet articulate it.
> I came across a couple of complications and a blind alley with it thus
> far, but it still looks promising. I expect to be able to generate
> actual code for it within a few days, but right now it is just weird
> abstract shapes in my head. (Sorry, if I knew how to describe them,
> I could just write the code! When I do write the code, it will probably
> seem obvious and trivial, that being the usual outcome...)
Hehe, glad to have been of help :-)
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-25 19:17 ` Tejun Heo
@ 2015-06-29 9:32 ` Peter Zijlstra
2015-06-29 15:12 ` Tejun Heo
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-29 9:32 UTC (permalink / raw)
To: Tejun Heo
Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
riel, viro, torvalds
On Thu, Jun 25, 2015 at 03:17:01PM -0400, Tejun Heo wrote:
> Hmmm... the only worry I have about this is people using it on u64 on
> 32bit machines. CPU local ops can do split updates on lower and upper
> halves and the remotely-read value will be surprising. We have the
> same issues w/ regular per_cpu accesses too but the summing function /
> macro is better at giving the false sense of security. Prolly
> limiting it up to ulong size is a good idea?
Agreed, luckily we already have the infrastructure for this, something
like so?
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -287,6 +287,16 @@ do { \
preempt_enable(); \
} while (0)
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
/*
* Branching function to split up a function into a set of functions that
* are called for different scalar sizes of the objects handled.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-29 9:32 ` Peter Zijlstra
@ 2015-06-29 15:12 ` Tejun Heo
2015-06-29 15:14 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Tejun Heo @ 2015-06-29 15:12 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
riel, viro, torvalds
Hello, Peter.
On Mon, Jun 29, 2015 at 11:32:19AM +0200, Peter Zijlstra wrote:
> Agreed, luckily we already have the infrastructure for this, something
> like so?
>
> --- a/include/linux/percpu-defs.h
> +++ b/include/linux/percpu-defs.h
> @@ -287,6 +287,16 @@ do { \
> preempt_enable(); \
> } while (0)
>
> +#define per_cpu_sum(var) \
> +({ \
> + typeof(var) __sum = 0; \
> + int cpu; \
Why not __cpu?
> + compiletime_assert_atomic_type(__sum); \
> + for_each_possible_cpu(cpu) \
> + __sum += per_cpu(var, cpu); \
> + __sum; \
> +})
But other than that, looks good to me.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
2015-06-29 15:12 ` Tejun Heo
@ 2015-06-29 15:14 ` Peter Zijlstra
0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-29 15:14 UTC (permalink / raw)
To: Tejun Heo
Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
riel, viro, torvalds
On Mon, Jun 29, 2015 at 11:12:20AM -0400, Tejun Heo wrote:
> Hello, Peter.
>
> On Mon, Jun 29, 2015 at 11:32:19AM +0200, Peter Zijlstra wrote:
> > Agreed, luckily we already have the infrastructure for this, something
> > like so?
> >
> > --- a/include/linux/percpu-defs.h
> > +++ b/include/linux/percpu-defs.h
> > @@ -287,6 +287,16 @@ do { \
> > preempt_enable(); \
> > } while (0)
> >
> > +#define per_cpu_sum(var) \
> > +({ \
> > + typeof(var) __sum = 0; \
> > + int cpu; \
>
> Why not __cpu?
I've no idea, __cpu is indeed more consistent, consider it changed.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-29 7:56 ` Peter Zijlstra
@ 2015-06-30 21:32 ` Paul E. McKenney
2015-07-01 11:56 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-30 21:32 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Mon, Jun 29, 2015 at 09:56:46AM +0200, Peter Zijlstra wrote:
> On Fri, Jun 26, 2015 at 09:14:28AM -0700, Paul E. McKenney wrote:
> > > To me it just makes more sense to have a single RCU state machine. With
> > > expedited we'll push it as fast as we can, but no faster.
> >
> > Suppose that someone invokes synchronize_sched_expedited(), but there
> > is no normal grace period in flight. Then each CPU will note its own
> > quiescent state, but when it later might have tried to push it up the
> > tree, it will see that there is no grace period in effect, and will
> > therefore not bother.
>
> Right, I did mention the force grace period machinery to make sure we
> start one before poking :-)
Fair enough...
> > OK, we could have synchronize_sched_expedited() tell the grace-period
> > kthread to start a grace period if one was not already in progress.
>
> I had indeed forgotten that got farmed out to the kthread; on which, my
> poor desktop seems to have spend ~140 minutes of its (most recent)
> existence poking RCU things.
>
> 7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
> 8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
> 9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
> 10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
> 11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
> 12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
> 13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
> 14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
> 15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
>
> Which is almost as much time as my konsole:
>
> 2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
>
> Which seems somewhat excessive. But who knows.
No idea. How long has that system been up? What has it been doing?
The rcu_sched overhead is expected behavior if the system has run between
ten and one hundred million grace periods, give or take an order of
magnitude depending on the number of idle CPUs and so on.
The overhead for the RCU offload kthreads is what it is. A kfree() takes
as much time as a kfree does, and they are all nicely counted up for you.
> > OK, the grace-period kthread could tell synchronize_sched_expedited()
> > when it has finished initializing the grace period, though this is
> > starting to get a bit on the Rube Goldberg side. But this -still- is
> > not good enough, because even though the grace-period kthread has fully
> > initialized the new grace period, the individual CPUs are unaware of it.
>
> Right, so over the weekend -- I had postponed reading this rather long
> email for I was knackered -- I had figured that because we trickle the
> GP completion up, you probably equally trickle the GP start down of
> sorts and there might be 'interesting' things there.
The GP completion trickles both up and down, though the down part shouldn't
matter in this case.
> > And they will therefore continue to ignore any quiescent state that they
> > encounter, because they cannot prove that it actually happened after
> > the start of the current grace period.
>
> Right, badness :-)
>
> Although here I'll once again go ahead and say something ignorant; how
> come that's a problem? Surely if we know the kthread thing has finished
> starting a GP, any one CPU issuing a full memory barrier (as would be
> implied by switching to the stop worker) must then indeed observe that
> global state? due to that transitivity thing.
>
> That is, I'm having a wee bit of bother for seeing how you'd need
> manipulation of global variables as you elude to below.
Well, I thought that you wanted to leverage the combining tree to
determine when the grace period had completed. If a given CPU isn't
pushing its quiescent states up the combining tree, then the combining
tree can't do much for you.
> > But this -still- isn't good enough, because
> > idle CPUs never will become aware of the new grace period -- by design,
> > as they are supposed to be able to sleep through an arbitrary number of
> > grace periods.
>
> Yes, I'm sure. Waking up seems like a serializing experience though; but
> I suppose that's not good enough if we wake up right before we force
> start the GP.
That would indeed be one of the problems that could occur. ;-)
> > I feel like there is a much easier way, but cannot yet articulate it.
> > I came across a couple of complications and a blind alley with it thus
> > far, but it still looks promising. I expect to be able to generate
> > actual code for it within a few days, but right now it is just weird
> > abstract shapes in my head. (Sorry, if I knew how to describe them,
> > I could just write the code! When I do write the code, it will probably
> > seem obvious and trivial, that being the usual outcome...)
>
> Hehe, glad to have been of help :-)
Well, I do have something that seems reasonably straightforward. Sending
the patches along separately. Not sure that it is worth its weight.
The idea is that we keep the expedited grace periods working as they do
now, independently of the normal grace period. The normal grace period
takes a sequence number just after initialization, and checks to see
if an expedited grace period happened in the meantime at the beginning
of each quiescent-state forcing episode. This saves the last one or
two quiescent-state forcing scans in the case where an expedited grace
period really did happen.
It is possible for the expedited grace period to help things along by
waking up the grace-period kthread, but of course doing this too much
further increases the time consumed by your rcu_sched kthread. It is
possible to compromise by only doing the wakeup every so many grace
periods or only once per a given period of time, which is the approach
the last patch in the series takes.
I will be sending the series shortly, followed by a series for the
other portions of the expedited grace-period upgrade.
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-06-24 9:18 ` Daniel Wagner
@ 2015-07-01 5:57 ` Daniel Wagner
2015-07-01 21:54 ` Linus Torvalds
0 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-07-01 5:57 UTC (permalink / raw)
To: Ingo Molnar, Peter Zijlstra
Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds, jlayton
Hi,
I did a sweep over the parameters for posix01. The parameters are number
of processes and number of locks taken per process. In contrast to the
other test, it looks like there is no set which ends a nice stable
result (read low variance). I have tried several things including
pinning down all processes to CPUs to avoid migration. The results
improved slightly but there was still a high variance.
Anyway I have collected some data and I like to share it. Maybe it is
still useful. All numbers here are without the above mentioned pinning.
There are some runs missing (don't know the reason yet) and I didn't let
it run till the end. So add some salt to these numbers.
The test script and raw data can be found here:
http://monom.org/posix01/
The tables reads:
nproc: number of process started
columns: number of locks taken per process
Hardware
4x E5-4610, for this test all process are scheduled on one socket
First the numbers for tip 4.1.0-02756-ge3d06bd.
nproc 8
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.075449 0.210547 0.340658 0.464083 0.590400
std 0.015550 0.024989 0.032080 0.043803 0.055003
min 0.021643 0.067456 0.211779 0.279643 0.327628
25% 0.065337 0.195664 0.318114 0.430040 0.546488
50% 0.075345 0.209411 0.338512 0.461397 0.591433
75% 0.084725 0.226517 0.364190 0.494638 0.626532
max 0.127050 0.281836 0.454558 0.607559 0.762149
nproc 16
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 1.023660 2.463384 3.891954 5.312716 6.752857
std 0.105065 0.124916 0.136476 0.172906 0.207449
min 0.351199 1.527379 3.106403 4.157478 5.519601
25% 0.961098 2.397597 3.807098 5.201875 6.633034
50% 1.031460 2.467317 3.895824 5.321227 6.757502
75% 1.093412 2.539284 3.985122 5.432336 6.889859
max 1.278603 2.785901 4.369434 5.798982 7.324263
nproc 24
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 3.460166 7.942193 11.898540 11.150066 11.060036
std 0.191564 0.232989 0.612868 0.680323 0.465967
min 2.748545 6.575510 9.977165 9.209685 8.937682
25% 3.325521 7.806847 11.440580 10.774070 10.912302
50% 3.493138 7.951859 11.852556 11.163595 11.074910
75% 3.596927 8.088036 12.443429 11.365197 11.243125
max 3.974884 8.589840 13.079780 16.341043 14.244954
nproc 32
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 6.797286 13.943421 14.373278 15.857103 20.047039
std 0.366013 0.417859 0.625967 0.377463 0.302939
min 3.323312 12.266006 12.492706 14.451931 17.496059
25% 6.649401 13.719397 14.186790 15.738348 19.958001
50% 6.868362 13.862458 14.312992 15.870438 20.083564
75% 6.995801 14.027167 14.429383 15.984881 20.215722
max 7.369007 15.631300 21.587450 19.364991 20.755793
nproc 40
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 11.156514 16.936808 18.930412 25.605206 32.334239
std 0.613158 0.614545 0.485336 0.344226 0.398747
min 5.609261 13.147398 16.930261 23.448985 28.992899
25% 10.999876 16.740775 18.788180 25.481274 32.188020
50% 11.251502 16.883100 18.946506 25.648879 32.369347
75% 11.439205 17.032133 19.105678 25.806715 32.565019
max 12.155905 24.116348 26.152117 26.502637 33.263763
nproc 48
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 16.523705 18.214558 27.877811 37.703763 47.655792
std 0.974732 1.118383 0.357481 0.435081 0.472945
min 7.909358 16.279568 25.989797 35.308061 45.279940
25% 16.385582 17.960832 27.729399 37.555420 47.458123
50% 16.692900 18.137635 27.920459 37.767064 47.679325
75% 16.927355 18.311502 28.092018 37.950782 47.926311
max 17.720374 35.810409 28.721941 38.746273 49.333097
nproc 56
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 11.567668 25.100333 38.603884 52.135564 65.716669
std 0.320771 0.369833 0.554834 0.534120 0.612844
min 10.123811 22.598875 35.668780 49.182148 62.504962
25% 11.394438 24.925338 38.389200 51.885988 65.441492
50% 11.593920 25.135043 38.641839 52.206010 65.771692
75% 11.789101 25.328558 38.895343 52.451819 66.068270
max 12.319346 25.948404 46.458428 53.605888 67.270679
nproc 64
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 15.421295 33.254418 51.073912 68.936111 86.919074
std 0.398493 0.411222 0.551629 0.690891 0.694183
min 13.269859 30.900978 48.174802 65.549282 83.099271
25% 15.203732 33.037478 50.821702 68.619365 86.579749
50% 15.467885 33.279869 51.130972 69.001664 86.953804
75% 15.694466 33.514712 51.380860 69.361632 87.341084
max 16.347321 34.475095 52.507292 70.884752 88.807083
nproc 72
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 19.762286 42.488827 65.167763 87.903430 110.666679
std 0.483660 0.480269 0.689872 0.828354 0.892759
min 15.506067 39.937453 61.196633 84.227403 107.014850
25% 19.519194 42.261548 64.834133 87.515837 110.225142
50% 19.809986 42.541263 65.265768 87.974049 110.747980
75% 20.083315 42.792858 65.603762 88.392599 111.223192
max 20.913434 43.830009 66.791452 90.184550 113.062344
nproc 80
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 24.782285 52.853068 80.902314 109.112294 137.441640
std 0.523731 0.639160 0.799033 0.952619 1.091478
min 20.126615 47.813274 77.357915 104.033857 131.978443
25% 24.498501 52.547855 80.509926 108.606293 136.877050
50% 24.835766 52.918841 80.950773 109.197236 137.498470
75% 25.137887 53.244013 81.376380 109.723791 138.101133
max 26.161997 54.372957 83.266046 111.709888 140.419400
nproc 88
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 30.196867 64.467080 98.710365 133.024282 167.330900
std 0.749476 0.691460 0.863908 1.033780 1.240237
min 16.647491 60.034797 94.053510 128.281171 161.778166
25% 29.896764 64.121607 98.290368 132.484092 166.711172
50% 30.271808 64.514222 98.742714 133.089852 167.429483
75% 30.627200 64.903154 99.262584 133.706735 168.086624
max 31.806051 66.343856 101.077264 136.143873 170.449596
nproc 96
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 36.304100 77.194851 117.958001 158.820159 199.868940
std 0.712442 0.718565 1.009163 1.220813 1.462219
min 31.128111 73.850226 112.075970 152.910227 192.977453
25% 35.928427 76.811233 117.466922 158.151278 199.058411
50% 36.378220 77.209148 117.998878 158.879704 199.861157
75% 36.761744 77.636286 118.615380 159.583272 200.701769
max 38.069263 79.445286 120.878239 162.826438 206.826424
nproc 104
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 42.731401 90.887253 138.815476 186.824953 235.055458
std 1.045572 0.742232 0.999065 1.298818 1.554890
min 23.734733 87.384048 133.462821 180.971966 227.475939
25% 42.353032 90.441055 138.213962 186.109237 234.169575
50% 42.861112 90.900274 138.836083 186.835884 235.084204
75% 43.236527 91.382487 139.460129 187.694247 236.011148
max 44.600281 93.394394 141.959512 190.171221 239.491909
nproc 112
100 200 300 400
count 460.000000 460.000000 460.000000 460.000000
mean 49.782729 105.468739 161.416099 217.385757
std 0.904312 1.011980 1.222772 1.475225
min 45.334285 100.711113 156.087707 210.639527
25% 49.394518 104.971028 160.743875 216.590612
50% 49.906665 105.604756 161.528712 217.437408
75% 50.363428 106.088852 162.187166 218.286111
max 51.800116 108.372299 164.614385 221.788613
And now the same tests for tip+percpu_rwsem:
nproc 8
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.285784 0.639623 0.935062 1.165287 1.457565
std 0.040458 0.089317 0.112704 0.094596 0.110337
min 0.118961 0.253775 0.351943 0.869095 1.026194
25% 0.263250 0.600806 0.858630 1.100281 1.376566
50% 0.287019 0.649395 0.930437 1.167166 1.461235
75% 0.312601 0.692013 1.013786 1.228887 1.533511
max 0.407264 0.860837 1.298671 1.460842 1.927867
nproc 16
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 2.338683 5.219408 8.117279 11.050641 14.035433
std 0.146102 0.270400 0.392875 0.510692 0.576044
min 1.836110 4.179970 6.491748 8.998336 11.442838
25% 2.239374 5.042915 7.860587 10.728740 13.667630
50% 2.335801 5.217732 8.125243 11.052183 14.010561
75% 2.443152 5.404223 8.396037 11.404375 14.417740
max 2.798029 5.927344 9.172875 12.203548 15.444552
nproc 24
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 6.399927 13.673487 20.729554 27.316864 34.125202
std 0.558388 1.157996 1.647191 2.066864 2.487975
min 4.961608 10.767524 17.145018 22.441426 28.566438
25% 5.987118 12.849801 19.555979 25.943463 32.399122
50% 6.388215 13.583983 20.533054 27.122120 33.959403
75% 6.915310 14.786835 22.252796 29.187176 36.308254
max 7.405319 15.823960 23.858206 31.754922 38.997955
nproc 32
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 11.973832 24.885823 36.705614 48.036525 57.418669
std 1.270516 2.604583 3.963139 5.283237 6.441122
min 9.395066 19.958662 27.768684 38.247046 46.265231
25% 10.955417 22.708953 33.510437 43.613011 51.901209
50% 11.801515 24.556642 35.805816 47.315635 55.933447
75% 13.294692 27.520679 40.689642 53.139912 63.860584
max 14.217272 29.968337 44.409489 58.246754 71.045867
nproc 40
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 19.307414 39.204462 55.768040 70.808627 83.830246
std 2.189803 3.982241 5.467692 6.737372 8.124025
min 14.450258 30.606836 44.342114 55.520218 64.704178
25% 17.418113 35.968251 51.341042 65.352697 77.744806
50% 19.067713 39.023460 55.548934 70.282785 83.374667
75% 21.479466 42.666118 60.379906 76.604241 91.158904
max 23.687483 47.019928 67.143361 85.084045 100.957011
nproc 48
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 28.386773 55.462523 77.886706 92.579064 104.319703
std 3.231688 6.142373 8.633285 10.950222 12.510504
min 21.703659 42.486864 56.904221 66.605689 76.529646
25% 25.635256 50.575642 71.306694 82.931995 94.222776
50% 28.136694 55.235674 77.298409 91.993559 104.909015
75% 31.484979 60.645302 85.693462 102.195018 114.141212
max 35.713537 68.342796 96.065304 115.926497 130.916876
nproc 56
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 39.037206 74.470404 97.900979 111.320283 135.943281
std 4.594741 8.940246 11.715321 13.823450 16.032080
min 29.532559 55.193557 65.590273 79.580482 98.565733
25% 35.212004 66.990273 88.066459 100.643871 122.864654
50% 38.796902 73.928176 96.771490 110.669216 136.199617
75% 43.154846 82.041731 108.937264 120.727216 147.769269
max 49.215714 92.181542 125.188702 141.113117 170.961264
nproc 64
100 200 300 400 500
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean 51.099012 93.028015 114.649700 145.944300 178.043572
std 6.310777 12.719401 14.675830 18.019135 21.084448
min 36.770938 54.620852 80.837116 98.765936 126.207980
25% 45.955694 84.078285 103.452854 132.127548 160.746493
50% 50.275929 93.031565 114.333533 144.951788 177.105994
75% 56.955477 104.656181 128.418118 163.865640 197.275452
max 63.369715 120.360706 146.542148 182.482159 218.814651
nproc 72
100 200 300 400 500
count 506.000000 506.000000 506.000000 506.000000 506.000000
mean 64.905270 108.760098 138.811285 179.277895 222.584001
std 8.784532 16.293281 18.160401 21.203767 25.904456
min 43.035451 64.762288 96.401934 127.995159 162.341026
25% 58.658290 98.438247 126.035692 162.944645 202.228444
50% 64.756854 109.608197 139.190635 181.413255 223.359111
75% 72.488483 123.608470 152.745541 195.549278 245.454358
max 83.424516 139.214509 172.538610 218.677815 270.799895
nproc 80
100 200 300 400 500
count 61.000000 61.000000 61.000000 61.000000 61.000000
mean 76.727789 124.438489 174.095378 225.855798 272.416390
std 9.757928 18.034325 20.216132 24.868596 29.384832
min 55.988043 83.842137 130.842940 173.596051 208.508169
25% 69.218268 116.679810 162.149179 207.015727 252.194955
50% 75.392969 125.378519 173.117425 225.071270 276.188038
75% 83.748328 136.689138 192.392097 245.019530 296.407232
max 97.004966 165.172805 206.391629 266.751069 318.089290
nproc 88
100
count 157.000000
mean 90.337638
std 15.239911
min 53.393662
25% 79.648088
50% 91.075065
75% 103.530939
max 120.680507
And an attempt at visualization:
http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png
Let me know if these numbers help or not. I start to get better in
running those tests though they take quite some time to finish. So if
they are useless I sleep well without doing this :)
cheers,
daniel
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-06-30 21:32 ` Paul E. McKenney
@ 2015-07-01 11:56 ` Peter Zijlstra
2015-07-01 15:56 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-07-01 11:56 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Tue, Jun 30, 2015 at 02:32:58PM -0700, Paul E. McKenney wrote:
> > I had indeed forgotten that got farmed out to the kthread; on which, my
> > poor desktop seems to have spend ~140 minutes of its (most recent)
> > existence poking RCU things.
> >
> > 7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
> > 8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
> > 9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
> > 10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
> > 11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
> > 12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
> > 13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
> > 14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
> > 15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
> >
> > Which is almost as much time as my konsole:
> >
> > 2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
> >
> > Which seems somewhat excessive. But who knows.
>
> No idea. How long has that system been up? What has it been doing?
Some 40 odd days it seems. It's my desktop, I read email (in mutt in
Konsole), I type patches (in vim in Konsole), I compile kernels (in
Konsole) etc..
Now konsole is threaded and each new window/tab is just another thread
in the same process so runtime should accumulate. However I just found
that for some obscure reason there's two konsole processes around, and
the other is the one that I'm using most, it also has significantly more
runtime.
3264 ? Sl 452:43 \_ /usr/bin/konsole
Must be some of that brain damaged desktop shite that confused things --
I see the one is started with some -session argument. Some day I'll
discover how to destroy all that nonsense and make things behave as they
should.
> The rcu_sched overhead is expected behavior if the system has run between
> ten and one hundred million grace periods, give or take an order of
> magnitude depending on the number of idle CPUs and so on.
>
> The overhead for the RCU offload kthreads is what it is. A kfree() takes
> as much time as a kfree does, and they are all nicely counted up for you.
Yah, if only we could account it back to whoever caused it :/
> > Although here I'll once again go ahead and say something ignorant; how
> > come that's a problem? Surely if we know the kthread thing has finished
> > starting a GP, any one CPU issuing a full memory barrier (as would be
> > implied by switching to the stop worker) must then indeed observe that
> > global state? due to that transitivity thing.
> >
> > That is, I'm having a wee bit of bother for seeing how you'd need
> > manipulation of global variables as you elude to below.
>
> Well, I thought that you wanted to leverage the combining tree to
> determine when the grace period had completed. If a given CPU isn't
> pushing its quiescent states up the combining tree, then the combining
> tree can't do much for you.
Right that is what I wanted, and sure the combining thing needs to
happen with atomics, but that's not new, it already does that.
What I was talking about was the interaction between the force
quiescent state and the poking detecting that a QS had indeed been
started.
> Well, I do have something that seems reasonably straightforward. Sending
> the patches along separately. Not sure that it is worth its weight.
>
> The idea is that we keep the expedited grace periods working as they do
> now, independently of the normal grace period. The normal grace period
> takes a sequence number just after initialization, and checks to see
> if an expedited grace period happened in the meantime at the beginning
> of each quiescent-state forcing episode. This saves the last one or
> two quiescent-state forcing scans if the case where an expedited grace
> period really did happen.
>
> It is possible for the expedited grace period to help things along by
> waking up the grace-period kthread, but of course doing this too much
> further increases the time consumed by your rcu_sched kthread.
Ah so that is the purpose of that patch. Still, I'm having trouble
seeing how you can do this too much, you would only be waking it if
there was a GP pending completion, right? At which point waking it is
the right thing.
If you wake it unconditionally, even if there's nothing to do, then yes
that'd be a waste of cycles.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-07-01 11:56 ` Peter Zijlstra
@ 2015-07-01 15:56 ` Paul E. McKenney
2015-07-01 16:16 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-07-01 15:56 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jul 01, 2015 at 01:56:42PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 30, 2015 at 02:32:58PM -0700, Paul E. McKenney wrote:
>
> > > I had indeed forgotten that got farmed out to the kthread; on which, my
> > > poor desktop seems to have spend ~140 minutes of its (most recent)
> > > existence poking RCU things.
> > >
> > > 7 root 20 0 0 0 0 S 0.0 0.0 56:34.66 rcu_sched
> > > 8 root 20 0 0 0 0 S 0.0 0.0 20:58.19 rcuos/0
> > > 9 root 20 0 0 0 0 S 0.0 0.0 18:50.75 rcuos/1
> > > 10 root 20 0 0 0 0 S 0.0 0.0 18:30.62 rcuos/2
> > > 11 root 20 0 0 0 0 S 0.0 0.0 17:33.24 rcuos/3
> > > 12 root 20 0 0 0 0 S 0.0 0.0 2:43.54 rcuos/4
> > > 13 root 20 0 0 0 0 S 0.0 0.0 3:00.31 rcuos/5
> > > 14 root 20 0 0 0 0 S 0.0 0.0 3:09.27 rcuos/6
> > > 15 root 20 0 0 0 0 S 0.0 0.0 2:52.98 rcuos/7
> > >
> > > Which is almost as much time as my konsole:
> > >
> > > 2853 peterz 20 0 586240 103664 41848 S 1.0 0.3 147:39.50 konsole
> > >
> > > Which seems somewhat excessive. But who knows.
> >
> > No idea. How long has that system been up? What has it been doing?
>
> Some 40 odd days it seems. Its my desktop, I read email (in mutt in
> Konsole), I type patches (in vim in Konsole), I compile kernels (in
> Konsole) etc..
>
> Now konsole is threaded and each new window/tab is just another thread
> in the same process so runtime should accumulate. However I just found
> that for some obscure reason there's two konsole processes around, and
> the other is the one that I'm using most, it also has significantly more
> runtime.
>
> 3264 ? Sl 452:43 \_ /usr/bin/konsole
>
> Must be some of that brain damaged desktop shite that confused things --
> I see the one is stared with some -session argument. Some day I'll
> discover how to destroy all that nonsense and make things behave as they
> should.
Well, you appear to be using about 6% of a CPU, or 0.7% of the entire
8-CPU system for the RCU GP kthread. That is more than I would like to
see consumed.
Odd that you have four of eight of the rcuos CPUs with higher consumption
than the others. I would expect three of eight. Are you by chance running
an eight-core system with hyperthreading disabled in hardware, via boot
parameter, or via explicit offline? The real question I have is "is
nr_cpu_ids equal to 16 rather than to 8?"
A significant fraction of rcu_sched's CPU overhead is likely due to that
extra wakeup for the fourth leader rcuos kthread.
Also, do you have nohz_full set? Just wondering why callback offloading
is enabled. (If you want it enabled, fine, but from what I can see your
workload isn't being helped by it and it does have higher overhead.)
Even if you don't want offloading and do disable it, it would be good to
reduce the penalty. Is there something I can do to reduce the overhead
of waking several kthreads? Right now, I just do a series of wake_up()
calls, one for each leader rcuos kthread.
Oh, are you running v3.10 or some such? If so, there are some more
recent RCU changes that can help with this. They are called out here:
http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf
> > The rcu_sched overhead is expected behavior if the system has run between
> > ten and one hundred million grace periods, give or take an order of
> > magnitude depending on the number of idle CPUs and so on.
> >
> > The overhead for the RCU offload kthreads is what it is. A kfree() takes
> > as much time as a kfree does, and they are all nicely counted up for you.
>
> Yah, if only we could account it back to whomever caused it :/
It could be done, but would require increasing the size of rcu_head.
And would require costly fine-grained timing of callback execution.
Not something for production systems, I would guess.
> > > Although here I'll once again go ahead and say something ignorant; how
> > > come that's a problem? Surely if we know the kthread thing has finished
> > > starting a GP, any one CPU issuing a full memory barrier (as would be
> > > implied by switching to the stop worker) must then indeed observe that
> > > global state? due to that transitivity thing.
> > >
> > > That is, I'm having a wee bit of bother for seeing how you'd need
> > > manipulation of global variables as you elude to below.
> >
> > Well, I thought that you wanted to leverage the combining tree to
> > determine when the grace period had completed. If a given CPU isn't
> > pushing its quiescent states up the combining tree, then the combining
> > tree can't do much for you.
>
> Right that is what I wanted, and sure the combining thing needs to
> happen with atomics, but that's not new, it already does that.
>
> What I was talking about was the interaction between the force
> quiescence state and the poking detectoring that a QS had indeed be
> started.
It gets worse.
Suppose that a grace period is already in progress. You cannot leverage
its use of the combining tree because some of the CPUs might have already
indicated a quiescent state, which means that the current grace period
won't necessarily wait for all of the CPUs that the concurrent expedited
grace period needs to wait on. So you need to kick the current grace
period, wait for it to complete, wait for the next one to start (with
all the fun and exciting issues called out earlier), do the expedited
grace period, then wait for completion.
> > Well, I do have something that seems reasonably straightforward. Sending
> > the patches along separately. Not sure that it is worth its weight.
> >
> > The idea is that we keep the expedited grace periods working as they do
> > now, independently of the normal grace period. The normal grace period
> > takes a sequence number just after initialization, and checks to see
> > if an expedited grace period happened in the meantime at the beginning
> > of each quiescent-state forcing episode. This saves the last one or
> > two quiescent-state forcing scans if the case where an expedited grace
> > period really did happen.
> >
> > It is possible for the expedited grace period to help things along by
> > waking up the grace-period kthread, but of course doing this too much
> > further increases the time consumed by your rcu_sched kthread.
>
> Ah so that is the purpose of that patch. Still, I'm having trouble
> seeing how you can do this too much, you would only be waking it if
> there was a GP pending completion, right? At which point waking it is
> the right thing.
>
> If you wake it unconditionally, even if there's nothing to do, then yes
> that'd be a waste of cycles.
Heh! You are already complaining about rcu_sched consuming 0.7%
of your system, and rightfully so. Increasing this overhead still
further therefore cannot be considered a good thing unless there is some
overwhelming benefit. And I am not seeing that benefit. Perhaps due
to a failure of imagination, but until someone enlightens me, I have to
throttle the wakeups -- or, perhaps better, omit the wakeups entirely.
Actually, I am not convinced that I should push any of the patches that
leverage expedited grace periods to help out normal grace periods.
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-07-01 15:56 ` Paul E. McKenney
@ 2015-07-01 16:16 ` Peter Zijlstra
2015-07-01 18:45 ` Paul E. McKenney
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-07-01 16:16 UTC (permalink / raw)
To: Paul E. McKenney
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jul 01, 2015 at 08:56:55AM -0700, Paul E. McKenney wrote:
> On Wed, Jul 01, 2015 at 01:56:42PM +0200, Peter Zijlstra wrote:
> Odd that you have four of eight of the rcuos CPUs with higher consumption
> than the others. I would expect three of eight. Are you by chance running
> an eight-core system with hyperthreading disabled in hardware, via boot
> parameter, or via explicit offline? The real question I have is "is
> nr_cpu_ids equal to 16 rather than to 8?"
It should not, but I'd have to instrument to be sure. It's a regular
4 core + ht part.
model name : Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
> Also, do you have nohz_full set?
Nope..
> Just wondering why callback offloading
> is enabled. (If you want it enabled, fine, but from what I can see your
> workload isn't being helped by it and it does have higher overhead.)
I think this is a distro .config; every time I strip the desktop kernel
I end up needing a driver I hadn't built. Clearly I've not really paid
attention to the RCU options.
> Even if you don't want offloading and do disable it, it would be good to
> reduce the penalty. Is there something I can do to reduce the overhead
> of waking several kthreads? Right now, I just do a series of wake_up()
> calls, one for each leader rcuos kthread.
>
> Oh, are you running v3.10 or some such? If so, there are some more
> recent RCU changes that can help with this. They are called out here:
Not that old, but not something recent either. I'll upgrade and see if
it goes away. I really detest rebooting the desktop, but it needs to
happen every so often.
> > Yah, if only we could account it back to whomever caused it :/
>
> It could be done, but would require increasing the size of rcu_head.
> And would require costly fine-grained timing of callback execution.
> Not something for production systems, I would guess.
Nope :/ I know.
> > What I was talking about was the interaction between the force
> > quiescence state and the poking detecting that a QS had indeed been
> > started.
>
> It gets worse.
>
> Suppose that a grace period is already in progress. You cannot leverage
> its use of the combining tree because some of the CPUs might have already
> indicated a quiescent state, which means that the current grace period
> won't necessarily wait for all of the CPUs that the concurrent expedited
> grace period needs to wait on. So you need to kick the current grace
> period, wait for it to complete, wait for the next one to start (with
> all the fun and exciting issues called out earlier), do the expedited
> grace period, then wait for completion.
Ah yes. You do do find the fun cases :-)
> > If you wake it unconditionally, even if there's nothing to do, then yes
> > that'd be a waste of cycles.
>
> Heh! You are already complaining about rcu_sched consuming 0.7%
> of your system, and rightfully so. Increasing this overhead still
> further therefore cannot be considered a good thing unless there is some
> overwhelming benefit. And I am not seeing that benefit. Perhaps due
> to a failure of imagination, but until someone enlightens me, I have to
> throttle the wakeups -- or, perhaps better, omit the wakeups entirely.
>
> Actually, I am not convinced that I should push any of the patches that
> leverage expedited grace periods to help out normal grace periods.
It would seem a shame not to.. I've not yet had time to form a coherent
reply to that thread though.
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
2015-07-01 16:16 ` Peter Zijlstra
@ 2015-07-01 18:45 ` Paul E. McKenney
0 siblings, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-07-01 18:45 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
viro, torvalds
On Wed, Jul 01, 2015 at 06:16:40PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 01, 2015 at 08:56:55AM -0700, Paul E. McKenney wrote:
> > On Wed, Jul 01, 2015 at 01:56:42PM +0200, Peter Zijlstra wrote:
> > Odd that you have four of eight of the rcuos CPUs with higher consumption
> > than the others. I would expect three of eight. Are you by chance running
> > an eight-core system with hyperthreading disabled in hardware, via boot
> > parameter, or via explicit offline? The real question I have is "is
> > nr_cpu_ids equal to 16 rather than to 8?"
>
> It should not, but I'd have to instrument to be sure. It's a regular
> 4 core + ht part.
>
> model name : Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz
Well, if nr_cpu_ids is equal to 8, I likely need to recheck my math.
> > Also, do you have nohz_full set?
>
> Nope..
>
> > Just wondering why callback offloading
> > is enabled. (If you want it enabled, fine, but from what I can see your
> > workload isn't being helped by it and it does have higher overhead.)
>
> I think this is a distro .config; every time I strip the desktop kernel
> I end up needing a driver I hadn't built. Clearly I've not really paid
> attention to the RCU options.
OK, early versions of RHEL definitely would do what you have by default,
and I would need to check with Rik to find out what stuff got backported
when.
> > Even if you don't want offloading and do disable it, it would be good to
> > reduce the penalty. Is there something I can do to reduce the overhead
> > of waking several kthreads? Right now, I just do a series of wake_up()
> > calls, one for each leader rcuos kthread.
> >
> > Oh, are you running v3.10 or some such? If so, there are some more
> > recent RCU changes that can help with this. They are called out here:
>
> Not that old, but not something recent either. I'll upgrade and see if
> it goes away. I really detest rebooting the desktop, but it needs to
> happen every so often.
Feel free to send me the .config, the exact version, and any boot
parameters you have. That would allow me to tell you whether or not
moving ahead would do you any good.
> > > Yah, if only we could account it back to whomever caused it :/
> >
> > It could be done, but would require increasing the size of rcu_head.
> > And would require costly fine-grained timing of callback execution.
> > Not something for production systems, I would guess.
>
> Nope :/ I know.
>
> > > What I was talking about was the interaction between the force
> > > quiescence state and the poking detecting that a QS had indeed been
> > > started.
> >
> > It gets worse.
> >
> > Suppose that a grace period is already in progress. You cannot leverage
> > its use of the combining tree because some of the CPUs might have already
> > indicated a quiescent state, which means that the current grace period
> > won't necessarily wait for all of the CPUs that the concurrent expedited
> > grace period needs to wait on. So you need to kick the current grace
> > period, wait for it to complete, wait for the next one to start (with
> > all the fun and exciting issues called out earlier), do the expedited
> > grace period, then wait for completion.
>
> Ah yes. You do do find the fun cases :-)
Given that I am RCU maintainer, I had better be able to. A large
quantity of them rushed into my head when you suggested this, hence my
initial reaction. That said, Oleg is probably better at finding fun
cases than am I.
> > > If you wake it unconditionally, even if there's nothing to do, then yes
> > > that'd be a waste of cycles.
> >
> > Heh! You are already complaining about rcu_sched consuming 0.7%
> > of your system, and rightfully so. Increasing this overhead still
> > further therefore cannot be considered a good thing unless there is some
> > overwhelming benefit. And I am not seeing that benefit. Perhaps due
> > to a failure of imagination, but until someone enlightens me, I have to
> > throttle the wakeups -- or, perhaps better, omit the wakeups entirely.
> >
> > Actually, I am not convinced that I should push any of the patches that
> > leverage expedited grace periods to help out normal grace periods.
>
> It would seem a shame not to.. I've not yet had time to form a coherent
> reply to that thread though.
Well, it does increase complexity and coupling, and I don't see that
it provides big-animal benefits to justify this. Again, might be just
insufficient imagination on my part, but...
Thanx, Paul
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-07-01 5:57 ` Daniel Wagner
@ 2015-07-01 21:54 ` Linus Torvalds
2015-07-02 9:41 ` Peter Zijlstra
0 siblings, 1 reply; 106+ messages in thread
From: Linus Torvalds @ 2015-07-01 21:54 UTC (permalink / raw)
To: Daniel Wagner
Cc: Ingo Molnar, Peter Zijlstra, Oleg Nesterov, Paul McKenney,
Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr,
Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton
On Tue, Jun 30, 2015 at 10:57 PM, Daniel Wagner <wagi@monom.org> wrote:
>
> And an attempt at visualization:
>
> http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
> http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png
Ugh. The old numbers look (mostly) fairly tight, and then the new ones
are all over the map, and usually much worse.
We've seen this behavior before when switching from a non-sleeping
lock to a sleeping one. The sleeping locks have absolutely horrible
behavior when they get contended, and spend tons of CPU time on the
sleep/wakeup management, based on almost random timing noise. And it
can get orders of magnitude worse if there are any nested locks that
basically trigger trains of that kind of behavior.
In general, sleeping locks are just horribly horribly bad for things
that do small simple operations. Which is what fs/locks.c does.
I'm not convinced it's fixable. Maybe the new rwsem just isn't a good idea.
Linus
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-07-01 21:54 ` Linus Torvalds
@ 2015-07-02 9:41 ` Peter Zijlstra
2015-07-20 5:53 ` Daniel Wagner
0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-07-02 9:41 UTC (permalink / raw)
To: Linus Torvalds
Cc: Daniel Wagner, Ingo Molnar, Oleg Nesterov, Paul McKenney,
Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr,
Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton
On Wed, Jul 01, 2015 at 02:54:59PM -0700, Linus Torvalds wrote:
> On Tue, Jun 30, 2015 at 10:57 PM, Daniel Wagner <wagi@monom.org> wrote:
> >
> > And an attempt at visualization:
> >
> > http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
> > http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png
>
> Ugh. The old numbers look (mostly) fairly tight, and then the new ones
> are all over the map, and usually much worse.
>
> We've seen this behavior before when switching from a non-sleeping
> lock to a sleeping one. The sleeping locks have absolutely horrible
> behavior when they get contended, and spend tons of CPU time on the
> sleep/wakeup management,
Right, I'm just not seeing how any of that would happen here :/ The read
side would only ever block on reading /proc/$something and I'm fairly
sure that benchmark doesn't actually touch that file.
In any case, I will look into this, I've just not had time yet..
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-07-02 9:41 ` Peter Zijlstra
@ 2015-07-20 5:53 ` Daniel Wagner
2015-07-20 18:44 ` Linus Torvalds
0 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-07-20 5:53 UTC (permalink / raw)
To: Peter Zijlstra, Linus Torvalds
Cc: Ingo Molnar, Oleg Nesterov, Paul McKenney, Tejun Heo,
Ingo Molnar, Linux Kernel Mailing List, der.herr,
Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton
On 07/02/2015 11:41 AM, Peter Zijlstra wrote:
> On Wed, Jul 01, 2015 at 02:54:59PM -0700, Linus Torvalds wrote:
>> On Tue, Jun 30, 2015 at 10:57 PM, Daniel Wagner <wagi@monom.org> wrote:
>>>
>>> And an attempt at visualization:
>>>
>>> http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
>>> http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png
>>
>> Ugh. The old numbers look (mostly) fairly tight, and then the new ones
>> are all over the map, and usually much worse.
>>
>> We've seen this behavior before when switching from a non-sleeping
>> lock to a sleeping one. The sleeping locks have absolutely horrible
>> behavior when they get contended, and spend tons of CPU time on the
>> sleep/wakeup management,
>
> Right, I'm just not seeing how any of that would happen here :/ The read
> side would only ever block on reading /proc/$something and I'm fairly
> sure that benchmark doesn't actually touch that file.
>
> In any case, I will look into this, I've just not had time yet..
I did some more testing and found out that the slow path of percpu_down_read()
is never taken (as expected). The only change left is the exchange from a
percpu arch_spinlock_t spinlocks to percpu spinlock_t spinlocks.
Turning them back into arch_spinlock_t gives almost the same numbers as
with spinlock_t.
Then Peter suggested to change the code to
preempt_disable();
spin_unlock();
preempt_enable_no_resched();
to verify if arch_spin_lock() is buggy and does not disable preemption
and we see a lock holder preemption on non virt setups.
Here all the numbers and plots:
- base line
http://monom.org/posix01-4/tip-4.1.0-02756-ge3d06bd.png
http://monom.org/posix01-4/tip-4.1.0-02756-ge3d06bd.txt
- arch_spinlock_t
http://monom.org/posix01-4/arch_spintlock_t-4.1.0-02769-g6ce2591-dirty.png
http://monom.org/posix01-4/arch_spintlock_t-4.1.0-02769-g6ce2591-dirty.txt
http://monom.org/posix01-4/arch_spintlock_t-4.1.0-02769-g6ce2591-dirty.patch
- no resched
http://monom.org/posix01-4/no_resched-4.1.0-02770-g4d518cf.png
http://monom.org/posix01-4/no_resched-4.1.0-02770-g4d518cf.txt
http://monom.org/posix01-4/no_resched-4.1.0-02770-g4d518cf.patch
cheers,
daniel
^ permalink raw reply [flat|nested] 106+ messages in thread
* Re: [RFC][PATCH 00/13] percpu rwsem -v2
2015-07-20 5:53 ` Daniel Wagner
@ 2015-07-20 18:44 ` Linus Torvalds
0 siblings, 0 replies; 106+ messages in thread
From: Linus Torvalds @ 2015-07-20 18:44 UTC (permalink / raw)
To: Daniel Wagner
Cc: Peter Zijlstra, Ingo Molnar, Oleg Nesterov, Paul McKenney,
Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr,
Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton
On Sun, Jul 19, 2015 at 10:53 PM, Daniel Wagner <wagi@monom.org> wrote:
>
> Turning them back into arch_spinlock_t gives almost the same numbers as
> with spinlock_t.
>
> Then Peter suggested to change the code to
>
> preempt_disable();
> spin_unlock();
> preempt_enable_no_resched();
>
> to verify if arch_spin_lock() is buggy and does not disable preemption
> and we see a lock holder preemption on non virt setups.
Hmm. "arch_spin_lock()" isn't _supposed_ to disable preemption. The
caller should do that (possibly by disabling interrupts). See
include/linux/spinlock_api_smp.h for details.
But yes, that's a *very* subtle difference between "arch_spin_lock()"
and "spin_lock()". The former doesn't do lockdep or other debugging
and it doesn't disable preemption. So they are not interchangeable.
The current lglocks uses arch_spin_lock exactly because it does not
*want* lockdep tracking (it does its own) and because it does its own
preemption handling.
So saying "verify if arch_spin_lock() is buggy and does not disable
preemption" is complete BS. If arch_spin_lock() were to disable
preemption, _that_ would be a bug.
Linus
^ permalink raw reply [flat|nested] 106+ messages in thread
end of thread, other threads:[~2015-07-20 18:44 UTC | newest]
Thread overview: 106+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
2015-06-22 23:02 ` Oleg Nesterov
2015-06-23 7:28 ` Nicholas Mc Guire
2015-06-25 19:08 ` Peter Zijlstra
2015-06-25 19:17 ` Tejun Heo
2015-06-29 9:32 ` Peter Zijlstra
2015-06-29 15:12 ` Tejun Heo
2015-06-29 15:14 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
2015-06-22 23:08 ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 07/13] sched: Reorder task_struct Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
2015-06-22 22:57 ` Oleg Nesterov
2015-06-23 7:16 ` Peter Zijlstra
2015-06-23 17:01 ` Oleg Nesterov
2015-06-23 17:53 ` Peter Zijlstra
2015-06-24 13:50 ` Oleg Nesterov
2015-06-24 14:13 ` Peter Zijlstra
2015-06-24 15:12 ` Oleg Nesterov
2015-06-24 16:15 ` Peter Zijlstra
2015-06-28 23:56 ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2015-06-28 23:56 ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
2015-06-28 23:56 ` [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers Oleg Nesterov
2015-06-28 23:56 ` [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
2015-06-23 0:19 ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
2015-06-22 22:21 ` Oleg Nesterov
2015-06-23 10:09 ` Peter Zijlstra
2015-06-23 10:55 ` Peter Zijlstra
2015-06-23 11:20 ` Peter Zijlstra
2015-06-23 13:08 ` Peter Zijlstra
2015-06-23 16:36 ` Oleg Nesterov
2015-06-23 17:30 ` Paul E. McKenney
2015-06-23 18:04 ` Peter Zijlstra
2015-06-23 18:26 ` Paul E. McKenney
2015-06-23 19:05 ` Paul E. McKenney
2015-06-24 2:23 ` Paul E. McKenney
2015-06-24 8:32 ` Peter Zijlstra
2015-06-24 9:31 ` Peter Zijlstra
2015-06-24 13:48 ` Paul E. McKenney
2015-06-24 15:01 ` Paul E. McKenney
2015-06-24 15:34 ` Peter Zijlstra
2015-06-24 7:35 ` Peter Zijlstra
2015-06-24 8:42 ` Ingo Molnar
2015-06-24 13:39 ` Paul E. McKenney
2015-06-24 13:43 ` Ingo Molnar
2015-06-24 14:03 ` Paul E. McKenney
2015-06-24 14:50 ` Paul E. McKenney
2015-06-24 15:01 ` Peter Zijlstra
2015-06-24 15:27 ` Paul E. McKenney
2015-06-24 15:40 ` Peter Zijlstra
2015-06-24 16:09 ` Paul E. McKenney
2015-06-24 16:42 ` Peter Zijlstra
2015-06-24 17:10 ` Paul E. McKenney
2015-06-24 17:20 ` Paul E. McKenney
2015-06-24 17:29 ` Peter Zijlstra
2015-06-24 17:28 ` Peter Zijlstra
2015-06-24 17:32 ` Peter Zijlstra
2015-06-24 18:14 ` Peter Zijlstra
2015-06-24 17:58 ` Peter Zijlstra
2015-06-25 3:23 ` Paul E. McKenney
2015-06-25 11:07 ` Peter Zijlstra
2015-06-25 13:47 ` Paul E. McKenney
2015-06-25 14:20 ` Peter Zijlstra
2015-06-25 14:51 ` Paul E. McKenney
2015-06-26 12:32 ` Peter Zijlstra
2015-06-26 16:14 ` Paul E. McKenney
2015-06-29 7:56 ` Peter Zijlstra
2015-06-30 21:32 ` Paul E. McKenney
2015-07-01 11:56 ` Peter Zijlstra
2015-07-01 15:56 ` Paul E. McKenney
2015-07-01 16:16 ` Peter Zijlstra
2015-07-01 18:45 ` Paul E. McKenney
2015-06-23 14:39 ` Paul E. McKenney
2015-06-23 16:20 ` Oleg Nesterov
2015-06-23 17:24 ` Oleg Nesterov
2015-06-25 19:18 ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 13/13] locking: " Peter Zijlstra
2015-06-22 12:36 ` [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
2015-06-22 18:11 ` Daniel Wagner
2015-06-22 19:05 ` Peter Zijlstra
2015-06-23 9:35 ` Daniel Wagner
2015-06-23 10:00 ` Ingo Molnar
2015-06-23 14:34 ` Peter Zijlstra
2015-06-23 14:56 ` Daniel Wagner
2015-06-23 17:50 ` Peter Zijlstra
2015-06-23 19:36 ` Peter Zijlstra
2015-06-24 8:46 ` Ingo Molnar
2015-06-24 9:01 ` Peter Zijlstra
2015-06-24 9:18 ` Daniel Wagner
2015-07-01 5:57 ` Daniel Wagner
2015-07-01 21:54 ` Linus Torvalds
2015-07-02 9:41 ` Peter Zijlstra
2015-07-20 5:53 ` Daniel Wagner
2015-07-20 18:44 ` Linus Torvalds
2015-06-22 20:06 ` Linus Torvalds
2015-06-23 16:10 ` Davidlohr Bueso
2015-06-23 16:21 ` Peter Zijlstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).