* [RFC][PATCH 00/13] percpu rwsem -v2
@ 2015-06-22 12:16 Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
                   ` (16 more replies)
  0 siblings, 17 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

This is a derived work of the cpu hotplug lock rework I did in 2013, which never
really went anywhere because Linus didn't like it.

This applies those same optimizations to the percpu-rwsem. Seeing how we did
all the work, it seemed a waste not to use it at all. Linus still didn't like it
because there was only a single user; there are two now:

 - uprobes
 - cgroups

This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
user.

Also, since Linus thinks lglock is a failed locking primitive (which I
wholeheartedly agree with; its preempt-disable latencies are an abomination),
it also converts the global part of fs/locks's lglock usage over to a
percpu-rwsem and uses a per-cpu spinlock for the local part. This both
provides another (4th) percpu-rwsem user and removes an lglock user.

It further removes the stop_machine lglock usage, and with it kills lglocks.

Changes since -v1:

 - Added the missing smp_load_acquire()/smp_store_release() as spotted by Oleg
 - Added percpu_down_read_trylock()
 - Converted the cpu hotplug lock
 - Converted fs/locks
 - Removed lglock from stop_machine
 - Removed lglock

---
 Documentation/locking/lglock.txt | 166 -------------------------
 fs/Kconfig                       |   1 +
 fs/file_table.c                  |   1 -
 fs/locks.c                       |  65 +++++++---
 include/linux/cpu.h              |   6 +
 include/linux/lglock.h           |  81 -------------
 include/linux/percpu-rwsem.h     |  96 +++++++++++++--
 include/linux/sched.h            |   9 +-
 init/main.c                      |   1 +
 kernel/cpu.c                     | 130 ++++++--------------
 kernel/fork.c                    |   2 +
 kernel/locking/Makefile          |   1 -
 kernel/locking/lglock.c          | 111 -----------------
 kernel/locking/percpu-rwsem.c    | 255 +++++++++++++++++++++------------------
 kernel/rcu/Makefile              |   2 +-
 kernel/stop_machine.c            |  52 ++++----
 lib/Kconfig                      |  10 ++
 17 files changed, 371 insertions(+), 618 deletions(-)


* [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
                   ` (15 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: oleg_nesterov-2_rcu-create_rcu_sync_infrastructure.patch --]
[-- Type: text/plain, Size: 5875 bytes --]

The rcu_sync infrastructure introduced here is functionally equivalent to

        struct rcu_sync_struct {
                atomic_t counter;
        };

        static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
        {
                return atomic_read(&rss->counter) == 0;
        }

        static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
        {
                atomic_inc(&rss->counter);
                synchronize_sched();
        }

        static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
        {
                synchronize_sched();
                atomic_dec(&rss->counter);
        }

except that it records the state, and synchronize_sched() is called only
from rcu_sync_enter() and only when necessary.
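
For illustration, a minimal usage sketch of the intended pattern (my_rss,
my_read_side() and my_write_side(), and the slow path itself, are made-up
names, not part of this patch):

        static DEFINE_RCU_SCHED_SYNC(my_rss);

        void my_read_side(void)
        {
                rcu_read_lock_sched();
                if (likely(rcu_sync_is_idle(&my_rss))) {
                        /* fast path: no writer active or pending */
                } else {
                        /* slow path: fall back to a real lock */
                }
                rcu_read_unlock_sched();
        }

        void my_write_side(void)
        {
                rcu_sync_enter(&my_rss);        /* forces readers onto the slow path */
                /* ... exclusive work ... */
                rcu_sync_exit(&my_rss);         /* fast path returns after a grace period */
        }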

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |   64 ++++++++++++++++++++++++++++
 kernel/rcu/Makefile     |    2 
 kernel/rcu/sync.c       |  108 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+), 1 deletion(-)

--- /dev/null
+++ b/include/linux/rcusync.h
@@ -0,0 +1,64 @@
+#ifndef _LINUX_RCUSYNC_H_
+#define _LINUX_RCUSYNC_H_
+
+#include <linux/wait.h>
+#include <linux/rcupdate.h>
+
+struct rcu_sync_struct {
+	int			gp_state;
+	int			gp_count;
+	wait_queue_head_t	gp_wait;
+
+	int			cb_state;
+	struct rcu_head		cb_head;
+
+	void (*sync)(void);
+	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+};
+
+#define ___RCU_SYNC_INIT(name)						\
+	.gp_state = 0,							\
+	.gp_count = 0,							\
+	.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),		\
+	.cb_state = 0
+
+#define __RCU_SCHED_SYNC_INIT(name) {					\
+	___RCU_SYNC_INIT(name),						\
+	.sync = synchronize_sched,					\
+	.call = call_rcu_sched,						\
+}
+
+#define __RCU_BH_SYNC_INIT(name) {					\
+	___RCU_SYNC_INIT(name),						\
+	.sync = synchronize_rcu_bh,					\
+	.call = call_rcu_bh,						\
+}
+
+#define __RCU_SYNC_INIT(name) {						\
+	___RCU_SYNC_INIT(name),						\
+	.sync = synchronize_rcu,					\
+	.call = call_rcu,						\
+}
+
+#define DEFINE_RCU_SCHED_SYNC(name)					\
+	struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
+
+#define DEFINE_RCU_BH_SYNC(name)					\
+	struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
+
+#define DEFINE_RCU_SYNC(name)						\
+	struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
+
+static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+	return !rss->gp_state; /* GP_IDLE */
+}
+
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
+extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_enter(struct rcu_sync_struct *);
+extern void rcu_sync_exit(struct rcu_sync_struct *);
+
+#endif /* _LINUX_RCUSYNC_H_ */
+
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
 obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += tree.o
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,108 @@
+
+#include <linux/rcusync.h>
+#include <linux/sched.h>
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define	rss_lock	gp_wait.lock
+
+void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+{
+	memset(rss, 0, sizeof(*rss));
+	init_waitqueue_head(&rss->gp_wait);
+
+	switch (type) {
+	case RCU_SYNC:
+		rss->sync = synchronize_rcu;
+		rss->call = call_rcu;
+		break;
+
+	case RCU_SCHED_SYNC:
+		rss->sync = synchronize_sched;
+		rss->call = call_rcu_sched;
+		break;
+
+	case RCU_BH_SYNC:
+		rss->sync = synchronize_rcu_bh;
+		rss->call = call_rcu_bh;
+		break;
+	}
+}
+
+void rcu_sync_enter(struct rcu_sync_struct *rss)
+{
+	bool need_wait, need_sync;
+
+	spin_lock_irq(&rss->rss_lock);
+	need_wait = rss->gp_count++;
+	need_sync = rss->gp_state == GP_IDLE;
+	if (need_sync)
+		rss->gp_state = GP_PENDING;
+	spin_unlock_irq(&rss->rss_lock);
+
+	BUG_ON(need_wait && need_sync);
+
+	if (need_sync) {
+		rss->sync();
+		rss->gp_state = GP_PASSED;
+		wake_up_all(&rss->gp_wait);
+	} else if (need_wait) {
+		wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+	} else {
+		/*
+		 * Possible when there's a pending CB from a rcu_sync_exit().
+		 * Nobody has yet been allowed the 'fast' path and thus we can
+		 * avoid doing any sync(). The callback will get 'dropped'.
+		 */
+		BUG_ON(rss->gp_state != GP_PASSED);
+	}
+}
+
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+	struct rcu_sync_struct *rss =
+		container_of(rcu, struct rcu_sync_struct, cb_head);
+	unsigned long flags;
+
+
+	BUG_ON(rss->gp_state != GP_PASSED);
+	BUG_ON(rss->cb_state == CB_IDLE);
+
+	spin_lock_irqsave(&rss->rss_lock, flags);
+	if (rss->gp_count) {
+		/*
+		 * A new rcu_sync_enter() has happened; drop the callback.
+		 */
+		rss->cb_state = CB_IDLE;
+	} else if (rss->cb_state == CB_REPLAY) {
+		/*
+		 * A new rcu_sync_exit() has happened; requeue the callback
+		 * to catch a later GP.
+		 */
+		rss->cb_state = CB_PENDING;
+		rss->call(&rss->cb_head, rcu_sync_func);
+	} else {
+		/*
+		 * We're at least a GP after rcu_sync_exit(); everybody will now
+		 * have observed the write side critical section. Let 'em rip!
+		 */
+		rss->cb_state = CB_IDLE;
+		rss->gp_state = GP_IDLE;
+	}
+	spin_unlock_irqrestore(&rss->rss_lock, flags);
+}
+
+void rcu_sync_exit(struct rcu_sync_struct *rss)
+{
+	spin_lock_irq(&rss->rss_lock);
+	if (!--rss->gp_count) {
+		if (rss->cb_state == CB_IDLE) {
+			rss->cb_state = CB_PENDING;
+			rss->call(&rss->cb_head, rcu_sync_func);
+		} else if (rss->cb_state == CB_PENDING) {
+			rss->cb_state = CB_REPLAY;
+		}
+	}
+	spin_unlock_irq(&rss->rss_lock);
+}



* [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
                   ` (14 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: oleg_nesterov-4_rcusync-introduce_struct_rcu_sync_ops.patch --]
[-- Type: text/plain, Size: 5102 bytes --]

Add the new struct rcu_sync_ops which holds the sync/call methods, and
replace the per-instance function pointers in rcu_sync_struct with a
gp_type index into a static array of these ops.

This simplifies the "init" helpers and makes it easier to add the new
methods we will need, especially ones that have to be ifdef'ed.

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |   60 ++++++++++++++++++------------------------------
 kernel/rcu/sync.c       |   43 +++++++++++++++++-----------------
 2 files changed, 45 insertions(+), 58 deletions(-)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -4,6 +4,8 @@
 #include <linux/wait.h>
 #include <linux/rcupdate.h>
 
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
 struct rcu_sync_struct {
 	int			gp_state;
 	int			gp_count;
@@ -12,53 +14,37 @@ struct rcu_sync_struct {
 	int			cb_state;
 	struct rcu_head		cb_head;
 
-	void (*sync)(void);
-	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+	enum rcu_sync_type	gp_type;
 };
 
-#define ___RCU_SYNC_INIT(name)						\
-	.gp_state = 0,							\
-	.gp_count = 0,							\
-	.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),		\
-	.cb_state = 0
-
-#define __RCU_SCHED_SYNC_INIT(name) {					\
-	___RCU_SYNC_INIT(name),						\
-	.sync = synchronize_sched,					\
-	.call = call_rcu_sched,						\
-}
-
-#define __RCU_BH_SYNC_INIT(name) {					\
-	___RCU_SYNC_INIT(name),						\
-	.sync = synchronize_rcu_bh,					\
-	.call = call_rcu_bh,						\
-}
-
-#define __RCU_SYNC_INIT(name) {						\
-	___RCU_SYNC_INIT(name),						\
-	.sync = synchronize_rcu,					\
-	.call = call_rcu,						\
-}
-
-#define DEFINE_RCU_SCHED_SYNC(name)					\
-	struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
-
-#define DEFINE_RCU_BH_SYNC(name)					\
-	struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
-
-#define DEFINE_RCU_SYNC(name)						\
-	struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
-
 static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
 {
 	return !rss->gp_state; /* GP_IDLE */
 }
 
-enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
-
 extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
 extern void rcu_sync_enter(struct rcu_sync_struct *);
 extern void rcu_sync_exit(struct rcu_sync_struct *);
 
+#define __RCU_SYNC_INITIALIZER(name, type) {				\
+		.gp_state = 0,						\
+		.gp_count = 0,						\
+		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
+		.cb_state = 0,						\
+		.gp_type = type,					\
+	}
+
+#define	__DEFINE_RCU_SYNC(name, type)	\
+	struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+
+#define DEFINE_RCU_SYNC(name)		\
+	__DEFINE_RCU_SYNC(name, RCU_SYNC)
+
+#define DEFINE_RCU_SCHED_SYNC(name)	\
+	__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+
+#define DEFINE_RCU_BH_SYNC(name)	\
+	__DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+
 #endif /* _LINUX_RCUSYNC_H_ */
 
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,7 +1,24 @@
-
 #include <linux/rcusync.h>
 #include <linux/sched.h>
 
+static const struct {
+	void (*sync)(void);
+	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+} gp_ops[] = {
+	[RCU_SYNC] = {
+		.sync = synchronize_rcu,
+		.call = call_rcu,
+	},
+	[RCU_SCHED_SYNC] = {
+		.sync = synchronize_sched,
+		.call = call_rcu_sched,
+	},
+	[RCU_BH_SYNC] = {
+		.sync = synchronize_rcu_bh,
+		.call = call_rcu_bh,
+	},
+};
+
 enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
 enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
 
@@ -11,23 +28,7 @@ void rcu_sync_init(struct rcu_sync_struc
 {
 	memset(rss, 0, sizeof(*rss));
 	init_waitqueue_head(&rss->gp_wait);
-
-	switch (type) {
-	case RCU_SYNC:
-		rss->sync = synchronize_rcu;
-		rss->call = call_rcu;
-		break;
-
-	case RCU_SCHED_SYNC:
-		rss->sync = synchronize_sched;
-		rss->call = call_rcu_sched;
-		break;
-
-	case RCU_BH_SYNC:
-		rss->sync = synchronize_rcu_bh;
-		rss->call = call_rcu_bh;
-		break;
-	}
+	rss->gp_type = type;
 }
 
 void rcu_sync_enter(struct rcu_sync_struct *rss)
@@ -44,7 +45,7 @@ void rcu_sync_enter(struct rcu_sync_stru
 	BUG_ON(need_wait && need_sync);
 
 	if (need_sync) {
-		rss->sync();
+		gp_ops[rss->gp_type].sync();
 		rss->gp_state = GP_PASSED;
 		wake_up_all(&rss->gp_wait);
 	} else if (need_wait) {
@@ -81,7 +82,7 @@ static void rcu_sync_func(struct rcu_hea
 		 * to catch a later GP.
 		 */
 		rss->cb_state = CB_PENDING;
-		rss->call(&rss->cb_head, rcu_sync_func);
+		gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
 	} else {
 		/*
 		 * We're at least a GP after rcu_sync_exit(); everybody will now
@@ -99,7 +100,7 @@ void rcu_sync_exit(struct rcu_sync_struc
 	if (!--rss->gp_count) {
 		if (rss->cb_state == CB_IDLE) {
 			rss->cb_state = CB_PENDING;
-			rss->call(&rss->cb_head, rcu_sync_func);
+			gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
 		} else if (rss->cb_state == CB_PENDING) {
 			rss->cb_state = CB_REPLAY;
 		}



* [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
                   ` (13 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: oleg_nesterov-5_rcusync-add_the_config_prove_rcu_checks.patch --]
[-- Type: text/plain, Size: 2438 bytes --]

It would be nice to validate that the caller of rcu_sync_is_idle()
holds the corresponding type of RCU read-side lock. Add the new
rcu_sync_ops->held() method and change rcu_sync_is_idle() to
WARN() if it returns false.

This obviously penalizes the readers (fast-path), but only if
CONFIG_PROVE_RCU.
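
For illustration only (my_rss and my_reader() are hypothetical, not from this
patch), the check expects the fast-path test to run under the matching
read-side lock; here for an RCU_SCHED_SYNC instance:

        void my_reader(void)
        {
                rcu_read_lock_sched();          /* satisfies rcu_read_lock_sched_held() */
                if (rcu_sync_is_idle(&my_rss)) {
                        /* fast path */
                }
                rcu_read_unlock_sched();

                /*
                 * Calling rcu_sync_is_idle() here, outside the read-side
                 * section (and in a preemptible context), would trip the
                 * new WARN under CONFIG_PROVE_RCU.
                 */
        }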

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |    6 ++++++
 kernel/rcu/sync.c       |   21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -17,9 +17,15 @@ struct rcu_sync_struct {
 	enum rcu_sync_type	gp_type;
 };
 
+extern bool __rcu_sync_is_idle(struct rcu_sync_struct *);
+
 static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
 {
+#ifdef CONFIG_PROVE_RCU
+	return __rcu_sync_is_idle(rss);
+#else
 	return !rss->gp_state; /* GP_IDLE */
+#endif
 }
 
 extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,21 +1,33 @@
 #include <linux/rcusync.h>
 #include <linux/sched.h>
 
+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func)	.held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
 static const struct {
 	void (*sync)(void);
 	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+#ifdef CONFIG_PROVE_RCU
+	int  (*held)(void);
+#endif
 } gp_ops[] = {
 	[RCU_SYNC] = {
 		.sync = synchronize_rcu,
 		.call = call_rcu,
+		__INIT_HELD(rcu_read_lock_held)
 	},
 	[RCU_SCHED_SYNC] = {
 		.sync = synchronize_sched,
 		.call = call_rcu_sched,
+		__INIT_HELD(rcu_read_lock_sched_held)
 	},
 	[RCU_BH_SYNC] = {
 		.sync = synchronize_rcu_bh,
 		.call = call_rcu_bh,
+		__INIT_HELD(rcu_read_lock_bh_held)
 	},
 };
 
@@ -24,6 +36,15 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLA
 
 #define	rss_lock	gp_wait.lock
 
+#ifdef CONFIG_PROVE_RCU
+bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+	WARN_ON(!gp_ops[rss->gp_type].held());
+	return rss->gp_state == GP_IDLE;
+}
+EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
+#endif
+
 void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
 {
 	memset(rss, 0, sizeof(*rss));



* [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor()
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
                   ` (12 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: oleg_nesterov-6_rcusync-introduce_rcu_sync_dtor.patch --]
[-- Type: text/plain, Size: 2278 bytes --]

Add the new rcu_sync_ops->wait() method and the new helper,
rcu_sync_dtor().

It is needed if you are going to, say, kfree(rcu_sync_object).
It simply calls ops->wait() to "flush" the potentially pending
rcu callback.
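
A hedged sketch of the intended teardown order (struct my_object and
my_object_free() are illustrative, not from this patch):

        struct my_object {
                struct rcu_sync_struct  rss;
                /* ... */
        };

        void my_object_free(struct my_object *obj)
        {
                /* all rcu_sync_enter()s must have been matched by now */
                rcu_sync_dtor(&obj->rss);       /* flush any pending callback */
                kfree(obj);                     /* safe: no callback can touch rss anymore */
        }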

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |    1 +
 kernel/rcu/sync.c       |   22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -31,6 +31,7 @@ static inline bool rcu_sync_is_idle(stru
 extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
 extern void rcu_sync_enter(struct rcu_sync_struct *);
 extern void rcu_sync_exit(struct rcu_sync_struct *);
+extern void rcu_sync_dtor(struct rcu_sync_struct *);
 
 #define __RCU_SYNC_INITIALIZER(name, type) {				\
 		.gp_state = 0,						\
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,6 +10,7 @@
 static const struct {
 	void (*sync)(void);
 	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+	void (*wait)(void);
 #ifdef CONFIG_PROVE_RCU
 	int  (*held)(void);
 #endif
@@ -17,16 +18,19 @@ static const struct {
 	[RCU_SYNC] = {
 		.sync = synchronize_rcu,
 		.call = call_rcu,
+		.wait = rcu_barrier,
 		__INIT_HELD(rcu_read_lock_held)
 	},
 	[RCU_SCHED_SYNC] = {
 		.sync = synchronize_sched,
 		.call = call_rcu_sched,
+		.wait = rcu_barrier_sched,
 		__INIT_HELD(rcu_read_lock_sched_held)
 	},
 	[RCU_BH_SYNC] = {
 		.sync = synchronize_rcu_bh,
 		.call = call_rcu_bh,
+		.wait = rcu_barrier_bh,
 		__INIT_HELD(rcu_read_lock_bh_held)
 	},
 };
@@ -128,3 +132,21 @@ void rcu_sync_exit(struct rcu_sync_struc
 	}
 	spin_unlock_irq(&rss->rss_lock);
 }
+
+void rcu_sync_dtor(struct rcu_sync_struct *rss)
+{
+	int cb_state;
+
+	BUG_ON(rss->gp_count);
+
+	spin_lock_irq(&rss->rss_lock);
+	if (rss->cb_state == CB_REPLAY)
+		rss->cb_state = CB_PENDING;
+	cb_state = rss->cb_state;
+	spin_unlock_irq(&rss->rss_lock);
+
+	if (cb_state != CB_IDLE) {
+		gp_ops[rss->gp_type].wait();
+		BUG_ON(rss->cb_state != CB_IDLE);
+	}
+}



* [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 23:02   ` Oleg Nesterov
  2015-06-23  7:28   ` Nicholas Mc Guire
  2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
                   ` (11 subsequent siblings)
  16 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-zijlstra-opt-percpu-rwsem.patch --]
[-- Type: text/plain, Size: 13552 bytes --]

Currently the percpu-rwsem has two issues:

 - it switches to (global) atomic ops while a writer is waiting,
   which could be quite a while, and this slows down releasing the readers.

 - it employs synchronize_sched_expedited() _twice_ which is evil and
   should die -- it shoots IPIs around the machine.

This patch cures the first problem by ordering the reader-state vs
reader-count (see the comments in __percpu_down_read() and
percpu_down_write()). This changes a global atomic op into a full
memory barrier, which doesn't have the global cacheline contention.

It cures the second problem by employing Oleg's rcu_sync primitives,
which reduces this to zero sync_sched() calls in the 'normal' case of
no write contention -- global locks had better be rare -- and to at
most one sync_sched() call in case of contention.
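
In condensed form, and leaving out the barrier comments that the actual code
below carries, the scheme is roughly (a sketch, not the literal code):

        /* reader */
        preempt_disable();
        __this_cpu_inc(*sem->refcount);
        if (unlikely(!rcu_sync_is_idle(&sem->rss)))
                __percpu_down_read(sem);        /* smp_mb(); check sem->state and
                                                   possibly back out onto sem->rw_sem */
        preempt_enable();

        /* writer */
        down_write(&sem->rw_sem);               /* exclude writers and slow-path readers */
        rcu_sync_enter(&sem->rss);              /* at most one sync_sched() */
        sem->state = readers_block;
        smp_mb();                               /* pairs with the reader side */
        wait_event(sem->writer, readers_active_check(sem));

Either a late reader observes readers_block and backs out again (decrementing
its count), or the writer observes that reader's increment and waits for it;
the barrier pairing guarantees at least one of the two happens.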

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h  |   62 +++++++++-
 kernel/locking/percpu-rwsem.c |  243 ++++++++++++++++++++++--------------------
 2 files changed, 182 insertions(+), 123 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,18 +5,64 @@
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/wait.h>
+#include <linux/rcusync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
-	unsigned int __percpu	*fast_read_ctr;
-	atomic_t		write_ctr;
+	unsigned int __percpu	*refcount;
+	int			state;
+	struct rcu_sync_struct	rss;
+	wait_queue_head_t	writer;
 	struct rw_semaphore	rw_sem;
-	atomic_t		slow_read_ctr;
-	wait_queue_head_t	write_waitq;
 };
 
-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+	might_sleep();
+
+	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+	preempt_disable();
+	/*
+	 * We are in an RCU-sched read-side critical section, so the writer
+	 * cannot both change sem->state from readers_fast and start
+	 * checking counters while we are here. So if we see !sem->state,
+	 * we know that the writer won't be checking until we past the
+	 * we know that the writer won't be checking until we are past the
+	 * writer will see anything we did within this RCU-sched read-side
+	 * critical section.
+	 */
+	__this_cpu_inc(*sem->refcount);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		__percpu_down_read(sem); /* Unconditional memory barrier. */
+	preempt_enable();
+	/*
+	 * The barrier() from preempt_enable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	/*
+	 * The barrier() in preempt_disable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+	preempt_disable();
+	/*
+	 * Same as in percpu_down_read().
+	 */
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_dec(*sem->refcount);
+	else
+		__percpu_up_read(sem); /* Unconditional memory barrier. */
+	preempt_enable();
+
+	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
@@ -25,10 +71,10 @@ extern int __percpu_init_rwsem(struct pe
 				const char *, struct lock_class_key *);
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
-#define percpu_init_rwsem(brw)	\
+#define percpu_init_rwsem(sem)					\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
 #endif
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,158 +8,171 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+enum { readers_slow, readers_block };
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *rwsem_key)
 {
-	brw->fast_read_ctr = alloc_percpu(int);
-	if (unlikely(!brw->fast_read_ctr))
+	sem->refcount = alloc_percpu(unsigned int);
+	if (unlikely(!sem->refcount))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
-	__init_rwsem(&brw->rw_sem, name, rwsem_key);
-	atomic_set(&brw->write_ctr, 0);
-	atomic_set(&brw->slow_read_ctr, 0);
-	init_waitqueue_head(&brw->write_waitq);
+	sem->state = readers_slow;
+	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	init_waitqueue_head(&sem->writer);
+	__init_rwsem(&sem->rw_sem, name, rwsem_key);
+
 	return 0;
 }
 
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 {
-	free_percpu(brw->fast_read_ctr);
-	brw->fast_read_ctr = NULL; /* catch use after free bugs */
+	rcu_sync_dtor(&sem->rss);
+	free_percpu(sem->refcount);
+	sem->refcount = NULL; /* catch use after free bugs */
 }
 
-/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- *	R_W: down_write() comes after up_read(), the writer should see all
- *	     changes done by the reader
- * or
- *	W_R: down_read() comes after up_write(), the reader should see all
- *	     changes done by the writer
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+void __percpu_down_read(struct percpu_rw_semaphore *sem)
 {
-	bool success = false;
+	/*
+	 * Due to having preemption disabled the decrement happens on
+	 * the same CPU as the increment, avoiding the
+	 * increment-on-one-CPU-and-decrement-on-another problem.
+	 *
+	 * And yes, if the reader misses the writer's assignment of
+	 * readers_block to sem->state, then the writer is
+	 * guaranteed to see the reader's increment.  Conversely, any
+	 * readers that increment their sem->refcount after the
+	 * writer looks are guaranteed to see the readers_block value,
+	 * which in turn means that they are guaranteed to immediately
+	 * decrement their sem->refcount, so that it doesn't matter
+	 * that the writer missed them.
+	 */
+
+	smp_mb(); /* A matches D */
+
+	/*
+	 * If !readers_block the critical section starts here, matched by the
+	 * release in percpu_up_write().
+	 */
+	if (likely(smp_load_acquire(&sem->state) != readers_block))
+		return;
+
+	/*
+	 * Per the above comment; we still have preemption disabled and
+	 * will thus decrement on the same CPU as we incremented.
+	 */
+	__percpu_up_read(sem);
+
+	/*
+	 * We either call schedule() in the wait, or we'll fall through
+	 * and reschedule on the preempt_enable() in percpu_down_read().
+	 */
+	preempt_enable_no_resched();
+
+	/*
+	 * Avoid lockdep for the down/up_read(); we already have them.
+	 */
+	__down_read(&sem->rw_sem);
+	__this_cpu_inc(*sem->refcount);
+	__up_read(&sem->rw_sem);
 
 	preempt_disable();
-	if (likely(!atomic_read(&brw->write_ctr))) {
-		__this_cpu_add(*brw->fast_read_ctr, val);
-		success = true;
-	}
-	preempt_enable();
+}
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	smp_mb(); /* B matches C */
+	/*
+	 * In other words, if they see our decrement (presumably to aggregate
+	 * zero, as that is the only time it matters) they will also see our
+	 * critical section.
+	 */
+	this_cpu_dec(*sem->refcount);
 
-	return success;
+	/* Prod writer to recheck readers_active */
+	wake_up(&sem->writer);
 }
 
+
+#define per_cpu_sum(var)						\
+({									\
+	typeof(var) __sum = 0;						\
+	int cpu;							\
+	for_each_possible_cpu(cpu)					\
+		__sum += per_cpu(var, cpu);				\
+	__sum;								\
+})
+
 /*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
+ * Return true if the modular sum of the sem->refcount per-CPU variable is
+ * zero.  If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
  */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
-	might_sleep();
-	if (likely(update_fast_ctr(brw, +1))) {
-		rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
-		return;
-	}
+	if (per_cpu_sum(*sem->refcount) != 0)
+		return false;
+
+	/*
+	 * If we observed the decrement; ensure we see the entire critical
+	 * section.
+	 */
+
+	smp_mb(); /* C matches B */
 
-	down_read(&brw->rw_sem);
-	atomic_inc(&brw->slow_read_ctr);
-	/* avoid up_read()->rwsem_release() */
-	__up_read(&brw->rw_sem);
+	return true;
 }
 
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
-	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+	down_write(&sem->rw_sem);
 
-	if (likely(update_fast_ctr(brw, -1)))
-		return;
+	/* Notify readers to take the slow path. */
+	rcu_sync_enter(&sem->rss);
 
-	/* false-positive is possible but harmless */
-	if (atomic_dec_and_test(&brw->slow_read_ctr))
-		wake_up_all(&brw->write_waitq);
-}
+	/*
+	 * Notify new readers to block; up until now, and thus throughout the
+	 * longish rcu_sync_enter() above, new readers could still come in.
+	 */
+	sem->state = readers_block;
 
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
-	unsigned int sum = 0;
-	int cpu;
+	smp_mb(); /* D matches A */
 
-	for_each_possible_cpu(cpu) {
-		sum += per_cpu(*brw->fast_read_ctr, cpu);
-		per_cpu(*brw->fast_read_ctr, cpu) = 0;
-	}
+	/*
+	 * If they don't see our write of readers_block to sem->state,
+	 * then we are guaranteed to see their sem->refcount increment, and
+	 * therefore will wait for them.
+	 */
 
-	return sum;
+	/* Wait for all now active readers to complete. */
+	wait_event(sem->writer, readers_active_check(sem));
 }
 
-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
-	/* tell update_fast_ctr() there is a pending writer */
-	atomic_inc(&brw->write_ctr);
 	/*
-	 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
-	 *    so that update_fast_ctr() can't succeed.
+	 * Signal the writer is done, no fast path yet.
 	 *
-	 * 2. Ensures we see the result of every previous this_cpu_add() in
-	 *    update_fast_ctr().
+	 * One reason that we cannot just immediately flip to readers_fast is
+	 * that new readers might fail to see the results of this writer's
+	 * critical section.
 	 *
-	 * 3. Ensures that if any reader has exited its critical section via
-	 *    fast-path, it executes a full memory barrier before we return.
-	 *    See R_W case in the comment above update_fast_ctr().
+	 * Therefore we force it through the slow path which guarantees an
+	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	synchronize_sched_expedited();
+	smp_store_release(&sem->state, readers_slow);
 
-	/* exclude other writers, and block the new readers completely */
-	down_write(&brw->rw_sem);
-
-	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
-	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
-
-	/* wait for all readers to complete their percpu_up_read() */
-	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
-}
+	/*
+	 * Release the write lock, this will allow readers back in the game.
+	 */
+	up_write(&sem->rw_sem);
 
-void percpu_up_write(struct percpu_rw_semaphore *brw)
-{
-	/* release the lock, but the readers can't use the fast-path */
-	up_write(&brw->rw_sem);
 	/*
-	 * Insert the barrier before the next fast-path in down_read,
-	 * see W_R case in the comment above update_fast_ctr().
+	 * Once this completes (at least one RCU grace period hence) the reader
+	 * fast path will be available again. Safe to use outside the exclusive
+	 * write lock because it's counting.
 	 */
-	synchronize_sched_expedited();
-	/* the last writer unblocks update_fast_ctr() */
-	atomic_dec(&brw->write_ctr);
+	rcu_sync_exit(&sem->rss);
 }



* [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock()
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (4 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 23:08   ` Oleg Nesterov
  2015-06-22 12:16 ` [RFC][PATCH 07/13] sched: Reorder task_struct Peter Zijlstra
                   ` (10 subsequent siblings)
  16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-percpu-rwsem-trydown.patch --]
[-- Type: text/plain, Size: 1711 bytes --]

Provide percpu_down_read_trylock(), the non-blocking variant of
percpu_down_read().
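
A hedged usage sketch (my_sem and do_work() are illustrative names, not from
this patch):

        if (percpu_down_read_trylock(&my_sem)) {
                do_work();                      /* read-side critical section */
                percpu_up_read(&my_sem);
        } else {
                /* a writer is active or pending; do not block */
        }
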
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h  |   17 +++++++++++++++++
 kernel/locking/percpu-rwsem.c |   12 ++++++++++++
 2 files changed, 29 insertions(+)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -17,6 +17,7 @@ struct percpu_rw_semaphore {
 };
 
 extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -45,6 +46,22 @@ static inline void percpu_down_read(stru
 	 */
 }
 
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+	bool ret = true;
+
+	preempt_disable();
+	__this_cpu_inc(*sem->refcount);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		ret = __percpu_down_read_trylock(sem);
+	preempt_enable();
+
+	if (ret)
+		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+
+	return ret;
+}
+
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -80,6 +80,18 @@ void __percpu_down_read(struct percpu_rw
 	preempt_disable();
 }
 
+bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+{
+	smp_mb(); /* A matches D */
+
+	if (likely(smp_load_acquire(&sem->state) != readers_block))
+		return true;
+
+	__percpu_up_read(sem);
+
+	return false;
+}
+
 void __percpu_up_read(struct percpu_rw_semaphore *sem)
 {
 	smp_mb(); /* B matches C */



* [RFC][PATCH 07/13] sched: Reorder task_struct
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (5 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM Peter Zijlstra
                   ` (9 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-sched-reorder.patch --]
[-- Type: text/plain, Size: 850 bytes --]

Fill some 4-byte holes by slightly reordering some variables.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/sched.h |    5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1363,17 +1363,16 @@ struct task_struct {
 	atomic_t usage;
 	unsigned int flags;	/* per process flags, defined below */
 	unsigned int ptrace;
+	int on_rq;
 
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
 	int on_cpu;
+	int wake_cpu;
 	struct task_struct *last_wakee;
 	unsigned long wakee_flips;
 	unsigned long wakee_flip_decay_ts;
-
-	int wake_cpu;
 #endif
-	int on_rq;
 
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;



* [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (6 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 07/13] sched: Reorder task_struct Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
                   ` (8 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-percpu-rwsem-static-init.patch --]
[-- Type: text/plain, Size: 1160 bytes --]

Provide DEFINE_STATIC_PERCPU_RWSEM() to statically define and initialize a
percpu-rwsem, along with a lockdep assertion helper.
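
A hedged usage sketch (my_rwsem and reader() are illustrative names):

        /* no runtime percpu_init_rwsem()/alloc_percpu() needed */
        DEFINE_STATIC_PERCPU_RWSEM(my_rwsem);

        void reader(void)
        {
                percpu_down_read(&my_rwsem);
                lockdep_assert_held_percpu_rwsem(&my_rwsem);
                /* ... */
                percpu_up_read(&my_rwsem);
        }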

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h |   13 +++++++++++++
 1 file changed, 13 insertions(+)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -16,6 +16,19 @@ struct percpu_rw_semaphore {
 	struct rw_semaphore	rw_sem;
 };
 
+#define DEFINE_STATIC_PERCPU_RWSEM(name)				\
+static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name);	\
+static struct percpu_rw_semaphore name = {				\
+	.refcount = &__percpu_rwsem_refcount_##name,			\
+	.state = 0,							\
+	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
+	.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),		\
+	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
+}
+
+#define lockdep_assert_held_percpu_rwsem(sem)				\
+	lockdep_assert_held(&(sem)->rw_sem)
+
 extern void __percpu_down_read(struct percpu_rw_semaphore *);
 extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);



* [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (7 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 22:57   ` Oleg Nesterov
  2015-06-22 12:16 ` [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem Peter Zijlstra
                   ` (7 subsequent siblings)
  16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-hotplug-rwsem.patch --]
[-- Type: text/plain, Size: 8424 bytes --]

The cpu hotplug lock is an rwsem with read-in-write and read-in-read
recursion. Implement it as such, on top of a percpu-rwsem plus a per-task
recursion count.
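
A hedged sketch of the two recursion cases this keeps working (the call
sequences are illustrative, not actual call sites):

        /* read-in-write recursion: code running under the hotplug writer */
        cpu_hotplug_begin();    /* percpu_down_write(); cpu_hotplug.writer = current */
        get_online_cpus();      /* writer == current -> returns immediately */
        put_online_cpus();
        cpu_hotplug_done();

        /* read-in-read recursion: nested readers in one task */
        get_online_cpus();      /* cpuhp_ref was 0 -> takes the percpu-rwsem */
        get_online_cpus();      /* cpuhp_ref++ only */
        put_online_cpus();      /* cpuhp_ref-- only */
        put_online_cpus();      /* drops the percpu-rwsem */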

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/cpu.h          |    6 +
 include/linux/percpu-rwsem.h |   10 ++-
 include/linux/sched.h        |    4 +
 init/main.c                  |    1 
 kernel/cpu.c                 |  133 +++++++++++++------------------------------
 kernel/fork.c                |    2 
 lib/Kconfig                  |    5 +
 7 files changed, 66 insertions(+), 95 deletions(-)

--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -224,6 +224,9 @@ extern struct bus_type cpu_subsys;
 #ifdef CONFIG_HOTPLUG_CPU
 /* Stop CPUs going up and down. */
 
+extern void cpu_hotplug_init(void);
+extern void cpu_hotplug_init_task(struct task_struct *p);
+
 extern void cpu_hotplug_begin(void);
 extern void cpu_hotplug_done(void);
 extern void get_online_cpus(void);
@@ -242,6 +245,9 @@ int cpu_down(unsigned int cpu);
 
 #else		/* CONFIG_HOTPLUG_CPU */
 
+static inline void cpu_hotplug_init(void) {}
+static inline void cpu_hotplug_init_task(struct task_struct *p) {}
+
 static inline void cpu_hotplug_begin(void) {}
 static inline void cpu_hotplug_done(void) {}
 #define get_online_cpus()	do { } while (0)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -20,12 +20,10 @@ extern void __percpu_down_read(struct pe
 extern bool __percpu_down_read_trylock(struct percpu_rw_semaphore *);
 extern void __percpu_up_read(struct percpu_rw_semaphore *);
 
-static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 
-	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
-
 	preempt_disable();
 	/*
 	 * We are in an RCU-sched read-side critical section, so the writer
@@ -46,6 +44,12 @@ static inline void percpu_down_read(stru
 	 */
 }
 
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	_percpu_down_read(sem);
+}
+
 static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
 {
 	bool ret = true;
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,10 @@ struct task_struct {
 	unsigned int btrace_seq;
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+	int cpuhp_ref;
+#endif
+
 	unsigned int policy;
 	int nr_cpus_allowed;
 	cpumask_t cpus_allowed;
--- a/init/main.c
+++ b/init/main.c
@@ -588,6 +588,7 @@ asmlinkage __visible void __init start_k
 	sched_clock_postinit();
 	perf_event_init();
 	profile_init();
+	cpu_hotplug_init();
 	call_function_init();
 	WARN(!irqs_disabled(), "Interrupts were enabled early\n");
 	early_boot_irqs_disabled = false;
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,6 +22,7 @@
 #include <linux/lockdep.h>
 #include <linux/tick.h>
 #include <trace/events/power.h>
+#include <linux/percpu-rwsem.h>
 
 #include "smpboot.h"
 
@@ -50,7 +51,8 @@ EXPORT_SYMBOL(cpu_notifier_register_done
 
 static RAW_NOTIFIER_HEAD(cpu_chain);
 
-/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+/*
+ * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
  * Should always be manipulated under cpu_add_remove_lock
  */
 static int cpu_hotplug_disabled;
@@ -58,126 +60,72 @@ static int cpu_hotplug_disabled;
 #ifdef CONFIG_HOTPLUG_CPU
 
 static struct {
-	struct task_struct *active_writer;
-	/* wait queue to wake up the active_writer */
-	wait_queue_head_t wq;
-	/* verifies that no writer will get active while readers are active */
-	struct mutex lock;
-	/*
-	 * Also blocks the new readers during
-	 * an ongoing cpu hotplug operation.
-	 */
-	atomic_t refcount;
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-} cpu_hotplug = {
-	.active_writer = NULL,
-	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(cpu_hotplug.wq),
-	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	.dep_map = {.name = "cpu_hotplug.lock" },
-#endif
-};
-
-/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
-#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire_tryread() \
-				  lock_map_acquire_tryread(&cpu_hotplug.dep_map)
-#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
-#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)
+	struct percpu_rw_semaphore	rwsem;
+	struct task_struct		*writer;
+} cpu_hotplug = { .writer = &init_task, };
+
+void cpu_hotplug_init(void)
+{
+	percpu_init_rwsem(&cpu_hotplug.rwsem);
+	cpu_hotplug.writer = NULL;
+}
 
+void cpu_hotplug_init_task(struct task_struct *p)
+{
+	p->cpuhp_ref = 0;
+}
 
 void get_online_cpus(void)
 {
 	might_sleep();
-	if (cpu_hotplug.active_writer == current)
+
+	/* read in write recursion */
+	if (cpu_hotplug.writer == current)
+		return;
+
+	/* read in read recursion */
+	if (current->cpuhp_ref++)
 		return;
-	cpuhp_lock_acquire_read();
-	mutex_lock(&cpu_hotplug.lock);
-	atomic_inc(&cpu_hotplug.refcount);
-	mutex_unlock(&cpu_hotplug.lock);
+
+	lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
+	_percpu_down_read(&cpu_hotplug.rwsem);
 }
 EXPORT_SYMBOL_GPL(get_online_cpus);
 
 bool try_get_online_cpus(void)
 {
-	if (cpu_hotplug.active_writer == current)
+	if (cpu_hotplug.writer == current)
 		return true;
-	if (!mutex_trylock(&cpu_hotplug.lock))
-		return false;
-	cpuhp_lock_acquire_tryread();
-	atomic_inc(&cpu_hotplug.refcount);
-	mutex_unlock(&cpu_hotplug.lock);
-	return true;
+
+	if (current->cpuhp_ref++)
+		return true;
+
+	return percpu_down_read_trylock(&cpu_hotplug.rwsem);
 }
 EXPORT_SYMBOL_GPL(try_get_online_cpus);
 
 void put_online_cpus(void)
 {
-	int refcount;
-
-	if (cpu_hotplug.active_writer == current)
+	if (cpu_hotplug.writer == current)
 		return;
 
-	refcount = atomic_dec_return(&cpu_hotplug.refcount);
-	if (WARN_ON(refcount < 0)) /* try to fix things up */
-		atomic_inc(&cpu_hotplug.refcount);
-
-	if (refcount <= 0 && waitqueue_active(&cpu_hotplug.wq))
-		wake_up(&cpu_hotplug.wq);
-
-	cpuhp_lock_release();
+	if (--current->cpuhp_ref)
+		return;
 
+	percpu_up_read(&cpu_hotplug.rwsem);
 }
 EXPORT_SYMBOL_GPL(put_online_cpus);
 
-/*
- * This ensures that the hotplug operation can begin only when the
- * refcount goes to zero.
- *
- * Note that during a cpu-hotplug operation, the new readers, if any,
- * will be blocked by the cpu_hotplug.lock
- *
- * Since cpu_hotplug_begin() is always called after invoking
- * cpu_maps_update_begin(), we can be sure that only one writer is active.
- *
- * Note that theoretically, there is a possibility of a livelock:
- * - Refcount goes to zero, last reader wakes up the sleeping
- *   writer.
- * - Last reader unlocks the cpu_hotplug.lock.
- * - A new reader arrives at this moment, bumps up the refcount.
- * - The writer acquires the cpu_hotplug.lock finds the refcount
- *   non zero and goes to sleep again.
- *
- * However, this is very difficult to achieve in practice since
- * get_online_cpus() not an api which is called all that often.
- *
- */
 void cpu_hotplug_begin(void)
 {
-	DEFINE_WAIT(wait);
-
-	cpu_hotplug.active_writer = current;
-	cpuhp_lock_acquire();
-
-	for (;;) {
-		mutex_lock(&cpu_hotplug.lock);
-		prepare_to_wait(&cpu_hotplug.wq, &wait, TASK_UNINTERRUPTIBLE);
-		if (likely(!atomic_read(&cpu_hotplug.refcount)))
-				break;
-		mutex_unlock(&cpu_hotplug.lock);
-		schedule();
-	}
-	finish_wait(&cpu_hotplug.wq, &wait);
+	percpu_down_write(&cpu_hotplug.rwsem);
+	cpu_hotplug.writer = current;
 }
 
 void cpu_hotplug_done(void)
 {
-	cpu_hotplug.active_writer = NULL;
-	mutex_unlock(&cpu_hotplug.lock);
-	cpuhp_lock_release();
+	cpu_hotplug.writer = NULL;
+	percpu_up_write(&cpu_hotplug.rwsem);
 }
 
 /*
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
 	p->sequential_io_avg	= 0;
 #endif
 
+	cpu_hotplug_init_task(p);
+
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	retval = sched_fork(clone_flags, p);
 	if (retval)
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -56,6 +56,11 @@ config STMP_DEVICE
 config PERCPU_RWSEM
 	bool
 
+config PERCPU_RWSEM_HOTPLUG
+	def_bool y
+	depends on HOTPLUG_CPU
+	select PERCPU_RWSEM
+
 config ARCH_USE_CMPXCHG_LOCKREF
 	bool
 



* [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (8 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
                   ` (6 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-fslocks-rwsem.patch --]
[-- Type: text/plain, Size: 5096 bytes --]

Replace the global part of the lglock with a percpu-rwsem.

Since flc_lock is a spinlock and itself nests under i_lock, which too
is a spinlock, we cannot acquire sleeping locks at
locks_{insert,remove}_global_locks().

We can however wrap all flc_lock acquisitions with percpu_down_read()
such that all invocations of locks_{insert,remove}_global_locks() have
that read lock held.

This allows us to replace the lg_global part of the lglock with the
write side of the rwsem.

In the absence of writers, percpu_{down,up}_read() are free of atomic
instructions. This further avoids the very long preempt-disable
regions caused by lglock on larger machines.
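
A hedged sketch of the resulting nesting (the surrounding code is illustrative;
the real call sites are in the diff below):

        percpu_down_read(&file_rwsem);          /* sleeping lock, taken first */
        spin_lock(&ctx->flc_lock);
        locks_insert_global_locks(fl);          /* asserts file_rwsem is held */
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        /* the /proc/locks iterator takes the write side to exclude all of the above */
        percpu_down_write(&file_rwsem);
        /* ... walk the per-cpu file_lock_list ... */
        percpu_up_write(&file_rwsem);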

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/locks.c |   18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

--- a/fs/locks.c
+++ b/fs/locks.c
@@ -165,6 +165,7 @@ int lease_break_time = 45;
  */
 DEFINE_STATIC_LGLOCK(file_lock_lglock);
 static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+static struct percpu_rw_semaphore file_rwsem;
 
 /*
  * The blocked_hash is used to find POSIX lock loops for deadlock detection.
@@ -556,6 +557,8 @@ static int posix_same_owner(struct file_
 /* Must be called with the flc_lock held! */
 static void locks_insert_global_locks(struct file_lock *fl)
 {
+	lockdep_assert_held_percpu_rwsem(&file_rwsem);
+
 	lg_local_lock(&file_lock_lglock);
 	fl->fl_link_cpu = smp_processor_id();
 	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
@@ -565,6 +568,8 @@ static void locks_insert_global_locks(st
 /* Must be called with the flc_lock held! */
 static void locks_delete_global_locks(struct file_lock *fl)
 {
+	lockdep_assert_held_percpu_rwsem(&file_rwsem);
+
 	/*
 	 * Avoid taking lock if already unhashed. This is safe since this check
 	 * is done while holding the flc_lock, and new insertions into the list
@@ -885,6 +890,7 @@ static int flock_lock_file(struct file *
 			return -ENOMEM;
 	}
 
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
@@ -925,6 +931,7 @@ static int flock_lock_file(struct file *
 
 out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	locks_dispose_list(&dispose);
@@ -960,6 +967,7 @@ static int __posix_lock_file(struct inod
 		new_fl2 = locks_alloc_lock();
 	}
 
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	/*
 	 * New lock request. Walk all POSIX locks and look for conflicts. If
@@ -1131,6 +1139,7 @@ static int __posix_lock_file(struct inod
 	}
  out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1407,6 +1416,7 @@ int __break_lease(struct inode *inode, u
 		return error;
 	}
 
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 
 	time_out_leases(inode, &dispose);
@@ -1477,6 +1487,7 @@ int __break_lease(struct inode *inode, u
 	}
 out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	locks_free_lock(new_fl);
 	return error;
@@ -1630,6 +1641,7 @@ generic_add_lease(struct file *filp, lon
 		return -EINVAL;
 	}
 
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
 	error = check_conflicting_open(dentry, arg, lease->fl_flags);
@@ -1700,6 +1712,7 @@ generic_add_lease(struct file *filp, lon
 		lease->fl_lmops->lm_setup(lease, priv);
 out:
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	if (is_deleg)
 		mutex_unlock(&inode->i_mutex);
@@ -1722,6 +1735,7 @@ static int generic_delete_lease(struct f
 		return error;
 	}
 
+	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
 		if (fl->fl_file == filp &&
@@ -1734,6 +1748,7 @@ static int generic_delete_lease(struct f
 	if (victim)
 		error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
+	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 	return error;
 }
@@ -2634,6 +2649,7 @@ static void *locks_start(struct seq_file
 	struct locks_iterator *iter = f->private;
 
 	iter->li_pos = *pos + 1;
+	percpu_down_write(&file_rwsem);
 	lg_global_lock(&file_lock_lglock);
 	spin_lock(&blocked_lock_lock);
 	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
@@ -2652,6 +2668,7 @@ static void locks_stop(struct seq_file *
 {
 	spin_unlock(&blocked_lock_lock);
 	lg_global_unlock(&file_lock_lglock);
+	percpu_up_write(&file_rwsem);
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2693,6 +2710,7 @@ static int __init filelock_init(void)
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
 	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
+	percpu_init_rwsem(&file_rwsem);
 
 	for_each_possible_cpu(i)
 		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));



^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (9 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-23  0:19   ` Oleg Nesterov
  2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
                   ` (5 subsequent siblings)
  16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-fslocks-remove-lg.patch --]
[-- Type: text/plain, Size: 4580 bytes --]

As Oleg suggested, replace file_lock_list with a structure containing
the hlist head and a spinlock.

This completely removes the lglock from fs/locks.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 fs/Kconfig |    1 +
 fs/locks.c |   47 +++++++++++++++++++++++++++++------------------
 2 files changed, 30 insertions(+), 18 deletions(-)

--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -65,6 +65,7 @@ config EXPORTFS
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
 	default y
+	select PERCPU_RWSEM
 	help
 	  This option enables standard file locking support, required
           for filesystems like NFS and for the flock() system
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -128,7 +128,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/hashtable.h>
 #include <linux/percpu.h>
-#include <linux/lglock.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/filelock.h>
@@ -159,12 +158,17 @@ int lease_break_time = 45;
 
 /*
  * The global file_lock_list is only used for displaying /proc/locks, so we
- * keep a list on each CPU, with each list protected by its own spinlock via
- * the file_lock_lglock. Note that alterations to the list also require that
- * the relevant flc_lock is held.
+ * keep a list on each CPU, with each list protected by its own spinlock.
+ * Global serialization is done using file_rwsem.
+ *
+ * Note that alterations to the list also require that the relevant flc_lock is
+ * held.
  */
-DEFINE_STATIC_LGLOCK(file_lock_lglock);
-static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
+struct file_lock_list_struct {
+	spinlock_t		lock;
+	struct hlist_head	hlist;
+};
+static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
 static struct percpu_rw_semaphore file_rwsem;
 
 /*
@@ -557,17 +561,21 @@ static int posix_same_owner(struct file_
 /* Must be called with the flc_lock held! */
 static void locks_insert_global_locks(struct file_lock *fl)
 {
+	struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
+
 	lockdep_assert_held_percpu_rwsem(&file_rwsem);
 
-	lg_local_lock(&file_lock_lglock);
+	spin_lock(&fll->lock);
 	fl->fl_link_cpu = smp_processor_id();
-	hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
-	lg_local_unlock(&file_lock_lglock);
+	hlist_add_head(&fl->fl_link, &fll->hlist);
+	spin_unlock(&fll->lock);
 }
 
 /* Must be called with the flc_lock held! */
 static void locks_delete_global_locks(struct file_lock *fl)
 {
+	struct file_lock_list_struct *fll;
+
 	lockdep_assert_held_percpu_rwsem(&file_rwsem);
 
 	/*
@@ -577,9 +585,11 @@ static void locks_delete_global_locks(st
 	 */
 	if (hlist_unhashed(&fl->fl_link))
 		return;
-	lg_local_lock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+
+	fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+	spin_lock(&fll->lock);
 	hlist_del_init(&fl->fl_link);
-	lg_local_unlock_cpu(&file_lock_lglock, fl->fl_link_cpu);
+	spin_unlock(&fll->lock);
 }
 
 static unsigned long
@@ -2650,9 +2660,8 @@ static void *locks_start(struct seq_file
 
 	iter->li_pos = *pos + 1;
 	percpu_down_write(&file_rwsem);
-	lg_global_lock(&file_lock_lglock);
 	spin_lock(&blocked_lock_lock);
-	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
+	return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
@@ -2660,14 +2669,13 @@ static void *locks_next(struct seq_file
 	struct locks_iterator *iter = f->private;
 
 	++iter->li_pos;
-	return seq_hlist_next_percpu(v, &file_lock_list, &iter->li_cpu, pos);
+	return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
 	__releases(&blocked_lock_lock)
 {
 	spin_unlock(&blocked_lock_lock);
-	lg_global_unlock(&file_lock_lglock);
 	percpu_up_write(&file_rwsem);
 }
 
@@ -2709,11 +2717,14 @@ static int __init filelock_init(void)
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
-	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
 	percpu_init_rwsem(&file_rwsem);
 
-	for_each_possible_cpu(i)
-		INIT_HLIST_HEAD(per_cpu_ptr(&file_lock_list, i));
+	for_each_possible_cpu(i) {
+		struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
+
+		spin_lock_init(&fll->lock);
+		INIT_HLIST_HEAD(&fll->hlist);
+	}
 
 	return 0;
 }



^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (10 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 22:21   ` Oleg Nesterov
  2015-06-22 12:16 ` [RFC][PATCH 13/13] locking: " Peter Zijlstra
                   ` (4 subsequent siblings)
  16 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-stop_machine-lg-removal.patch --]
[-- Type: text/plain, Size: 4198 bytes --]

We can replace both the global and local part of the lglock by better
usage of cpu_stopper::lock.

By having stop_two_cpus() acquire two cpu_stopper::locks we gain full
order against the global stop_machine which takes each of these locks
in order.

Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/stop_machine.c |   52 ++++++++++++++++++++++++++++----------------------
 lib/Kconfig           |    5 ++++
 2 files changed, 35 insertions(+), 22 deletions(-)

--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,7 +20,6 @@
 #include <linux/kallsyms.h>
 #include <linux/smpboot.h>
 #include <linux/atomic.h>
-#include <linux/lglock.h>
 
 /*
  * Structure to determine completion condition and record errors.  May
@@ -44,14 +43,6 @@ static DEFINE_PER_CPU(struct cpu_stopper
 static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
 static bool stop_machine_initialized = false;
 
-/*
- * Avoids a race between stop_two_cpus and global stop_cpus, where
- * the stoppers could get queued up in reverse order, leading to
- * system deadlock. Using an lglock means stop_two_cpus remains
- * relatively cheap.
- */
-DEFINE_STATIC_LGLOCK(stop_cpus_lock);
-
 static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
 {
 	memset(done, 0, sizeof(*done));
@@ -71,21 +62,26 @@ static void cpu_stop_signal_done(struct
 }
 
 /* queue @work to @stopper.  if offline, @work is completed immediately */
-static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+static void __cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
 	struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
 
-	unsigned long flags;
-
-	spin_lock_irqsave(&stopper->lock, flags);
-
 	if (stopper->enabled) {
 		list_add_tail(&work->list, &stopper->works);
 		wake_up_process(p);
-	} else
+	} else {
 		cpu_stop_signal_done(work->done, false);
+	}
+}
 
+static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
+{
+	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&stopper->lock, flags);
+	__cpu_stop_queue_work(cpu, work);
 	spin_unlock_irqrestore(&stopper->lock, flags);
 }
 
@@ -224,9 +220,14 @@ static int multi_cpu_stop(void *data)
  */
 int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
 {
-	struct cpu_stop_done done;
+	struct cpu_stopper *stopper1, *stopper2;
 	struct cpu_stop_work work1, work2;
 	struct multi_stop_data msdata;
+	struct cpu_stop_done done;
+	unsigned long flags;
+
+	if (cpu2 < cpu1)
+		swap(cpu1, cpu2);
 
 	preempt_disable();
 	msdata = (struct multi_stop_data){
@@ -258,10 +259,17 @@ int stop_two_cpus(unsigned int cpu1, uns
 		return -ENOENT;
 	}
 
-	lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
-	cpu_stop_queue_work(cpu1, &work1);
-	cpu_stop_queue_work(cpu2, &work2);
-	lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
+	stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
+	stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
+
+	spin_lock_irqsave(&stopper1->lock, flags);
+	spin_lock(&stopper2->lock);
+
+	__cpu_stop_queue_work(cpu1, &work1);
+	__cpu_stop_queue_work(cpu2, &work2);
+
+	spin_unlock(&stopper2->lock);
+	spin_unlock_irqrestore(&stopper1->lock, flags);
 
 	preempt_enable();
 
@@ -315,10 +323,10 @@ static void queue_stop_cpus_work(const s
 	 * preempted by a stopper which might wait for other stoppers
 	 * to enter @fn which can lead to deadlock.
 	 */
-	lg_global_lock(&stop_cpus_lock);
+	preempt_disable();
 	for_each_cpu(cpu, cpumask)
 		cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
-	lg_global_unlock(&stop_cpus_lock);
+	preempt_enable();
 }
 
 static int __stop_cpus(const struct cpumask *cpumask,
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -61,6 +61,11 @@ config PERCPU_RWSEM_HOTPLUG
 	depends on HOTPLUG_CPU
 	select PERCPU_RWSEM
 
+config PERCPU_RWSEM_SMP
+	def_bool y
+	depends on SMP
+	select PERCPU_RWSEM
+
 config ARCH_USE_CMPXCHG_LOCKREF
 	bool
 



^ permalink raw reply	[flat|nested] 106+ messages in thread

* [RFC][PATCH 13/13] locking: Remove lglock
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (11 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
@ 2015-06-22 12:16 ` Peter Zijlstra
  2015-06-22 12:36 ` [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (3 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:16 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, riel, viro, torvalds

[-- Attachment #1: peterz-remove_lglock.patch --]
[-- Type: text/plain, Size: 13502 bytes --]

Since there are no users left of this primitive, make it go away.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 Documentation/locking/lglock.txt |  166 ---------------------------------------
 fs/file_table.c                  |    1 
 include/linux/lglock.h           |   81 -------------------
 kernel/locking/Makefile          |    1 
 kernel/locking/lglock.c          |  111 --------------------------
 5 files changed, 360 deletions(-)

--- a/Documentation/locking/lglock.txt
+++ /dev/null
@@ -1,166 +0,0 @@
-lglock - local/global locks for mostly local access patterns
-------------------------------------------------------------
-
-Origin: Nick Piggin's VFS scalability series introduced during
-	2.6.35++ [1] [2]
-Location: kernel/locking/lglock.c
-	include/linux/lglock.h
-Users: currently only the VFS and stop_machine related code
-
-Design Goal:
-------------
-
-Improve scalability of globally used large data sets that are
-distributed over all CPUs as per_cpu elements.
-
-To manage global data structures that are partitioned over all CPUs
-as per_cpu elements but can be mostly handled by CPU local actions
-lglock will be used where the majority of accesses are cpu local
-reading and occasional cpu local writing with very infrequent
-global write access.
-
-
-* deal with things locally whenever possible
-	- very fast access to the local per_cpu data
-	- reasonably fast access to specific per_cpu data on a different
-	  CPU
-* while making global action possible when needed
-	- by expensive access to all CPUs locks - effectively
-	  resulting in a globally visible critical section.
-
-Design:
--------
-
-Basically it is an array of per_cpu spinlocks with the
-lg_local_lock/unlock accessing the local CPUs lock object and the
-lg_local_lock_cpu/unlock_cpu accessing a remote CPUs lock object
-the lg_local_lock has to disable preemption as migration protection so
-that the reference to the local CPUs lock does not go out of scope.
-Due to the lg_local_lock/unlock only touching cpu-local resources it
-is fast. Taking the local lock on a different CPU will be more
-expensive but still relatively cheap.
-
-One can relax the migration constraints by acquiring the current
-CPUs lock with lg_local_lock_cpu, remember the cpu, and release that
-lock at the end of the critical section even if migrated. This should
-give most of the performance benefits without inhibiting migration
-though needs careful considerations for nesting of lglocks and
-consideration of deadlocks with lg_global_lock.
-
-The lg_global_lock/unlock locks all underlying spinlocks of all
-possible CPUs (including those off-line). The preemption disable/enable
-are needed in the non-RT kernels to prevent deadlocks like:
-
-                     on cpu 1
-
-              task A          task B
-         lg_global_lock
-           got cpu 0 lock
-                 <<<< preempt <<<<
-                         lg_local_lock_cpu for cpu 0
-                           spin on cpu 0 lock
-
-On -RT this deadlock scenario is resolved by the arch_spin_locks in the
-lglocks being replaced by rt_mutexes which resolve the above deadlock
-by boosting the lock-holder.
-
-
-Implementation:
----------------
-
-The initial lglock implementation from Nick Piggin used some complex
-macros to generate the lglock/brlock in lglock.h - they were later
-turned into a set of functions by Andi Kleen [7]. The change to functions
-was motivated by the presence of multiple lock users and also by them
-being easier to maintain than the generating macros. This change to
-functions is also the basis to eliminated the restriction of not
-being initializeable in kernel modules (the remaining problem is that
-locks are not explicitly initialized - see lockdep-design.txt)
-
-Declaration and initialization:
--------------------------------
-
-  #include <linux/lglock.h>
-
-  DEFINE_LGLOCK(name)
-  or:
-  DEFINE_STATIC_LGLOCK(name);
-
-  lg_lock_init(&name, "lockdep_name_string");
-
-  on UP this is mapped to DEFINE_SPINLOCK(name) in both cases, note
-  also that as of 3.18-rc6 all declaration in use are of the _STATIC_
-  variant (and it seems that the non-static was never in use).
-  lg_lock_init is initializing the lockdep map only.
-
-Usage:
-------
-
-From the locking semantics it is a spinlock. It could be called a
-locality aware spinlock. lg_local_* behaves like a per_cpu
-spinlock and lg_global_* like a global spinlock.
-No surprises in the API.
-
-  lg_local_lock(*lglock);
-     access to protected per_cpu object on this CPU
-  lg_local_unlock(*lglock);
-
-  lg_local_lock_cpu(*lglock, cpu);
-     access to protected per_cpu object on other CPU cpu
-  lg_local_unlock_cpu(*lglock, cpu);
-
-  lg_global_lock(*lglock);
-     access all protected per_cpu objects on all CPUs
-  lg_global_unlock(*lglock);
-
-  There are no _trylock variants of the lglocks.
-
-Note that the lg_global_lock/unlock has to iterate over all possible
-CPUs rather than the actually present CPUs or a CPU could go off-line
-with a held lock [4] and that makes it very expensive. A discussion on
-these issues can be found at [5]
-
-Constraints:
-------------
-
-  * currently the declaration of lglocks in kernel modules is not
-    possible, though this should be doable with little change.
-  * lglocks are not recursive.
-  * suitable for code that can do most operations on the CPU local
-    data and will very rarely need the global lock
-  * lg_global_lock/unlock is *very* expensive and does not scale
-  * on UP systems all lg_* primitives are simply spinlocks
-  * in PREEMPT_RT the spinlock becomes an rt-mutex and can sleep but
-    does not change the tasks state while sleeping [6].
-  * in PREEMPT_RT the preempt_disable/enable in lg_local_lock/unlock
-    is downgraded to a migrate_disable/enable, the other
-    preempt_disable/enable are downgraded to barriers [6].
-    The deadlock noted for non-RT above is resolved due to rt_mutexes
-    boosting the lock-holder in this case which arch_spin_locks do
-    not do.
-
-lglocks were designed for very specific problems in the VFS and probably
-only are the right answer in these corner cases. Any new user that looks
-at lglocks probably wants to look at the seqlock and RCU alternatives as
-her first choice. There are also efforts to resolve the RCU issues that
-currently prevent using RCU in place of view remaining lglocks.
-
-Note on brlock history:
------------------------
-
-The 'Big Reader' read-write spinlocks were originally introduced by
-Ingo Molnar in 2000 (2.4/2.5 kernel series) and removed in 2003. They
-later were introduced by the VFS scalability patch set in 2.6 series
-again as the "big reader lock" brlock [2] variant of lglock which has
-been replaced by seqlock primitives or by RCU based primitives in the
-3.13 kernel series as was suggested in [3] in 2003. The brlock was
-entirely removed in the 3.13 kernel series.
-
-Link: 1 http://lkml.org/lkml/2010/8/2/81
-Link: 2 http://lwn.net/Articles/401738/
-Link: 3 http://lkml.org/lkml/2003/3/9/205
-Link: 4 https://lkml.org/lkml/2011/8/24/185
-Link: 5 http://lkml.org/lkml/2011/12/18/189
-Link: 6 https://www.kernel.org/pub/linux/kernel/projects/rt/
-        patch series - lglocks-rt.patch.patch
-Link: 7 http://lkml.org/lkml/2012/3/5/26
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,6 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
-#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
--- a/include/linux/lglock.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Specialised local-global spinlock. Can only be declared as global variables
- * to avoid overhead and keep things simple (and we don't want to start using
- * these inside dynamically allocated structures).
- *
- * "local/global locks" (lglocks) can be used to:
- *
- * - Provide fast exclusive access to per-CPU data, with exclusive access to
- *   another CPU's data allowed but possibly subject to contention, and to
- *   provide very slow exclusive access to all per-CPU data.
- * - Or to provide very fast and scalable read serialisation, and to provide
- *   very slow exclusive serialisation of data (not necessarily per-CPU data).
- *
- * Brlocks are also implemented as a short-hand notation for the latter use
- * case.
- *
- * Copyright 2009, 2010, Nick Piggin, Novell Inc.
- */
-#ifndef __LINUX_LGLOCK_H
-#define __LINUX_LGLOCK_H
-
-#include <linux/spinlock.h>
-#include <linux/lockdep.h>
-#include <linux/percpu.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#define LOCKDEP_INIT_MAP lockdep_init_map
-#else
-#define LOCKDEP_INIT_MAP(a, b, c, d)
-#endif
-
-struct lglock {
-	arch_spinlock_t __percpu *lock;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lock_class_key lock_key;
-	struct lockdep_map    lock_dep_map;
-#endif
-};
-
-#define DEFINE_LGLOCK(name)						\
-	static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)		\
-	= __ARCH_SPIN_LOCK_UNLOCKED;					\
-	struct lglock name = { .lock = &name ## _lock }
-
-#define DEFINE_STATIC_LGLOCK(name)					\
-	static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)		\
-	= __ARCH_SPIN_LOCK_UNLOCKED;					\
-	static struct lglock name = { .lock = &name ## _lock }
-
-void lg_lock_init(struct lglock *lg, char *name);
-
-void lg_local_lock(struct lglock *lg);
-void lg_local_unlock(struct lglock *lg);
-void lg_local_lock_cpu(struct lglock *lg, int cpu);
-void lg_local_unlock_cpu(struct lglock *lg, int cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2);
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2);
-
-void lg_global_lock(struct lglock *lg);
-void lg_global_unlock(struct lglock *lg);
-
-#else
-/* When !CONFIG_SMP, map lglock to spinlock */
-#define lglock spinlock
-#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name)
-#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name)
-#define lg_lock_init(lg, name) spin_lock_init(lg)
-#define lg_local_lock spin_lock
-#define lg_local_unlock spin_unlock
-#define lg_local_lock_cpu(lg, cpu) spin_lock(lg)
-#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg)
-#define lg_global_lock spin_lock
-#define lg_global_unlock spin_unlock
-#endif
-
-#endif
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
 endif
 obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
-obj-$(CONFIG_SMP) += lglock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
 obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
--- a/kernel/locking/lglock.c
+++ /dev/null
@@ -1,111 +0,0 @@
-/* See include/linux/lglock.h for description */
-#include <linux/module.h>
-#include <linux/lglock.h>
-#include <linux/cpu.h>
-#include <linux/string.h>
-
-/*
- * Note there is no uninit, so lglocks cannot be defined in
- * modules (but it's fine to use them from there)
- * Could be added though, just undo lg_lock_init
- */
-
-void lg_lock_init(struct lglock *lg, char *name)
-{
-	LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
-}
-EXPORT_SYMBOL(lg_lock_init);
-
-void lg_local_lock(struct lglock *lg)
-{
-	arch_spinlock_t *lock;
-
-	preempt_disable();
-	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
-	lock = this_cpu_ptr(lg->lock);
-	arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock);
-
-void lg_local_unlock(struct lglock *lg)
-{
-	arch_spinlock_t *lock;
-
-	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
-	lock = this_cpu_ptr(lg->lock);
-	arch_spin_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock);
-
-void lg_local_lock_cpu(struct lglock *lg, int cpu)
-{
-	arch_spinlock_t *lock;
-
-	preempt_disable();
-	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
-	lock = per_cpu_ptr(lg->lock, cpu);
-	arch_spin_lock(lock);
-}
-EXPORT_SYMBOL(lg_local_lock_cpu);
-
-void lg_local_unlock_cpu(struct lglock *lg, int cpu)
-{
-	arch_spinlock_t *lock;
-
-	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
-	lock = per_cpu_ptr(lg->lock, cpu);
-	arch_spin_unlock(lock);
-	preempt_enable();
-}
-EXPORT_SYMBOL(lg_local_unlock_cpu);
-
-void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
-{
-	BUG_ON(cpu1 == cpu2);
-
-	/* lock in cpu order, just like lg_global_lock */
-	if (cpu2 < cpu1)
-		swap(cpu1, cpu2);
-
-	preempt_disable();
-	lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
-	arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
-	arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
-}
-
-void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
-{
-	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
-	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
-	arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
-	preempt_enable();
-}
-
-void lg_global_lock(struct lglock *lg)
-{
-	int i;
-
-	preempt_disable();
-	lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
-	for_each_possible_cpu(i) {
-		arch_spinlock_t *lock;
-		lock = per_cpu_ptr(lg->lock, i);
-		arch_spin_lock(lock);
-	}
-}
-EXPORT_SYMBOL(lg_global_lock);
-
-void lg_global_unlock(struct lglock *lg)
-{
-	int i;
-
-	lock_release(&lg->lock_dep_map, 1, _RET_IP_);
-	for_each_possible_cpu(i) {
-		arch_spinlock_t *lock;
-		lock = per_cpu_ptr(lg->lock, i);
-		arch_spin_unlock(lock);
-	}
-	preempt_enable();
-}
-EXPORT_SYMBOL(lg_global_unlock);



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (12 preceding siblings ...)
  2015-06-22 12:16 ` [RFC][PATCH 13/13] locking: " Peter Zijlstra
@ 2015-06-22 12:36 ` Peter Zijlstra
  2015-06-22 18:11 ` Daniel Wagner
                   ` (2 subsequent siblings)
  16 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 12:36 UTC (permalink / raw)
  To: oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds


I forgot to re-instate "From: Oleg Nesterov" on the first 4 patches.

Sorry about that. I'll take more care with the next posting.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (13 preceding siblings ...)
  2015-06-22 12:36 ` [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
@ 2015-06-22 18:11 ` Daniel Wagner
  2015-06-22 19:05   ` Peter Zijlstra
  2015-06-22 20:06 ` Linus Torvalds
  2015-06-23 16:10 ` Davidlohr Bueso
  16 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-06-22 18:11 UTC (permalink / raw)
  To: Peter Zijlstra, oleg, paulmck
  Cc: tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/22/2015 02:16 PM, Peter Zijlstra wrote:
> Also, since Linus thinks lglocks is a failed locking primitive (which I whole
> heartedly agree with, its preempt-disable latencies are an abomination), it
> also converts the global part of fs/locks's usage of lglock over to a
> percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
> another (4th) percpu-rwsem users and removes an lglock user.

I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
These microbenches exercise the fs' locks a bit.

I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
kernel boots fine and doesn't explode... so far...

The results aren't looking too bad. Though building a kernel with 'make -j200'
was extremely slow. I'll look into it tomorrow.

https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary

flock01
                             mean   variance      sigma        max        min
                    4.1.0    11.7075   816.3341    28.5716   125.6552     0.0021
             percpu-rwsem    11.4614   760.1345    27.5705   132.5030     0.0026


flock02
                             mean   variance      sigma        max        min
                    4.1.0     7.0197     1.1812     1.0868    10.6188     5.1706
             percpu-rwsem     9.3194     1.3443     1.1594    11.5902     6.6138


lease01
                             mean   variance      sigma        max        min
                    4.1.0    41.8361    23.8462     4.8833    51.3493    28.5859
             percpu-rwsem    40.2738    20.8323     4.5642    49.6037    28.0704


lease02
                             mean   variance      sigma        max        min
                    4.1.0    71.2159    12.7763     3.5744    77.8432    58.0390
             percpu-rwsem    71.4312    14.7688     3.8430    76.5036    57.8615


posix01
                             mean   variance      sigma        max        min
                    4.1.0   121.9020 27882.5260   166.9806   603.5509     0.0063
             percpu-rwsem   185.3981 38474.3836   196.1489   580.6532     0.0073


posix02
                             mean   variance      sigma        max        min
                    4.1.0    12.7461     3.1802     1.7833    15.5411     8.1018
             percpu-rwsem    16.2341     4.3038     2.0746    19.3271    11.1751


posix03
                             mean   variance      sigma        max        min
                    4.1.0     0.9121     0.0000     0.0000     0.9121     0.9121
             percpu-rwsem     0.9379     0.0000     0.0000     0.9379     0.9379


posix04
                             mean   variance      sigma        max        min
                    4.1.0     0.0703     0.0044     0.0664     0.6764     0.0437
             percpu-rwsem     0.0675     0.0007     0.0267     0.3236     0.0491


cheers,
daniel


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-22 18:11 ` Daniel Wagner
@ 2015-06-22 19:05   ` Peter Zijlstra
  2015-06-23  9:35     ` Daniel Wagner
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-22 19:05 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Mon, Jun 22, 2015 at 08:11:14PM +0200, Daniel Wagner wrote:
> On 06/22/2015 02:16 PM, Peter Zijlstra wrote:
> > Also, since Linus thinks lglocks is a failed locking primitive (which I whole
> > heartedly agree with, its preempt-disable latencies are an abomination), it
> > also converts the global part of fs/locks's usage of lglock over to a
> > percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
> > another (4th) percpu-rwsem users and removes an lglock user.
> 
> I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
> These microbenches exercise the fs' locks a bit.
> 
> I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
> kernel boots fine and doesn't explode... so far...

Its against tip/master, although I expect the locking/core bits that
were sent to Linus earlier today to be the biggest missing piece.

All I really did was build a kernel with lockdep enabled and boot +
build a kernel to see it didn't go belly up.

> The results aren't looking too bad. Though building a kernel with 'make -j200'
> was extremely slow. I'll look into it tomorrow.
> 
> https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary

Sweet, I wasn't aware these existed. I'll go have a play.

> posix01
>                              mean   variance      sigma        max        min
>                     4.1.0   121.9020 27882.5260   166.9806   603.5509     0.0063
>              percpu-rwsem   185.3981 38474.3836   196.1489   580.6532     0.0073
> 
> 
> posix02
>                              mean   variance      sigma        max        min
>                     4.1.0    12.7461     3.1802     1.7833    15.5411     8.1018
>              percpu-rwsem    16.2341     4.3038     2.0746    19.3271    11.1751
> 

These two seem to hurt, lemme go look at what they do.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (14 preceding siblings ...)
  2015-06-22 18:11 ` Daniel Wagner
@ 2015-06-22 20:06 ` Linus Torvalds
  2015-06-23 16:10 ` Davidlohr Bueso
  16 siblings, 0 replies; 106+ messages in thread
From: Linus Torvalds @ 2015-06-22 20:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso,
	Rik van Riel, Al Viro

On Mon, Jun 22, 2015 at 5:16 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> It further removes the stop_machine lglock usage, and with it kills lglocks.

Ok. With all the conversions, and removal of lglock, my dislike of
this goes away.

I'm somewhat worried about Daniel's report about "building a kernel
with 'make -j200' was extremely slow", but that may be due to something
else (does the machine have enough memory for "make -j200"? The kernel
compile parallelizes so well, and gcc uses so much memory, that you
need a *lot* of memory to use things like "-j200").

But assuming that gets sorted out, and somebody looks at the few file
locking performance issues, I have no objections to this series any
more.

               Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
@ 2015-06-22 22:21   ` Oleg Nesterov
  2015-06-23 10:09     ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 22:21 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/22, Peter Zijlstra wrote:
>
> By having stop_two_cpus() acquire two cpu_stopper::locks we gain full
> order against the global stop_machine which takes each of these locks
> in order.

Yes, but stop_machine() locks/unlocks cpu_stopper->lock sequentially; it
never holds more than one ->lock, so

> +static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
> +{
> +	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&stopper->lock, flags);
> +	__cpu_stop_queue_work(cpu, work);
>  	spin_unlock_irqrestore(&stopper->lock, flags);
>  }

...

>  int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
>  {
> -	struct cpu_stop_done done;
> +	struct cpu_stopper *stopper1, *stopper2;
>  	struct cpu_stop_work work1, work2;
>  	struct multi_stop_data msdata;
> +	struct cpu_stop_done done;
> +	unsigned long flags;
> +
> +	if (cpu2 < cpu1)
> +		swap(cpu1, cpu2);

...

> +	stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
> +	stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
> +
> +	spin_lock_irqsave(&stopper1->lock, flags);
> +	spin_lock(&stopper2->lock);
> +
> +	__cpu_stop_queue_work(cpu1, &work1);
> +	__cpu_stop_queue_work(cpu2, &work2);

Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().

	- stop_machine takes the lock on CPU 0, adds the work
	  and drops the lock

	- cpu_stop_queue_work() queues both works

	- stop_machine takes the lock on CPU 1, etc

In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
use different multi_stop_data's, so they will wait for each other
forever?

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
@ 2015-06-22 22:57   ` Oleg Nesterov
  2015-06-23  7:16     ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 22:57 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/22, Peter Zijlstra wrote:
>
> The cpu hotplug lock is a rwsem with read-in-write and read-in-read
> recursion. Implement it as such.

And this patch fixes the problem afaics. Currently cpu_hotplug_begin()
can livelock because it doesn't stop the new readers. With this patch
this is no longer possible.
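
For readers without the old code at hand, the livelock has roughly this shape
(schematic only, not the exact pre-patch source; it reuses the cpu_hotplug.lock
and cpu_hotplug.refcount names quoted below):

	/*
	 * Pre-patch cpu_hotplug_begin(), schematically: the writer polls the
	 * reader count, but nothing blocks new readers from arriving, so under
	 * a steady stream of get_online_cpus() callers the count may never be
	 * observed as zero.
	 */
	for (;;) {
		mutex_lock(&cpu_hotplug.lock);
		if (!atomic_read(&cpu_hotplug.refcount))
			break;			/* no readers, writer may proceed */
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&cpu_hotplug.lock);
		schedule();			/* meanwhile new readers keep incrementing */
	}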


> -static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> +static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
>  {
>  	might_sleep();
>
> -	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
> -
>  	preempt_disable();
>  	/*
>  	 * We are in an RCU-sched read-side critical section, so the writer
> @@ -46,6 +44,12 @@ static inline void percpu_down_read(stru
>  	 */
>  }
>
> +static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
> +{
> +	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
> +	_percpu_down_read(sem);
> +}

...

>  void get_online_cpus(void)
>  {
>  	might_sleep();
> -	if (cpu_hotplug.active_writer == current)
> +
> +	/* read in write recursion */
> +	if (cpu_hotplug.writer == current)
> +		return;
> +
> +	/* read in read recursion */
> +	if (current->cpuhp_ref++)
>  		return;
> -	cpuhp_lock_acquire_read();
> -	mutex_lock(&cpu_hotplug.lock);
> -	atomic_inc(&cpu_hotplug.refcount);
> -	mutex_unlock(&cpu_hotplug.lock);
> +
> +	lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> +	_percpu_down_read(&cpu_hotplug.rwsem);
>  }

Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
just use percpu_down_read() ?

Yes, percpu_down_read() is not recursive, like the normal down_read().
But this does not matter because we rely on ->cpuhp_ref anyway?


> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
>  	p->sequential_io_avg	= 0;
>  #endif
>
> +	cpu_hotplug_init_task(p);

This is probably unnecessary, copy_process() should not be called under
get_online_cpus().

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2015-06-22 23:02   ` Oleg Nesterov
  2015-06-23  7:28   ` Nicholas Mc Guire
  1 sibling, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 23:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/22, Peter Zijlstra wrote:
>
> +enum { readers_slow, readers_block };

I still think this enum doesn't make sense, and percpu_rw_semaphore->state
should be a boolean. But this is really minor and subjective.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock()
  2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
@ 2015-06-22 23:08   ` Oleg Nesterov
  0 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-22 23:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/22, Peter Zijlstra wrote:
>
> +static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
> +{
> +	bool ret = true;
> +
> +	preempt_disable();
> +	__this_cpu_inc(*sem->refcount);
> +	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
> +		ret = __percpu_down_read_trylock(sem);
> +	preempt_enable();
> +
> +	if (ret)
> +		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
> +
> +	return ret;
> +}
...
> +bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
> +{
> +	smp_mb(); /* A matches D */
> +
> +	if (likely(smp_load_acquire(&sem->state) != readers_block))
> +		return true;
> +
> +	__percpu_up_read(sem);
> +
> +	return false;
> +}

Looks like we can slightly refactor this code to avoid the code
duplication. But this is minor too and we can do this later.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock
  2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
@ 2015-06-23  0:19   ` Oleg Nesterov
  0 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23  0:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

Off-topic question,

On 06/22, Peter Zijlstra wrote:
>
> @@ -2650,9 +2660,8 @@ static void *locks_start(struct seq_file
>  
>  	iter->li_pos = *pos + 1;
>  	percpu_down_write(&file_rwsem);
> -	lg_global_lock(&file_lock_lglock);
>  	spin_lock(&blocked_lock_lock);
> -	return seq_hlist_start_percpu(&file_lock_list, &iter->li_cpu, *pos);
> +	return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu, *pos);
>  }

...

>  static void locks_stop(struct seq_file *f, void *v)
>  	__releases(&blocked_lock_lock)
>  {
>  	spin_unlock(&blocked_lock_lock);

With or without this patch, why do locks_start/locks_stop need to take/drop
blocked_lock_lock?

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-22 22:57   ` Oleg Nesterov
@ 2015-06-23  7:16     ` Peter Zijlstra
  2015-06-23 17:01       ` Oleg Nesterov
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23  7:16 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > +
> > +	lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > +	_percpu_down_read(&cpu_hotplug.rwsem);
> >  }
> 
> Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> just use percpu_down_read() ?
> 
> Yes, percpu_down_read() is not recursive, like the normal down_read().
> But this does not matter because we rely on ->cpuhp_ref anyway?

While we will not take the actual lock, lockdep will still get confused
by the inconsistent locking order observed.

Change it and boot, you'll find lockdep output pretty quickly.

> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
> >  	p->sequential_io_avg	= 0;
> >  #endif
> >
> > +	cpu_hotplug_init_task(p);
> 
> This is probably unnecessary, copy_process() should not be called under
> get_online_cpus().

Probably true, in which case we could still use the callback to insert a
WARN_ON_ONCE(p->cpuhp_ref) :-)
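
Something like this, perhaps (hypothetical sketch, not part of the series; it
only assumes the ->cpuhp_ref counter introduced by the patch quoted above):

	static inline void cpu_hotplug_init_task(struct task_struct *p)
	{
		/* the child inherited current's counter when the task struct was copied */
		WARN_ON_ONCE(p->cpuhp_ref);	/* fork under get_online_cpus()? */
		p->cpuhp_ref = 0;
	}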

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2015-06-22 23:02   ` Oleg Nesterov
@ 2015-06-23  7:28   ` Nicholas Mc Guire
  2015-06-25 19:08     ` Peter Zijlstra
  1 sibling, 1 reply; 106+ messages in thread
From: Nicholas Mc Guire @ 2015-06-23  7:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: oleg, paulmck, tj, mingo, linux-kernel, dave, riel, viro, torvalds


A bit off-topic probably, but maybe this should not be in
kernel/locking/percpu-rwsem.c but in a generic percpu location, as this
construct is present in the core a few times, at least in:
 kernel/irq/irqdesc.c:kstat_irqs
 kernel/fork.c:nr_processes
 mm/memcontrol.c:mem_cgroup_read_events
 mm/memcontrol.c:mem_cgroup_read_stat

> +
> +#define per_cpu_sum(var)                                             \
> +({                                                                   \
> +     typeof(var) __sum = 0;                                          \
> +     int cpu;                                                        \
> +     for_each_possible_cpu(cpu)                                      \
> +             __sum += per_cpu(var, cpu);                             \
> +     __sum;                                                          \
> +})
> +

so maybe put it into include/linux/percpu.h ?
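
For illustration, one of the open-coded summations listed above could then
collapse to a one-liner (sketch only, not part of this series; it assumes
fork.c's per-cpu process_counts counter):

	/* e.g. kernel/fork.c, which today loops over the possible CPUs by hand: */
	int nr_processes(void)
	{
		return per_cpu_sum(process_counts);
	}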

thx!
hofrat

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-22 19:05   ` Peter Zijlstra
@ 2015-06-23  9:35     ` Daniel Wagner
  2015-06-23 10:00       ` Ingo Molnar
  2015-06-23 14:34       ` Peter Zijlstra
  0 siblings, 2 replies; 106+ messages in thread
From: Daniel Wagner @ 2015-06-23  9:35 UTC (permalink / raw)
  To: Peter Zijlstra, Daniel Wagner
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, der.herr

On 06/22/2015 09:05 PM, Peter Zijlstra wrote:
> On Mon, Jun 22, 2015 at 08:11:14PM +0200, Daniel Wagner wrote:
>> On 06/22/2015 02:16 PM, Peter Zijlstra wrote:
>>> Also, since Linus thinks lglocks is a failed locking primitive (which I whole
>>> heartedly agree with, its preempt-disable latencies are an abomination), it
>>> also converts the global part of fs/locks's usage of lglock over to a
>>> percpu-rwsem and uses a per-cpu spinlock for the local part. This both provides
>>> another (4th) percpu-rwsem users and removes an lglock user.
>>
>> I did a quick lockperf run with these patches on a 4 socket E5-4610 machine.
>> These microbenches exercise the fs' locks a bit.
>>
>> I suspect I got the wrong tree. The patches did not apply cleanly. The resulting
>> kernel boots fine and doesn't explode... so far...
> 
> Its against tip/master, although I expect the locking/core bits that
> were sent to Linus earlier today to be the biggest missing piece.
> 
> All I really did was build a kernel with lockdep enabled and boot +
> build a kernel to see it didn't go belly up.
> 
>> The results aren't looking too bad. Though building a kernel with 'make -j200'
>> was extremely slow. I'll look into it tomorrow.

So this turns out to be a false alarm. I had icecream installed/activated
and that interfered with gcc. Stupid me.

The machine has 0.5TB memory and doesn't seem to be really concerned about
'make -j200'

make clean && time make -j200

mainline 4.1.0
2nd run
	real    1m7.595s
	user    28m43.125s
	sys     3m48.189s


tip v4.1-2756-ge3d06bd
2nd run
	real    1m6.871s
	user    28m50.803s
	sys     3m50.223s
3rd run
	real    1m6.974s
	user    28m52.093s
	sys     3m50.259s


tip v4.1-2769-g6ce2591 (percpu-rwsem)
2nd run
	real    1m7.847s
	user    29m0.439s
	sys     3m51.181s
3rd run
	real    1m7.113s
	user    29m3.127s
	sys     3m51.516s



Compared to 'make -j64' on tip v4.1-2756-ge3d06bd
2nd run
	real    1m7.605s
	user    28m3.121s
	sys     3m52.541s

>> https://git.samba.org/jlayton/linux.git/?p=jlayton/lockperf.git;a=summary
> 
> Sweet, I wasn't aware these existed. I'll go have a play.
> 
>> posix01
>>                              mean   variance      sigma        max        min
>>                     4.1.0   121.9020 27882.5260   166.9806   603.5509     0.0063
>>              percpu-rwsem   185.3981 38474.3836   196.1489   580.6532     0.0073
>>
>>
>> posix02
>>                              mean   variance      sigma        max        min
>>                     4.1.0    12.7461     3.1802     1.7833    15.5411     8.1018
>>              percpu-rwsem    16.2341     4.3038     2.0746    19.3271    11.1751
>>
> 
> These two seem to hurt, lemme go look at what they do.

Now here are the same tests with tip and tip+percpu-rwsem. The patches
applied cleanly :)

I put all the raw data here[1] in case someone is interested. Some of the
tests behave a bit strangely, running extremely fast compared to the other runs.
That is probably the result of me trying to reduce the run time to the minimum.


flock01
                             mean   variance      sigma        max        min
                    4.1.0    11.7075   816.3341    28.5716   125.6552     0.0021
       4.1.0+percpu-rwsem    11.4614   760.1345    27.5705   132.5030     0.0026
                      tip     6.8390   329.3037    18.1467    81.0373     0.0021
         tip+percpu-rwsem    10.0870   546.7435    23.3825   106.2396     0.0026


flock02
                             mean   variance      sigma        max        min
                    4.1.0     7.0197     1.1812     1.0868    10.6188     5.1706
       4.1.0+percpu-rwsem     9.3194     1.3443     1.1594    11.5902     6.6138
                      tip     7.1057     1.6719     1.2930    11.2362     5.1434
         tip+percpu-rwsem     9.0357     1.9874     1.4097    14.0254     6.4380


lease01
                             mean   variance      sigma        max        min
                    4.1.0    41.8361    23.8462     4.8833    51.3493    28.5859
       4.1.0+percpu-rwsem    40.2738    20.8323     4.5642    49.6037    28.0704
                      tip    30.2617    13.0900     3.6180    36.6398    20.2085
         tip+percpu-rwsem    31.2730    17.9787     4.2401    37.8981    19.2944


lease02
                             mean   variance      sigma        max        min
                    4.1.0    71.2159    12.7763     3.5744    77.8432    58.0390
       4.1.0+percpu-rwsem    71.4312    14.7688     3.8430    76.5036    57.8615
                      tip    20.2019     5.2042     2.2813    23.1071    13.4647
         tip+percpu-rwsem    20.8305     6.6631     2.5813    23.8034    11.2815


posix01
                             mean   variance      sigma        max        min
                    4.1.0   121.9020 27882.5260   166.9806   603.5509     0.0063
       4.1.0+percpu-rwsem   185.3981 38474.3836   196.1489   580.6532     0.0073
                      tip   129.2736 23752.7122   154.1191   474.0604     0.0063
         tip+percpu-rwsem   142.6474 24732.1571   157.2646   468.7478     0.0072


posix02
                             mean   variance      sigma        max        min
                    4.1.0    12.7461     3.1802     1.7833    15.5411     8.1018
       4.1.0+percpu-rwsem    16.2341     4.3038     2.0746    19.3271    11.1751
                      tip    13.2810     5.3958     2.3229    20.1243     8.9361
         tip+percpu-rwsem    15.6802     4.7514     2.1798    21.5704     9.4074


posix03
                             mean   variance      sigma        max        min
                    4.1.0     0.9121     0.0000     0.0000     0.9121     0.9121
       4.1.0+percpu-rwsem     0.9379     0.0000     0.0000     0.9379     0.9379
                      tip     0.8647     0.0009     0.0297     0.9274     0.7995
         tip+percpu-rwsem     0.8147     0.0003     0.0161     0.8530     0.7824


posix04
                             mean   variance      sigma        max        min
                    4.1.0     0.0703     0.0044     0.0664     0.6764     0.0437
       4.1.0+percpu-rwsem     0.0675     0.0007     0.0267     0.3236     0.0491
                      tip     0.0618     0.0027     0.0521     0.5642     0.0453
         tip+percpu-rwsem     0.0658     0.0003     0.0175     0.1793     0.0493


cheers,
daniel

[1] http://monom.org/percpu-rwsem/

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23  9:35     ` Daniel Wagner
@ 2015-06-23 10:00       ` Ingo Molnar
  2015-06-23 14:34       ` Peter Zijlstra
  1 sibling, 0 replies; 106+ messages in thread
From: Ingo Molnar @ 2015-06-23 10:00 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Peter Zijlstra, oleg, paulmck, tj, mingo, linux-kernel, der.herr,
	dave, riel, viro, torvalds


* Daniel Wagner <daniel.wagner@bmw-carit.de> wrote:

> The machine has 0.5TB memory and doesn't seem to be really concerned about
> 'make -j200'
> 
> make clean && time make -j200
> 
> mainline 4.1.0
> 2nd run
> 	real    1m7.595s
> 	user    28m43.125s
> 	sys     3m48.189s
> 
> 
> tip v4.1-2756-ge3d06bd
> 2nd run
> 	real    1m6.871s
> 	user    28m50.803s
> 	sys     3m50.223s
> 3rd run
> 	real    1m6.974s
> 	user    28m52.093s
> 	sys     3m50.259s
> 
> 
> tip v4.1-2769-g6ce2591 (percpu-rwsem)
> 2nd run
> 	real    1m7.847s
> 	user    29m0.439s
> 	sys     3m51.181s
> 3rd run
> 	real    1m7.113s
> 	user    29m3.127s
> 	sys     3m51.516s
> 
> 
> 
> Compared to 'make -j64' on tip v4.1-2756-ge3d06bd
> 2nd run
> 	real    1m7.605s
> 	user    28m3.121s
> 	sys     3m52.541s

Btw., instead of just listing the raw runs, you can get automatic average and
stddev numbers with this:

  $ perf stat --null --repeat 5 --pre 'make clean' --post 'sync' make -j200

 Performance counter stats for 'make -j200' (3 runs):

       29.068162979 seconds time elapsed                                          ( +-  0.27% )

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-22 22:21   ` Oleg Nesterov
@ 2015-06-23 10:09     ` Peter Zijlstra
  2015-06-23 10:55       ` Peter Zijlstra
  2015-06-23 16:20       ` Oleg Nesterov
  0 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 10:09 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 12:21:52AM +0200, Oleg Nesterov wrote:

> Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
> 
> 	- stop_machine takes the lock on CPU 0, adds the work
> 	  and drops the lock
> 
> 	- cpu_stop_queue_work() queues both works

cpu_stop_queue_work() only ever queues _1_ work.

> 	- stop_machine takes the lock on CPU 1, etc
> 
> In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
> use different multi_stop_data's, so they will wait for each other
> forever?

So what you're saying is:

	queue_stop_cpus_work()		stop_two_cpus()

	cpu_stop_queue_work(0,..);
					spin_lock(0);
					spin_lock(1);

					__cpu_stop_queue_work(0,..);
					__cpu_stop_queue_work(1,..);

					spin_unlock(1);
					spin_unlock(0);
	cpu_stop_queue_work(1,..);

Indeed, I don't know what I was thinking...

We can of course slap a percpu-rwsem in, but I wonder if there's
anything smarter we can do here.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 10:09     ` Peter Zijlstra
@ 2015-06-23 10:55       ` Peter Zijlstra
  2015-06-23 11:20         ` Peter Zijlstra
  2015-06-23 14:39         ` Paul E. McKenney
  2015-06-23 16:20       ` Oleg Nesterov
  1 sibling, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 10:55 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 12:09:32PM +0200, Peter Zijlstra wrote:
> We can of course slap a percpu-rwsem in, but I wonder if there's
> anything smarter we can do here.

Urgh, we cannot use percpu-rwsem here, because that would require
percpu_down_write_trylock(), and I'm not sure we can get around the
sync_sched() for that.

Now try_stop_cpus(), which requires the down_write_trylock(), is used to
implement synchronize_sched_expedited().

Using sync_sched() to implement sync_sched_expedited would make me
happy, but it does somewhat defeat the purpose.



Also, I think _expedited is used too eagerly, look at this:

+void dm_sync_table(struct mapped_device *md)
+{
+       synchronize_srcu(&md->io_barrier);
+       synchronize_rcu_expedited();
+}

sync_srcu() is slow already, why then bother with a
sync_rcu_expedited() :/

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 10:55       ` Peter Zijlstra
@ 2015-06-23 11:20         ` Peter Zijlstra
  2015-06-23 13:08           ` Peter Zijlstra
  2015-06-23 14:39         ` Paul E. McKenney
  1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 11:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 12:55:48PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 12:09:32PM +0200, Peter Zijlstra wrote:
> > We can of course slap a percpu-rwsem in, but I wonder if there's
> > anything smarter we can do here.
> 
> Urgh, we cannot use percpu-rwsem here, because that would require
> percpu_down_write_trylock(), and I'm not sure we can get around the
> sync_sched() for that.
> 
> Now try_stop_cpus(), which requires the down_write_trylock(), is used to
> implement synchronize_sched_expedited().
> 
> Using sync_sched() to implement sync_sched_expedited would make me
> happy, but it does somewhat defeat the purpose.

Paul, why does this use stop_machine anyway? I seemed to remember you
sending resched IPIs around.

The rcu_sched_qs() thing would set passed_quiesce, which you can then
collect to gauge progress.

Shooting IPIs around is bad enough, but running a full blown
stop_machine is really blunt and heavy.


Also, OMFG @ 74b51ee152b6 ("ACPI / osl: speedup grace period in
acpi_os_map_cleanup"), that's an expedited use to help the nVidiot
binary blob. WTF!!


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 11:20         ` Peter Zijlstra
@ 2015-06-23 13:08           ` Peter Zijlstra
  2015-06-23 16:36             ` Oleg Nesterov
  2015-06-23 17:30             ` Paul E. McKenney
  0 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 13:08 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 01:20:41PM +0200, Peter Zijlstra wrote:
> Paul, why does this use stop_machine anyway? I seemed to remember you
> sending resched IPIs around.
> 
> The rcu_sched_qs() thing would set passed_quiesce, which you can then
> collect to gauge progress.
> 
> Shooting IPIs around is bad enough, but running a full blown
> stop_machine is really blunt and heavy.

Is there anything obviously amiss with the below? It does stop_one_cpu()
in a loop instead of the multi cpu stop_machine and is therefore much
friendlier (albeit still heavier than bare resched IPIs) since the CPUs
do not have to go and sync up.

After all, all we're really interested in is that each CPU has
scheduled at least once; we do not care about the cross-CPU syncup.

---
 include/linux/stop_machine.h |  7 ----
 kernel/rcu/tree.c            | 99 +++++---------------------------------------
 kernel/stop_machine.c        | 30 --------------
 3 files changed, 10 insertions(+), 126 deletions(-)

diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index d2abbdb8c6aa..f992da7ee492 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -32,7 +32,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
 			 struct cpu_stop_work *work_buf);
 int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
 
 #else	/* CONFIG_SMP */
 
@@ -83,12 +82,6 @@ static inline int stop_cpus(const struct cpumask *cpumask,
 	return -ENOENT;
 }
 
-static inline int try_stop_cpus(const struct cpumask *cpumask,
-				cpu_stop_fn_t fn, void *arg)
-{
-	return stop_cpus(cpumask, fn, arg);
-}
-
 #endif	/* CONFIG_SMP */
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..4a8cde155dce 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3257,7 +3257,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 {
 	/*
 	 * There must be a full memory barrier on each affected CPU
-	 * between the time that try_stop_cpus() is called and the
+	 * between the time that stop_one_cpu() is called and the
 	 * time that it returns.
 	 *
 	 * In the current initial implementation of cpu_stop, the
@@ -3291,25 +3291,12 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * grace period.  We are then done, so we use atomic_cmpxchg() to
  * update sync_sched_expedited_done to match our snapshot -- but
  * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done.  If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot.  In this case, our work is
- * done for us, and we can simply return.  Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
-	int cpu;
-	long firstsnap, s, snap;
-	int trycount = 0;
 	struct rcu_state *rsp = &rcu_sched_state;
+	long s, snap;
+	int cpu;
 
 	/*
 	 * If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3319,6 @@ void synchronize_sched_expedited(void)
 	 * full memory barrier.
 	 */
 	snap = atomic_long_inc_return(&rsp->expedited_start);
-	firstsnap = snap;
 	if (!try_get_online_cpus()) {
 		/* CPU hotplug operation in flight, fall back to normal GP. */
 		wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3327,17 @@ void synchronize_sched_expedited(void)
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
-		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
-	}
-
-	/*
-	 * Each pass through the following loop attempts to force a
-	 * context switch on each CPU.
-	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		atomic_long_inc(&rsp->expedited_tryfail);
-
-		/* Check to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
-			return;
-		}
-
-		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10) {
-			udelay(trycount * num_online_cpus());
-		} else {
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
-		/* Recheck to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
-			return;
-		}
+		/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
 
-		/*
-		 * Refetching sync_sched_expedited_started allows later
-		 * callers to piggyback on our grace period.  We retry
-		 * after they started, so our grace period works for them,
-		 * and they started after our first try, so their grace
-		 * period works for us.
-		 */
-		if (!try_get_online_cpus()) {
-			/* CPU hotplug operation in flight, use normal GP. */
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-		snap = atomic_long_read(&rsp->expedited_start);
-		smp_mb(); /* ensure read is before try_stop_cpus(). */
+		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
 	}
-	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
-all_cpus_idle:
-	free_cpumask_var(cm);
+	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
 	/*
 	 * Everyone up to our most recent fetch is covered by our grace
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..b1329a213503 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -371,36 +371,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
 	return ret;
 }
 
-/**
- * try_stop_cpus - try to stop multiple cpus
- * @cpumask: cpus to stop
- * @fn: function to execute
- * @arg: argument to @fn
- *
- * Identical to stop_cpus() except that it fails with -EAGAIN if
- * someone else is already using the facility.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * -EAGAIN if someone else is already stopping cpus, -ENOENT if
- * @fn(@arg) was not executed at all because all cpus in @cpumask were
- * offline; otherwise, 0 if all executions of @fn returned 0, any non
- * zero return value if any returned non zero.
- */
-int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
-{
-	int ret;
-
-	/* static works are used, process one request at a time */
-	if (!mutex_trylock(&stop_cpus_mutex))
-		return -EAGAIN;
-	ret = __stop_cpus(cpumask, fn, arg);
-	mutex_unlock(&stop_cpus_mutex);
-	return ret;
-}
-
 static int cpu_stop_should_run(unsigned int cpu)
 {
 	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);

^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23  9:35     ` Daniel Wagner
  2015-06-23 10:00       ` Ingo Molnar
@ 2015-06-23 14:34       ` Peter Zijlstra
  2015-06-23 14:56         ` Daniel Wagner
  1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 14:34 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, jlayton

On Tue, Jun 23, 2015 at 11:35:24AM +0200, Daniel Wagner wrote:
> flock01
>                              mean   variance      sigma        max        min
>                     4.1.0    11.7075   816.3341    28.5716   125.6552     0.0021
>        4.1.0+percpu-rwsem    11.4614   760.1345    27.5705   132.5030     0.0026
>                       tip     6.8390   329.3037    18.1467    81.0373     0.0021
>          tip+percpu-rwsem    10.0870   546.7435    23.3825   106.2396     0.0026

> posix01
>                              mean   variance      sigma        max        min
>                     4.1.0   121.9020 27882.5260   166.9806   603.5509     0.0063
>        4.1.0+percpu-rwsem   185.3981 38474.3836   196.1489   580.6532     0.0073
>                       tip   129.2736 23752.7122   154.1191   474.0604     0.0063
>          tip+percpu-rwsem   142.6474 24732.1571   157.2646   468.7478     0.0072

Both these tests are incredibly unstable for me (as well as for you it
appears). Variance is through the roof on them.

I get runtimes like:

root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a 
0.266157011
root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a 
139.303399960

That's not really inspiring. If I use bigger loop counts it more or less
settles, but then the EX is unusable because it ends up running 3000
seconds per test.

In any case, on a smaller box (ivb-ep) I got the below results:

posix01
                                     mean   variance      sigma        max        min
data-4.1.0-02756-ge3d06bd         250.7032    40.4864     6.3629   263.7736   238.5192
data-4.1.0-02756-ge3d06bd-dirty   252.6847    35.8953     5.9913   270.1679   233.0215

Which looks better, but the difference is still well within the variance
and thus not significant.

Lemme continue playing with this for a bit more.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 10:55       ` Peter Zijlstra
  2015-06-23 11:20         ` Peter Zijlstra
@ 2015-06-23 14:39         ` Paul E. McKenney
  1 sibling, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 14:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 12:55:48PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 12:09:32PM +0200, Peter Zijlstra wrote:
> > We can of course slap a percpu-rwsem in, but I wonder if there's
> > anything smarter we can do here.
> 
> Urgh, we cannot use percpu-rwsem here, because that would require
> percpu_down_write_trylock(), and I'm not sure we can get around the
> sync_sched() for that.
> 
> Now try_stop_cpus(), which requires the down_write_trylock(), is used to
> implement synchronize_sched_expedited().
> 
> Using sync_sched() to implement sync_sched_expedited would make me
> happy, but it does somewhat defeat the purpose.
> 
> 
> 
> Also, I think _expedited is used too eagerly, look at this:
> 
> +void dm_sync_table(struct mapped_device *md)
> +{
> +       synchronize_srcu(&md->io_barrier);
> +       synchronize_rcu_expedited();
> +}
> 
> sync_srcu() is slow already, why then bother with a
> sync_rcu_expedited() :/

Actually, this code was added in 2013, which was after the new variant of
synchronize_srcu(), which last I checked is reasonably fast in the common
case (no readers and not having tons of concurrent synchronize_srcu()
calls on the same srcu_struct), especially on systems with a small number
of CPUs, courtesy of srcu_read_lock()'s and srcu_read_unlock()'s read-side
memory barriers.

So synchronize_rcu() really would be expected to have quite a bit higher
latency than synchronize_srcu().

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23 14:34       ` Peter Zijlstra
@ 2015-06-23 14:56         ` Daniel Wagner
  2015-06-23 17:50           ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-06-23 14:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, jlayton

On 06/23/2015 04:34 PM, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 11:35:24AM +0200, Daniel Wagner wrote:
>> flock01
>>                              mean   variance      sigma        max        min
>>                     4.1.0    11.7075   816.3341    28.5716   125.6552     0.0021
>>        4.1.0+percpu-rwsem    11.4614   760.1345    27.5705   132.5030     0.0026
>>                       tip     6.8390   329.3037    18.1467    81.0373     0.0021
>>          tip+percpu-rwsem    10.0870   546.7435    23.3825   106.2396     0.0026
> 
>> posix01
>>                              mean   variance      sigma        max        min
>>                     4.1.0   121.9020 27882.5260   166.9806   603.5509     0.0063
>>        4.1.0+percpu-rwsem   185.3981 38474.3836   196.1489   580.6532     0.0073
>>                       tip   129.2736 23752.7122   154.1191   474.0604     0.0063
>>          tip+percpu-rwsem   142.6474 24732.1571   157.2646   468.7478     0.0072
> 
> Both these tests are incredibly unstable for me (as well as for you it
> appears). Variance is through the roof on them.

Since on my test machine not all 4 sockets are directly interconnected, I pinned
the tests down to one socket to see if that reduces the variance.

Except for flock01 and posix01, everything now shows really low variances (3 runs):

[...]
flock02
                             mean   variance      sigma        max        min
                    tip-1    11.8994     0.5874     0.7664    13.2022     8.6324
                    tip-2    11.7394     0.5252     0.7247    13.2540     9.7513
                    tip-3    11.8155     0.5288     0.7272    13.2700     9.9480
       tip+percpu-rswem-1    15.3601     0.8981     0.9477    16.8116    12.6910
       tip+percpu-rswem-2    15.2558     0.8442     0.9188    17.0199    12.9586
       tip+percpu-rswem-3    15.5297     0.6386     0.7991    17.4392    12.7992


lease01
                             mean   variance      sigma        max        min
                    tip-1     0.3424     0.0001     0.0110     0.3644     0.3088
                    tip-2     0.3627     0.0003     0.0185     0.4140     0.3312
                    tip-3     0.3446     0.0002     0.0125     0.3851     0.3155
       tip+percpu-rswem-1     0.3464     0.0001     0.0116     0.3781     0.3113
       tip+percpu-rswem-2     0.3597     0.0003     0.0162     0.3978     0.3250
       tip+percpu-rswem-3     0.3513     0.0002     0.0151     0.3933     0.3122
[...]

So with this setup we can start to compare the numbers.

> I get runtimes like:
> 
> root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a 
> 0.266157011
> root@ivb-ex:/usr/local/src/lockperf# ./flock01 -n 240 -l 32 /tmp/a 
> 139.303399960

Same here:

flock01
                             mean   variance      sigma        max        min
                    tip-1   242.6147  3632.6201    60.2712   313.3081    86.3743
                    tip-2   233.1934  3850.1995    62.0500   318.2716   101.2738
                    tip-3   223.0392  3944.5220    62.8054   318.1932   110.8155
       tip+percpu-rswem-1   276.5913  2145.0510    46.3147   317.5385   156.1318
       tip+percpu-rswem-2   270.7089  2735.7635    52.3045   318.9418   154.5902
       tip+percpu-rswem-3   267.8207  3028.3557    55.0305   320.2987   150.9659

posix01
                             mean   variance      sigma        max        min
                    tip-1    18.8729   151.2810    12.2996    37.3563     0.0060
                    tip-2    17.6894   140.9982    11.8743    37.2080     0.0060
                    tip-3    18.7785   145.1217    12.0466    35.5001     0.0060
       tip+percpu-rswem-1    18.9970   163.8856    12.8018    35.8795     0.0069
       tip+percpu-rswem-2    18.9594   147.3197    12.1375    35.4404     0.0069
       tip+percpu-rswem-3    18.8366   126.5831    11.2509    35.9014     0.0069



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
                   ` (15 preceding siblings ...)
  2015-06-22 20:06 ` Linus Torvalds
@ 2015-06-23 16:10 ` Davidlohr Bueso
  2015-06-23 16:21   ` Peter Zijlstra
  16 siblings, 1 reply; 106+ messages in thread
From: Davidlohr Bueso @ 2015-06-23 16:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, riel, viro, torvalds

On Mon, 2015-06-22 at 14:16 +0200, Peter Zijlstra wrote:
> This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
> user.

Curious, why not also mem hotplug? It seems to use the exact same
locking mayhem as cpu.

Thanks,
Davidlohr


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 10:09     ` Peter Zijlstra
  2015-06-23 10:55       ` Peter Zijlstra
@ 2015-06-23 16:20       ` Oleg Nesterov
  2015-06-23 17:24         ` Oleg Nesterov
  1 sibling, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 16:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/23, Peter Zijlstra wrote:
>
> On Tue, Jun 23, 2015 at 12:21:52AM +0200, Oleg Nesterov wrote:
>
> > Suppose that stop_two_cpus(cpu1 => 0, cpu2 => 1) races with stop_machine().
> >
> > 	- stop_machine takes the lock on CPU 0, adds the work
> > 	  and drops the lock
> >
> > 	- cpu_stop_queue_work() queues both works
>
> cpu_stop_queue_work() only ever queues _1_ work.
>
> > 	- stop_machine takes the lock on CPU 1, etc
> >
> > In this case both CPU 0 and 1 will run multi_cpu_stop() but they will
> > use different multi_stop_data's, so they will wait for each other
> > forever?
>
> So what you're saying is:
>
> 	queue_stop_cpus_work()		stop_two_cpus()
>
> 	cpu_stop_queue_work(0,..);
> 					spin_lock(0);
> 					spin_lock(1);
>
> 					__cpu_stop_queue_work(0,..);
> 					__cpu_stop_queue_work(1,..);
>
> 					spin_unlock(1);
> 					spin_unlock(0);
> 	cpu_stop_queue_work(1,..);

Yes, sorry for confusion.

> We can of course slap a percpu-rwsem in, but I wonder if there's
> anything smarter we can do here.

I am wondering too if we can make this multi_cpu_stop() more clever.
Or at least add some deadlock detection...

Until then you can probably just uglify queue_stop_cpus_work() and
avoid the race,

	static void queue_stop_cpus_work(const struct cpumask *cpumask,
					 cpu_stop_fn_t fn, void *arg,
					 struct cpu_stop_done *done)
	{
		struct cpu_stopper *stopper;
		struct cpu_stop_work *work;
		unsigned long flags;
		unsigned int cpu;

		local_irq_save(flags);
		for_each_cpu(cpu, cpumask) {
			stopper = &per_cpu(cpu_stopper, cpu);
			spin_lock(&stopper->lock);

			work = &per_cpu(stop_cpus_work, cpu);
			work->fn = fn;
			work->arg = arg;
			work->done = done;
		}

		for_each_cpu(cpu, cpumask)
			__cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));

		for_each_cpu(cpu, cpumask) {
			stopper = &per_cpu(cpu_stopper, cpu);
			spin_unlock(&stopper->lock);
		}
		local_irq_restore(flags);
	}

ignoring lockdep problems.

It would be nice to remove stop_cpus_mutex, it actually protects
stop_cpus_work... Then probably stop_two_cpus() can just use
stop_cpus(). We could simply make stop_cpus_mutex per-cpu too,
but this doesn't look nice.

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23 16:10 ` Davidlohr Bueso
@ 2015-06-23 16:21   ` Peter Zijlstra
  0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 16:21 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, riel, viro, torvalds

On Tue, Jun 23, 2015 at 09:10:03AM -0700, Davidlohr Bueso wrote:
> On Mon, 2015-06-22 at 14:16 +0200, Peter Zijlstra wrote:
> > This series converts the cpu hotplug lock into a percpu-rwsem to provide a 3rd
> > user.
> 
> Curious, why not also mem hotplug? It seems to use the exact same
> locking mayhem than cpu.

Because it looks like they 'forgot' to copy the notifiers and therefore
I suspect we could simplify things. We might not need the recursive
nonsense.

But I've not yet actually looked at it much.

I was indeed greatly saddened that these people copied cpu hotplug;
clearly they had not gotten the memo that cpu hotplug is a trainwreck.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 13:08           ` Peter Zijlstra
@ 2015-06-23 16:36             ` Oleg Nesterov
  2015-06-23 17:30             ` Paul E. McKenney
  1 sibling, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 16:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/23, Peter Zijlstra wrote:
>
>  void synchronize_sched_expedited(void)
>  {

...

> -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> -			     synchronize_sched_expedited_cpu_stop,
> -			     NULL) == -EAGAIN) {
> -		put_online_cpus();
> -		atomic_long_inc(&rsp->expedited_tryfail);
> -
> -		/* Check to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone1);
> -			free_cpumask_var(cm);
> -			return;
> -		}

...

> +	for_each_online_cpu(cpu) {
> +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);

...

> +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);

I too thought about something like this change ;)

Not sure I read this patch correctly, but it seems that then you can
remove all rsp->expedited_* members/code ?

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-23  7:16     ` Peter Zijlstra
@ 2015-06-23 17:01       ` Oleg Nesterov
  2015-06-23 17:53         ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 17:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/23, Peter Zijlstra wrote:
>
> On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > > +
> > > +	lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > > +	_percpu_down_read(&cpu_hotplug.rwsem);
> > >  }
> >
> > Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> > just use percpu_down_read() ?
> >
> > Yes, percpu_down_read() is not recursive, like the normal down_read().
> > But this does not matter because we rely on ->cpuhp_ref anyway?
>
> While we will not call the actual lock, lockdep will still get confused
> by the inconsistent locking order observed.
>
> Change it and boot, you'll find lockdep output pretty quickly.

Hmm. and I simply can't understand why...

>
> > > --- a/kernel/fork.c
> > > +++ b/kernel/fork.c
> > > @@ -1410,6 +1410,8 @@ static struct task_struct *copy_process(
> > >  	p->sequential_io_avg	= 0;
> > >  #endif
> > >
> > > +	cpu_hotplug_init_task(p);
> >
> > This is probably unnecessary, copy_process() should not be called under
> > get_online_cpus().
>
> Probably true, in which case we could still use the callback to insert a
> WARN_ON_ONCE(p->cpuhp_ref) :-)

Yes, agreed.

And, perhaps, WARN_ON_ONCE(in_irq) in try_get_online_cpus() makes sense...
percpu_down_read_trylock() from irq is fine, but try_get_online_cpus()
can come right after get/put_online_cpus() updates ->cpuhp_ref.

And I forgot to say,

>  void get_online_cpus(void)
>  {
>  	might_sleep();
> -	if (cpu_hotplug.active_writer == current)
> +
> +	/* read in write recursion */
> +	if (cpu_hotplug.writer == current)
> +		return;

...

>  void put_online_cpus(void)
>  {
> -	int refcount;
> -
> -	if (cpu_hotplug.active_writer == current)
> +	if (cpu_hotplug.writer == current)
>  		return;

We do not need to check cpu_hotplug.writer in get/put_online_cpus().
cpu_hotplug_begin/end can just inc/dec current->cpuhp_ref.
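
IOW, something like this (a rough sketch only, reusing ->cpuhp_ref,
cpu_hotplug.rwsem and _percpu_down_read() from the quoted patch, untested;
put_online_cpus() would do the mirror image):

	void get_online_cpus(void)
	{
		might_sleep();

		if (current->cpuhp_ref++)	/* read (or read-in-write) recursion */
			return;

		lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
		_percpu_down_read(&cpu_hotplug.rwsem);
	}

	void cpu_hotplug_begin(void)
	{
		percpu_down_write(&cpu_hotplug.rwsem);
		current->cpuhp_ref++;	/* get/put_online_cpus() become no-ops for us */
	}

	void cpu_hotplug_done(void)
	{
		current->cpuhp_ref--;
		percpu_up_write(&cpu_hotplug.rwsem);
	}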

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 16:20       ` Oleg Nesterov
@ 2015-06-23 17:24         ` Oleg Nesterov
  2015-06-25 19:18           ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-23 17:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/23, Oleg Nesterov wrote:
>
>
> It would be nice to remove stop_cpus_mutex, it actually protects
> stop_cpus_work... Then probably stop_two_cpus() can just use
> stop_cpus(). We could simply make stop_cpus_mutex per-cpu too,
> but this doesn't look nice.

IOW. Suppose we add ->work_mutex into struct cpu_stopper. Btw,
I think we should move all per-cpu variables there...

Now,

	lock_stop_cpus_works(cpumask)
	{
		for_each_cpu(cpu, cpumask)
			mutex_lock(&per_cpu(cpu_stopper, cpu).work_mutex);
	}

	unlock_stop_cpus_works(cpumask)
	{
		for_each_cpu(cpu, cpumask)
			mutex_unlock(&per_cpu(cpu_stopper, cpu).work_mutex);
	}

which should be used instead of stop_cpus_mutex. After this change
stop_two_cpus() can just use stop_cpus().
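
Something like this, say (sketch only; it reuses the helpers above and the
existing cpu_stop_done bits, completely untested):

	int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
	{
		struct cpu_stop_done done;

		cpu_stop_init_done(&done, cpumask_weight(cpumask));

		/* the per-cpu ->work_mutex replaces stop_cpus_mutex */
		lock_stop_cpus_works(cpumask);
		queue_stop_cpus_work(cpumask, fn, arg, &done);
		unlock_stop_cpus_works(cpumask);

		wait_for_completion(&done.completion);
		return done.executed ? done.ret : -ENOENT;
	}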


Off-topic. Can't we make __stop_machine() static? The only caller,
_cpu_down(), can safely call stop_machine(); get_online_cpus() is
fine under cpu_hotplug_begin().

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 13:08           ` Peter Zijlstra
  2015-06-23 16:36             ` Oleg Nesterov
@ 2015-06-23 17:30             ` Paul E. McKenney
  2015-06-23 18:04               ` Peter Zijlstra
  1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 17:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 03:08:26PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 01:20:41PM +0200, Peter Zijlstra wrote:
> > Paul, why does this use stop_machine anyway? I seemed to remember you
> > sending resched IPIs around.

It used to, but someone submitted a patch long ago that switched it
to try_stop_cpus().  At that time, RCU didn't unconditionally do the
dyntick-idle thing for CONFIG_NO_HZ=n kernels, so try_stop_cpus() was
quite a bit simpler.

That said, I do use your new-age resched-IPI API in other cases.

> > The rcu_sched_qs() thing would set passed_quiesce, which you can then
> > collect to gauge progress.
> > 
> > Shooting IPIs around is bad enough, but running a full blown
> > stop_machine is really blunt and heavy.
> 
> Is there anything obviously amiss with the below? It does stop_one_cpu()
> in a loop instead of the multi cpu stop_machine and is therefore much
> friendlier (albeit still heavier than bare resched IPIs) since the CPUs
> do not have to go and sync up.
> 
> After all, all we're really interested in is that each CPU has
> scheduled at least once; we do not care about the cross-CPU syncup.

This was on my list.  I was thinking of using smp_call_function_single()
combined with polling in order to avoid the double context switch, but
the approach below is of course simpler.  I was intending to fix
up the rest of RCU's relationship with CPU hotplug first, as this would
allow fully covering the incoming and outgoing code paths.

But perhaps a bit too simple.  A few comments below...

							Thanx, Paul

> ---
>  include/linux/stop_machine.h |  7 ----
>  kernel/rcu/tree.c            | 99 +++++---------------------------------------
>  kernel/stop_machine.c        | 30 --------------
>  3 files changed, 10 insertions(+), 126 deletions(-)
> 
> diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
> index d2abbdb8c6aa..f992da7ee492 100644
> --- a/include/linux/stop_machine.h
> +++ b/include/linux/stop_machine.h
> @@ -32,7 +32,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
>  void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
>  			 struct cpu_stop_work *work_buf);
>  int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
> -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg);
> 
>  #else	/* CONFIG_SMP */
> 
> @@ -83,12 +82,6 @@ static inline int stop_cpus(const struct cpumask *cpumask,
>  	return -ENOENT;
>  }
> 
> -static inline int try_stop_cpus(const struct cpumask *cpumask,
> -				cpu_stop_fn_t fn, void *arg)
> -{
> -	return stop_cpus(cpumask, fn, arg);
> -}
> -
>  #endif	/* CONFIG_SMP */
> 
>  /*
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index add042926a66..4a8cde155dce 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3257,7 +3257,7 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
>  {
>  	/*
>  	 * There must be a full memory barrier on each affected CPU
> -	 * between the time that try_stop_cpus() is called and the
> +	 * between the time that stop_one_cpu() is called and the
>  	 * time that it returns.
>  	 *
>  	 * In the current initial implementation of cpu_stop, the
> @@ -3291,25 +3291,12 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
>   * grace period.  We are then done, so we use atomic_cmpxchg() to
>   * update sync_sched_expedited_done to match our snapshot -- but
>   * only if someone else has not already advanced past our snapshot.
> - *
> - * On the other hand, if try_stop_cpus() fails, we check the value
> - * of sync_sched_expedited_done.  If it has advanced past our
> - * initial snapshot, then someone else must have forced a grace period
> - * some time after we took our snapshot.  In this case, our work is
> - * done for us, and we can simply return.  Otherwise, we try again,
> - * but keep our initial snapshot for purposes of checking for someone
> - * doing our work for us.
> - *
> - * If we fail too many times in a row, we fall back to synchronize_sched().
>   */
>  void synchronize_sched_expedited(void)
>  {
> -	cpumask_var_t cm;
> -	bool cma = false;
> -	int cpu;
> -	long firstsnap, s, snap;
> -	int trycount = 0;
>  	struct rcu_state *rsp = &rcu_sched_state;
> +	long s, snap;
> +	int cpu;
> 
>  	/*
>  	 * If we are in danger of counter wrap, just do synchronize_sched().
> @@ -3332,7 +3319,6 @@ void synchronize_sched_expedited(void)
>  	 * full memory barrier.
>  	 */
>  	snap = atomic_long_inc_return(&rsp->expedited_start);
> -	firstsnap = snap;

Hmmm...

>  	if (!try_get_online_cpus()) {
>  		/* CPU hotplug operation in flight, fall back to normal GP. */
>  		wait_rcu_gp(call_rcu_sched);
> @@ -3341,82 +3327,17 @@ void synchronize_sched_expedited(void)
>  	}
>  	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
> 
> -	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> -	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> -	if (cma) {
> -		cpumask_copy(cm, cpu_online_mask);
> -		cpumask_clear_cpu(raw_smp_processor_id(), cm);
> -		for_each_cpu(cpu, cm) {
> -			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> -
> -			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> -				cpumask_clear_cpu(cpu, cm);
> -		}
> -		if (cpumask_weight(cm) == 0)
> -			goto all_cpus_idle;
> -	}

Good, you don't need this because you can check for dynticks later.
You will need to check for offline CPUs.

If you had lots of CPUs coming and going, you could argue that tracking
them would help, but synchronize_sched_expedited() should run fast enough
that there isn't time for CPUs to come or go, at least in the common case.

> -	/*
> -	 * Each pass through the following loop attempts to force a
> -	 * context switch on each CPU.
> -	 */
> -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> -			     synchronize_sched_expedited_cpu_stop,
> -			     NULL) == -EAGAIN) {
> -		put_online_cpus();
> -		atomic_long_inc(&rsp->expedited_tryfail);
> -
> -		/* Check to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone1);
> -			free_cpumask_var(cm);
> -			return;

Here you lose batching.  Yeah, I know that synchronize_sched_expedited()
is -supposed- to be used sparingly, but it is not cool for the kernel
to melt down just because some creative user found a way to heat up a
code path.  Need a mutex_trylock() with a counter and checking for
others having already done the needed work.

> -		}
> -
> -		/* No joy, try again later.  Or just synchronize_sched(). */
> -		if (trycount++ < 10) {
> -			udelay(trycount * num_online_cpus());
> -		} else {
> -			wait_rcu_gp(call_rcu_sched);
> -			atomic_long_inc(&rsp->expedited_normal);
> -			free_cpumask_var(cm);
> -			return;
> -		}

And we still need to be able to drop back to synchronize_sched()
(AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
creative user and a long-running RCU-sched read-side critical section.

> +	for_each_online_cpu(cpu) {
> +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> 
> -		/* Recheck to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone2);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +		/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> +			continue;

Let's see...  This does work for idle CPUs and for nohz_full CPUs running
in userspace.

It does not work for the current CPU, so the check needs an additional
check against raw_smp_processor_id(), which is easy enough to add.

There always has been a race window involving CPU hotplug.  My recent
CPU_DYING_IDLE change allows things to be exact on the outgoing side,
and I need to make a similar change on the incoming side.  There will
continue to be a window where RCU needs to pay attention to the CPU,
but neither IPIs nor scheduling works, and I guess I just do a timed
wait in that case.  Rare race anyway, so should be fine.

> -		/*
> -		 * Refetching sync_sched_expedited_started allows later
> -		 * callers to piggyback on our grace period.  We retry
> -		 * after they started, so our grace period works for them,
> -		 * and they started after our first try, so their grace
> -		 * period works for us.
> -		 */
> -		if (!try_get_online_cpus()) {
> -			/* CPU hotplug operation in flight, use normal GP. */
> -			wait_rcu_gp(call_rcu_sched);
> -			atomic_long_inc(&rsp->expedited_normal);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> -		snap = atomic_long_read(&rsp->expedited_start);
> -		smp_mb(); /* ensure read is before try_stop_cpus(). */
> +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);

My thought was to use smp_call_function_single(), and to have the function
called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
if so.  This would result in a single pass through schedule() instead
of stop_one_cpu()'s double context switch.  It would likely also require
some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
the need for.

>  	}
> -	atomic_long_inc(&rsp->expedited_stoppedcpus);
> 
> -all_cpus_idle:
> -	free_cpumask_var(cm);
> +	atomic_long_inc(&rsp->expedited_stoppedcpus);
> 
>  	/*
>  	 * Everyone up to our most recent fetch is covered by our grace
> diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
> index fd643d8c4b42..b1329a213503 100644
> --- a/kernel/stop_machine.c
> +++ b/kernel/stop_machine.c
> @@ -371,36 +371,6 @@ int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
>  	return ret;
>  }
> 
> -/**
> - * try_stop_cpus - try to stop multiple cpus
> - * @cpumask: cpus to stop
> - * @fn: function to execute
> - * @arg: argument to @fn
> - *
> - * Identical to stop_cpus() except that it fails with -EAGAIN if
> - * someone else is already using the facility.
> - *
> - * CONTEXT:
> - * Might sleep.
> - *
> - * RETURNS:
> - * -EAGAIN if someone else is already stopping cpus, -ENOENT if
> - * @fn(@arg) was not executed at all because all cpus in @cpumask were
> - * offline; otherwise, 0 if all executions of @fn returned 0, any non
> - * zero return value if any returned non zero.
> - */
> -int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
> -{
> -	int ret;
> -
> -	/* static works are used, process one request at a time */
> -	if (!mutex_trylock(&stop_cpus_mutex))
> -		return -EAGAIN;
> -	ret = __stop_cpus(cpumask, fn, arg);
> -	mutex_unlock(&stop_cpus_mutex);
> -	return ret;
> -}
> -
>  static int cpu_stop_should_run(unsigned int cpu)
>  {
>  	struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
> 


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23 14:56         ` Daniel Wagner
@ 2015-06-23 17:50           ` Peter Zijlstra
  2015-06-23 19:36             ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 17:50 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, jlayton

On Tue, Jun 23, 2015 at 04:56:39PM +0200, Daniel Wagner wrote:
> flock02
>                              mean   variance      sigma        max        min
>                     tip-1    11.8994     0.5874     0.7664    13.2022     8.6324
>                     tip-2    11.7394     0.5252     0.7247    13.2540     9.7513
>                     tip-3    11.8155     0.5288     0.7272    13.2700     9.9480
>        tip+percpu-rswem-1    15.3601     0.8981     0.9477    16.8116    12.6910
>        tip+percpu-rswem-2    15.2558     0.8442     0.9188    17.0199    12.9586
>        tip+percpu-rswem-3    15.5297     0.6386     0.7991    17.4392    12.7992

I did indeed manage to get flock02 down to a usable level and found:

    3.20 :        ffffffff811ecbdf:       incl   %gs:0x7ee1de72(%rip)        # aa58 <__preempt_count>
    0.27 :        ffffffff811ecbe6:       mov    0xa98553(%rip),%rax        # ffffffff81c85140 <file_rwsem>
   10.78 :        ffffffff811ecbed:       incl   %gs:(%rax)
    0.19 :        ffffffff811ecbf0:       mov    0xa9855a(%rip),%edx        # ffffffff81c85150 <file_rwsem+0x10>
    0.00 :        ffffffff811ecbf6:       test   %edx,%edx
    0.00 :        ffffffff811ecbf8:       jne    ffffffff811ecdd1 <flock_lock_file+0x261>
    3.47 :        ffffffff811ecbfe:       decl   %gs:0x7ee1de53(%rip)        # aa58 <__preempt_count>
    0.00 :        ffffffff811ecc05:       je     ffffffff811eccec <flock_lock_file+0x17c>

Which is percpu_down_read(). Now aside from the fact that I run a
PREEMPT=y kernel, it looks like that sem->refcount increment stalls
because of the dependent load.

Manually hoisting the load very slightly improves things:

    0.24 :        ffffffff811ecbdf:       mov    0xa9855a(%rip),%rax        # ffffffff81c85140 <file_rwsem>
    5.88 :        ffffffff811ecbe6:       incl   %gs:0x7ee1de6b(%rip)        # aa58 <__preempt_count>
    7.94 :        ffffffff811ecbed:       incl   %gs:(%rax)
    0.30 :        ffffffff811ecbf0:       mov    0xa9855a(%rip),%edx        # ffffffff81c85150 <file_rwsem+0x10>
    0.00 :        ffffffff811ecbf6:       test   %edx,%edx
    0.00 :        ffffffff811ecbf8:       jne    ffffffff811ecdd1 <flock_lock_file+0x261>
    3.70 :        ffffffff811ecbfe:       decl   %gs:0x7ee1de53(%rip)        # aa58 <__preempt_count>
    0.00 :        ffffffff811ecc05:       je     ffffffff811eccec <flock_lock_file+0x17c>

But it's not much :/

Using DEFINE_STATIC_PERCPU_RWSEM(file_rwsem) would allow GCC to omit the
sem->refcount load entirely, but it's not smart enough to see that it can
(tested 4.9 and 5.1).
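
For reference, the static form in fs/locks.c would be along the lines of
the below (assuming the DEFINE_STATIC_PERCPU_RWSEM() macro mentioned above
is available there):

	/*
	 * Statically defined, so the per-cpu refcount has a link-time
	 * constant address; in principle the read side could then be a
	 * single %gs-relative inc/dec without first loading sem->refcount.
	 */
	DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);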

---
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -35,6 +35,8 @@ extern void __percpu_up_read(struct perc
 
 static inline void _percpu_down_read(struct percpu_rw_semaphore *sem)
 {
+	unsigned int __percpu *refcount = sem->refcount;
+
 	might_sleep();
 
 	preempt_disable();
@@ -47,7 +49,7 @@ static inline void _percpu_down_read(str
 	 * writer will see anything we did within this RCU-sched read-side
 	 * critical section.
 	 */
-	__this_cpu_inc(*sem->refcount);
+	__this_cpu_inc(*refcount);
 	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
 		__percpu_down_read(sem); /* Unconditional memory barrier. */
 	preempt_enable();
@@ -81,6 +83,8 @@ static inline bool percpu_down_read_tryl
 
 static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 {
+	unsigned int __percpu *refcount = sem->refcount;
+
 	/*
 	 * The barrier() in preempt_disable() prevents the compiler from
 	 * bleeding the critical section out.
@@ -90,7 +94,7 @@ static inline void percpu_up_read(struct
 	 * Same as in percpu_down_read().
 	 */
 	if (likely(rcu_sync_is_idle(&sem->rss)))
-		__this_cpu_dec(*sem->refcount);
+		__this_cpu_dec(*refcount);
 	else
 		__percpu_up_read(sem); /* Unconditional memory barrier. */
 	preempt_enable();

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-23 17:01       ` Oleg Nesterov
@ 2015-06-23 17:53         ` Peter Zijlstra
  2015-06-24 13:50           ` Oleg Nesterov
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 17:53 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 07:01:22PM +0200, Oleg Nesterov wrote:
> On 06/23, Peter Zijlstra wrote:
> >
> > On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > > > +
> > > > +	lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > > > +	_percpu_down_read(&cpu_hotplug.rwsem);
> > > >  }
> > >
> > > Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> > > just use percpu_down_read() ?
> > >
> > > Yes, percpu_down_read() is not recursive, like the normal down_read().
> > > But this does not matter because we rely on ->cpuhp_ref anyway?
> >
> > While we will not call the actual lock, lockdep will still get confused
> > by the inconsistent locking order observed.
> >
> > Change it and boot, you'll find lockdep output pretty quickly.
> 
> Hmm. and I simply can't understand why...

If in one callchain we do:

	get_online_cpus();
	lock(A);

in another we do:

	lock(A);
	get_online_cpus();

lockdep will complain about the inverted lock order, however this is not
a problem at all for recursive locks.

I think the example you get on boot is slightly more complicated, but
ends up like the above iirc.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 17:30             ` Paul E. McKenney
@ 2015-06-23 18:04               ` Peter Zijlstra
  2015-06-23 18:26                 ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 18:04 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> Good, you don't need this because you can check for dynticks later.
> You will need to check for offline CPUs.

get_online_cpus()
for_each_online_cpus() {
 ...
}

is what the new code does.

> > -	/*
> > -	 * Each pass through the following loop attempts to force a
> > -	 * context switch on each CPU.
> > -	 */
> > -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > -			     synchronize_sched_expedited_cpu_stop,
> > -			     NULL) == -EAGAIN) {
> > -		put_online_cpus();
> > -		atomic_long_inc(&rsp->expedited_tryfail);
> > -
> > -		/* Check to see if someone else did our work for us. */
> > -		s = atomic_long_read(&rsp->expedited_done);
> > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > -			/* ensure test happens before caller kfree */
> > -			smp_mb__before_atomic(); /* ^^^ */
> > -			atomic_long_inc(&rsp->expedited_workdone1);
> > -			free_cpumask_var(cm);
> > -			return;
> 
> Here you lose batching.  Yeah, I know that synchronize_sched_expedited()
> is -supposed- to be used sparingly, but it is not cool for the kernel
> to melt down just because some creative user found a way to heat up a
> code path.  Need a mutex_trylock() with a counter and checking for
> others having already done the needed work.

I really think you're making that expedited nonsense far too accessible.

But it was exactly that trylock I was trying to get rid of.

> And we still need to be able to drop back to synchronize_sched()
> (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> creative user and a long-running RCU-sched read-side critical section.

No, a long-running RCU-sched read-side is a bug and we should fix that;
it's called a preemption latency, and we don't like those.

> > +	for_each_online_cpu(cpu) {
> > +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > 
> > -		/* Recheck to see if someone else did our work for us. */
> > -		s = atomic_long_read(&rsp->expedited_done);
> > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > -			/* ensure test happens before caller kfree */
> > -			smp_mb__before_atomic(); /* ^^^ */
> > -			atomic_long_inc(&rsp->expedited_workdone2);
> > -			free_cpumask_var(cm);
> > -			return;
> > -		}
> > +		/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > +			continue;
> 
> Let's see...  This does work for idle CPUs and for nohz_full CPUs running
> in userspace.
> 
> It does not work for the current CPU, so the check needs an additional
> check against raw_smp_processor_id(), which is easy enough to add.

Right, realized after I sent it out, but it _should_ work for the
current cpu too. Just pointless doing it.

> There always has been a race window involving CPU hotplug.

There is no hotplug race, the entire thing has get_online_cpus() held
across it.

> > +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> 
> My thought was to use smp_call_function_single(), and to have the function
> called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> if so.

set_tsk_need_resched() is buggy and should not be used.

> This would result in a single pass through schedule() instead
> of stop_one_cpu()'s double context switch.  It would likely also require
> some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
> the need for.

_IF_ you're going to touch rcu_note_context_switch(), you might as well
use a completion, set it for the number of CPUs that need a resched,
spray resched-IPI and have rcu_note_context_switch() do a complete().
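
Roughly this (expedited_pending and expedited_completion are made-up fields,
and it glosses over making sure each CPU only decrements once):

	/* caller: nr_cpus CPUs (in mask) still need to schedule */
	atomic_set(&rsp->expedited_pending, nr_cpus);
	init_completion(&rsp->expedited_completion);
	for_each_cpu(cpu, mask)
		smp_send_reschedule(cpu);
	wait_for_completion(&rsp->expedited_completion);

	/* and in rcu_note_context_switch() on the targeted CPUs */
	if (atomic_dec_and_test(&rsp->expedited_pending))
		complete(&rsp->expedited_completion);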

But I would really like to avoid adding code to
rcu_note_context_switch(), because we run that on _every_ single context
switch.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 18:04               ` Peter Zijlstra
@ 2015-06-23 18:26                 ` Paul E. McKenney
  2015-06-23 19:05                   ` Paul E. McKenney
  2015-06-24  7:35                   ` Peter Zijlstra
  0 siblings, 2 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 18:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 08:04:11PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> > Good, you don't need this because you can check for dynticks later.
> > You will need to check for offline CPUs.
> 
> get_online_cpus()
> for_each_online_cpus() {
>  ...
> }
> 
> is what the new code does.

Ah, I missed that this was not deleted.

> > > -	/*
> > > -	 * Each pass through the following loop attempts to force a
> > > -	 * context switch on each CPU.
> > > -	 */
> > > -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > > -			     synchronize_sched_expedited_cpu_stop,
> > > -			     NULL) == -EAGAIN) {
> > > -		put_online_cpus();
> > > -		atomic_long_inc(&rsp->expedited_tryfail);
> > > -
> > > -		/* Check to see if someone else did our work for us. */
> > > -		s = atomic_long_read(&rsp->expedited_done);
> > > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > -			/* ensure test happens before caller kfree */
> > > -			smp_mb__before_atomic(); /* ^^^ */
> > > -			atomic_long_inc(&rsp->expedited_workdone1);
> > > -			free_cpumask_var(cm);
> > > -			return;
> > 
> > Here you lose batching.  Yeah, I know that synchronize_sched_expedited()
> > is -supposed- to be used sparingly, but it is not cool for the kernel
> > to melt down just because some creative user found a way to heat up a
> > code path.  Need a mutex_trylock() with a counter and checking for
> > others having already done the needed work.
> 
> I really think you're making that expedited nonsense far too accessible.

This has nothing to do with accessibility and everything to do with
robustness.  And with me not becoming the triage center for too many
non-RCU bugs.

> But it was exactly that trylock I was trying to get rid of.

OK.  Why, exactly?

> > And we still need to be able to drop back to synchronize_sched()
> > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > creative user and a long-running RCU-sched read-side critical section.
> 
> No, a long-running RCU-sched read-side is a bug and we should fix that;
> it's called a preemption latency, and we don't like those.

Yes, we should fix them.  No, they absolutely must not result in a
meltdown of some unrelated portion of the kernel (like RCU), particularly
if this situation occurs on some system running a production workload
that doesn't happen to care about preemption latency.

> > > +	for_each_online_cpu(cpu) {
> > > +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > > 
> > > -		/* Recheck to see if someone else did our work for us. */
> > > -		s = atomic_long_read(&rsp->expedited_done);
> > > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > -			/* ensure test happens before caller kfree */
> > > -			smp_mb__before_atomic(); /* ^^^ */
> > > -			atomic_long_inc(&rsp->expedited_workdone2);
> > > -			free_cpumask_var(cm);
> > > -			return;
> > > -		}
> > > +		/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > > +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > > +			continue;
> > 
> > Let's see...  This does work for idle CPUs and for nohz_full CPUs running
> > in userspace.
> > 
> > It does not work for the current CPU, so the check needs an additional
> > check against raw_smp_processor_id(), which is easy enough to add.
> 
> Right, realized after I sent it out, but it _should_ work for the
> current cpu too. Just pointless doing it.

OK, and easily fixed up in any case.

> > There always has been a race window involving CPU hotplug.
> 
> There is no hotplug race, the entire thing has get_online_cpus() held
> across it.

Which I would like to get rid of, but not urgent.

> > > +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > 
> > My thought was to use smp_call_function_single(), and to have the function
> > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > if so.
> 
> set_tsk_need_resched() is buggy and should not be used.

OK, what API is used for this purpose?

> > This would result in a single pass through schedule() instead
> > of stop_one_cpu()'s double context switch.  It would likely also require
> > some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
> > the need for.
> 
> _IF_ you're going to touch rcu_note_context_switch(), you might as well
> use a completion, set it for the number of CPUs that need a resched,
> spray resched-IPI and have rcu_note_context_switch() do a complete().
> 
> But I would really like to avoid adding code to
> rcu_note_context_switch(), because we run that on _every_ single context
> switch.

I believe that I can rework the current code to get the effect without
increased overhead, given that I have no intention of adding the
complete().  Adding the complete -would- add overhead to that fastpath.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 18:26                 ` Paul E. McKenney
@ 2015-06-23 19:05                   ` Paul E. McKenney
  2015-06-24  2:23                     ` Paul E. McKenney
  2015-06-24  7:35                   ` Peter Zijlstra
  1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-23 19:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> On Tue, Jun 23, 2015 at 08:04:11PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> > > Good, you don't need this because you can check for dynticks later.
> > > You will need to check for offline CPUs.
> > 
> > get_online_cpus()
> > for_each_online_cpus() {
> >  ...
> > }
> > 
> > is what the new code does.
> 
> Ah, I missed that this was not deleted.

But get_online_cpus() will re-introduce a deadlock.

							Thanx, Paul

> > > > -	/*
> > > > -	 * Each pass through the following loop attempts to force a
> > > > -	 * context switch on each CPU.
> > > > -	 */
> > > > -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > > > -			     synchronize_sched_expedited_cpu_stop,
> > > > -			     NULL) == -EAGAIN) {
> > > > -		put_online_cpus();
> > > > -		atomic_long_inc(&rsp->expedited_tryfail);
> > > > -
> > > > -		/* Check to see if someone else did our work for us. */
> > > > -		s = atomic_long_read(&rsp->expedited_done);
> > > > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > > -			/* ensure test happens before caller kfree */
> > > > -			smp_mb__before_atomic(); /* ^^^ */
> > > > -			atomic_long_inc(&rsp->expedited_workdone1);
> > > > -			free_cpumask_var(cm);
> > > > -			return;
> > > 
> > > Here you lose batching.  Yeah, I know that synchronize_sched_expedited()
> > > is -supposed- to be used sparingly, but it is not cool for the kernel
> > > to melt down just because some creative user found a way to heat up a
> > > code path.  Need a mutex_trylock() with a counter and checking for
> > > others having already done the needed work.
> > 
> > I really think you're making that expedited nonsense far too accessible.
> 
> This has nothing to do with accessibility and everything to do with
> robustness.  And with me not becoming the triage center for too many
> non-RCU bugs.
> 
> > But it was exactly that trylock I was trying to get rid of.
> 
> OK.  Why, exactly?
> 
> > > And we still need to be able to drop back to synchronize_sched()
> > > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > > creative user and a long-running RCU-sched read-side critical section.
> > 
> > No, a long-running RCU-sched read-side is a bug and we should fix that,
> > its called a preemption-latency, we don't like those.
> 
> Yes, we should fix them.  No, they absolutely must not result in a
> meltdown of some unrelated portion of the kernel (like RCU), particularly
> if this situation occurs on some system running a production workload
> that doesn't happen to care about preemption latency.
> 
> > > > +	for_each_online_cpu(cpu) {
> > > > +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > > > 
> > > > -		/* Recheck to see if someone else did our work for us. */
> > > > -		s = atomic_long_read(&rsp->expedited_done);
> > > > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > > > -			/* ensure test happens before caller kfree */
> > > > -			smp_mb__before_atomic(); /* ^^^ */
> > > > -			atomic_long_inc(&rsp->expedited_workdone2);
> > > > -			free_cpumask_var(cm);
> > > > -			return;
> > > > -		}
> > > > +		/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > > > +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > > > +			continue;
> > > 
> > > Let's see...  This does work for idle CPUs and for nohz_full CPUs running
> > > in userspace.
> > > 
> > > It does not work for the current CPU, so the check needs an additional
> > > check against raw_smp_processor_id(), which is easy enough to add.
> > 
> > Right, realized after I sent it out, but it _should_ work for the
> > current cpu too. Just pointless doing it.
> 
> OK, and easily fixed up in any case.
> 
> > > There always has been a race window involving CPU hotplug.
> > 
> > There is no hotplug race, the entire thing has get_online_cpus() held
> > across it.
> 
> Which I would like to get rid of, but not urgent.
> 
> > > > +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > > 
> > > My thought was to use smp_call_function_single(), and to have the function
> > > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > > if so.
> > 
> > set_tsk_need_resched() is buggy and should not be used.
> 
> OK, what API is used for this purpose?
> 
> > > This would result in a single pass through schedule() instead
> > > of stop_one_cpu()'s double context switch.  It would likely also require
> > > some rework of rcu_note_context_switch(), which stop_one_cpu() avoids
> > > the need for.
> > 
> > _IF_ you're going to touch rcu_note_context_switch(), you might as well
> > use a completion, set it for the number of CPUs that need a resched,
> > spray resched-IPI and have rcu_note_context_switch() do a complete().
> > 
> > But I would really like to avoid adding code to
> > rcu_note_context_switch(), because we run that on _every_ single context
> > switch.
> 
> I believe that I can rework the current code to get the effect without
> increased overhead, given that I have no intention of adding the
> complete().  Adding the complete -would- add overhead to that fastpath.
> 
> 							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23 17:50           ` Peter Zijlstra
@ 2015-06-23 19:36             ` Peter Zijlstra
  2015-06-24  8:46               ` Ingo Molnar
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-23 19:36 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, jlayton

On Tue, Jun 23, 2015 at 07:50:12PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 04:56:39PM +0200, Daniel Wagner wrote:
> > flock02
> >                              mean   variance      sigma        max        min
> >                     tip-1    11.8994     0.5874     0.7664    13.2022     8.6324
> >                     tip-2    11.7394     0.5252     0.7247    13.2540     9.7513
> >                     tip-3    11.8155     0.5288     0.7272    13.2700     9.9480
> >        tip+percpu-rswem-1    15.3601     0.8981     0.9477    16.8116    12.6910
> >        tip+percpu-rswem-2    15.2558     0.8442     0.9188    17.0199    12.9586
> >        tip+percpu-rswem-3    15.5297     0.6386     0.7991    17.4392    12.7992
> 
> I did indeed manage to get flock02 down to a usable level and found:

Aside from the flock_lock_file function moving up, we also get an
increase in _raw_spin_lock.

Before:

     5.17%     5.17%  flock02       [kernel.vmlinux]            [k] _raw_spin_lock
                 |
                 ---_raw_spin_lock
                    |          
                    |--99.75%-- flock_lock_file_wait
                    |          sys_flock
                    |          entry_SYSCALL_64_fastpath
                    |          flock
                     --0.25%-- [...]


After:

     7.20%     7.20%  flock02       [kernel.vmlinux]            [k] _raw_spin_lock
                 |
                 ---_raw_spin_lock
                    |          
                    |--52.23%-- flock_lock_file_wait
                    |          sys_flock
                    |          entry_SYSCALL_64_fastpath
                    |          flock
                    |          
                    |--25.92%-- flock_lock_file
                    |          flock_lock_file_wait
                    |          sys_flock
                    |          entry_SYSCALL_64_fastpath
                    |          flock
                    |          
                    |--21.42%-- locks_delete_lock_ctx
                    |          flock_lock_file
                    |          flock_lock_file_wait
                    |          sys_flock
                    |          entry_SYSCALL_64_fastpath
                    |          flock
                     --0.43%-- [...]


And it's not at all clear to me why this would be. It looks like
FILE_LOCK_DEFERRED is happening, but I've not yet figured out why that
would be.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 19:05                   ` Paul E. McKenney
@ 2015-06-24  2:23                     ` Paul E. McKenney
  2015-06-24  8:32                       ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24  2:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 12:05:06PM -0700, Paul E. McKenney wrote:
> On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > On Tue, Jun 23, 2015 at 08:04:11PM +0200, Peter Zijlstra wrote:
> > > On Tue, Jun 23, 2015 at 10:30:38AM -0700, Paul E. McKenney wrote:
> > > > Good, you don't need this because you can check for dynticks later.
> > > > You will need to check for offline CPUs.
> > > 
> > > get_online_cpus()
> > > for_each_online_cpus() {
> > >  ...
> > > }
> > > 
> > > is what the new code does.
> > 
> > Ah, I missed that this was not deleted.
> 
> But get_online_cpus() will re-introduce a deadlock.

And here is an untested patch that applies the gist of your approach,
the series of stop_one_cpu() calls, but without undoing the rest.
I forged your Signed-off-by, please let me know if that doesn't work
for you.  There are a number of simplifications that can be made, but
the basic approach gets a good testing first.

And I just noticed that I forgot to get rid of try_stop_cpus().
Well, there will probably be a test failure or two to handle, so
I can add that in the next version.  ;-)

							Thanx, Paul

------------------------------------------------------------------------

commit 1de96c34b39d840c5fe2689640345ed26f78b8f8
Author: Peter Zijlstra <peterz@infradead.org>
Date:   Tue Jun 23 19:03:45 2015 -0700

    rcu: Switch synchronize_sched_expedited() to stop_one_cpu()
    
    The synchronize_sched_expedited() currently invokes try_stop_cpus(),
    which schedules the stopper kthreads on each online non-idle CPU,
    and waits until all those kthreads are running before letting any
    of them stop.  This is disastrous for real-time workloads, which
    get hit with a preemption that is as long as the longest scheduling
    latency on any CPU, including any non-realtime housekeeping CPUs.
    This commit therefore switches to using stop_one_cpu() on each CPU
    in turn.  This avoids inflicting the worst-case scheduling latency
    on the worst-case CPU onto all other CPUs, and also simplifies the
    code a little bit.
    
    Follow-up commits will simplify the counter-snapshotting algorithm
    and convert a number of the counters that are now protected by the
    new ->expedited_mutex to non-atomic.
    
    Signed-off-by: Peter Zijlstra <peterz@infradead.org>
    [ paulmck: Kept stop_one_cpu(), dropped disabling of "guardrails". ]
    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78d0a87ff354..a30971474134 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
 	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
 	.orphan_donetail = &sname##_state.orphan_donelist, \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
 }
@@ -3357,8 +3358,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
 	int cpu;
 	long firstsnap, s, snap;
 	int trycount = 0;
@@ -3394,28 +3393,11 @@ void synchronize_sched_expedited(void)
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
-		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
-	}
-
 	/*
 	 * Each pass through the following loop attempts to force a
 	 * context switch on each CPU.
 	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
+	while (!mutex_trylock(&rsp->expedited_mutex)) {
 		put_online_cpus();
 		atomic_long_inc(&rsp->expedited_tryfail);
 
@@ -3425,7 +3407,6 @@ void synchronize_sched_expedited(void)
 			/* ensure test happens before caller kfree */
 			smp_mb__before_atomic(); /* ^^^ */
 			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
 			return;
 		}
 
@@ -3435,7 +3416,6 @@ void synchronize_sched_expedited(void)
 		} else {
 			wait_rcu_gp(call_rcu_sched);
 			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
 			return;
 		}
 
@@ -3445,7 +3425,6 @@ void synchronize_sched_expedited(void)
 			/* ensure test happens before caller kfree */
 			smp_mb__before_atomic(); /* ^^^ */
 			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
 			return;
 		}
 
@@ -3460,16 +3439,23 @@ void synchronize_sched_expedited(void)
 			/* CPU hotplug operation in flight, use normal GP. */
 			wait_rcu_gp(call_rcu_sched);
 			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
 			return;
 		}
 		snap = atomic_long_read(&rsp->expedited_start);
 		smp_mb(); /* ensure read is before try_stop_cpus(). */
 	}
-	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
-all_cpus_idle:
-	free_cpumask_var(cm);
+	/* Stop each CPU that is online, non-idle, and not us. */
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+		/* Skip our CPU and any idle CPUs. */
+		if (raw_smp_processor_id() == cpu ||
+		    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
+		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
+	}
+	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
 	/*
 	 * Everyone up to our most recent fetch is covered by our grace
@@ -3488,6 +3474,7 @@ all_cpus_idle:
 		}
 	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
 	atomic_long_inc(&rsp->expedited_done_exit);
+	mutex_unlock(&rsp->expedited_mutex);
 
 	put_online_cpus();
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de22d6d06bf9..b04ffa0dea58 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -478,6 +478,7 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
+	struct mutex  expedited_mutex;		/* Serializes expediting. */
 	atomic_long_t expedited_start;		/* Starting ticket. */
 	atomic_long_t expedited_done;		/* Done ticket. */
 	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 18:26                 ` Paul E. McKenney
  2015-06-23 19:05                   ` Paul E. McKenney
@ 2015-06-24  7:35                   ` Peter Zijlstra
  2015-06-24  8:42                     ` Ingo Molnar
  2015-06-24 14:50                     ` Paul E. McKenney
  1 sibling, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24  7:35 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > I really think you're making that expedited nonsense far too accessible.
> 
> This has nothing to do with accessibility and everything to do with
> robustness.  And with me not becoming the triage center for too many
> non-RCU bugs.

But by making it so you're rewarding abuse instead of flagging it :-(

> > > And we still need to be able to drop back to synchronize_sched()
> > > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > > creative user and a long-running RCU-sched read-side critical section.
> > 
> > No, a long-running RCU-sched read-side is a bug and we should fix that,
> > its called a preemption-latency, we don't like those.
> 
> Yes, we should fix them.  No, they absolutely must not result in a
> meltdown of some unrelated portion of the kernel (like RCU), particularly
> if this situation occurs on some system running a production workload
> that doesn't happen to care about preemption latency.

I still don't see a problem here though; the stop_one_cpu() invocation
for the CPU that's suffering its preemption latency will take longer,
but so what?

How does polling and dropping back to sync_rcu() generate better
behaviour than simply waiting for the completion?

> > > > +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > > 
> > > My thought was to use smp_call_function_single(), and to have the function
> > > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > > if so.
> > 
> > set_tsk_need_resched() is buggy and should not be used.
> 
> OK, what API is used for this purpose?

As a special exception, you (rcu) already have access to resched_cpu(); use
that -- if it doesn't do what you need it to, we'll fix it, you're the
only consumer of it.
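
For illustration only, a rough sketch of that smp_call_function_single()
variant with resched_cpu() standing in for set_tsk_need_resched() -- this is
not a patch from this thread, and it ignores the rcu_note_context_switch()
rework mentioned above:

	/*
	 * Rough sketch only; assumes it lives in kernel/rcu/tree.c so the
	 * per-CPU rcu_dynticks structure is visible.
	 */
	static void synchronize_sched_expedited_ipi(void *unused)
	{
		struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

		/* Recheck dyntick-idle state: an idle CPU is already quiescent. */
		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
			return;

		/* Ask for a single pass through schedule() on this CPU. */
		resched_cpu(smp_processor_id());
	}

	/* Caller side, in place of stop_one_cpu():
	 *
	 *	smp_call_function_single(cpu, synchronize_sched_expedited_ipi,
	 *				 NULL, 1);
	 */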

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  2:23                     ` Paul E. McKenney
@ 2015-06-24  8:32                       ` Peter Zijlstra
  2015-06-24  9:31                         ` Peter Zijlstra
  2015-06-24 15:01                         ` Paul E. McKenney
  0 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24  8:32 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 23, 2015 at 07:23:44PM -0700, Paul E. McKenney wrote:
> And here is an untested patch that applies the gist of your approach,
> the series of stop_one_cpu() calls, but without undoing the rest.
> I forged your Signed-off-by, please let me know if that doesn't work
> for you.  There are a number of simplifications that can be made, but
> the basic approach gets a good testing first.

So I really do not get the point of the trylock. It doesn't make sense.

Why would you poll the mutex instead of just wait for it and then
recheck if someone did the work while you were waiting for it?

What's wrong with the below?

---
 kernel/rcu/tree.c | 100 +++++++++++++++---------------------------------------
 kernel/rcu/tree.h |   1 +
 2 files changed, 29 insertions(+), 72 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index add042926a66..b39a5672a7ac 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
 	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
 	.orphan_donetail = &sname##_state.orphan_donelist, \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
 }
@@ -3304,12 +3305,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
-	int cpu;
-	long firstsnap, s, snap;
-	int trycount = 0;
 	struct rcu_state *rsp = &rcu_sched_state;
+	long s, snap;
+	int cpu;
 
 	/*
 	 * If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3330,6 @@ void synchronize_sched_expedited(void)
 	 * full memory barrier.
 	 */
 	snap = atomic_long_inc_return(&rsp->expedited_start);
-	firstsnap = snap;
 	if (!try_get_online_cpus()) {
 		/* CPU hotplug operation in flight, fall back to normal GP. */
 		wait_rcu_gp(call_rcu_sched);
@@ -3341,83 +3338,40 @@ void synchronize_sched_expedited(void)
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
-		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
-	}
-
 	/*
 	 * Each pass through the following loop attempts to force a
 	 * context switch on each CPU.
 	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		atomic_long_inc(&rsp->expedited_tryfail);
+	mutex_lock(&rsp->expedited_mutex);
 
-		/* Check to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
-			return;
-		}
+	/*
+	 * Check to see if someone else did our work for us, while we were
+	 * waiting for the mutex.
+	 */
+	s = atomic_long_read(&rsp->expedited_done);
+	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+		/* ensure test happens before caller kfree */
+		smp_mb__before_atomic(); /* ^^^ */
+		atomic_long_inc(&rsp->expedited_workdone1);
+		goto unlock;
+	}
 
-		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10) {
-			udelay(trycount * num_online_cpus());
-		} else {
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
+	/* Stop each CPU that is online, non-idle, and not us. */
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
-		/* Recheck to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
-			return;
-		}
+		/* Skip our CPU, */
+		if (raw_smp_processor_id() == cpu)
+			continue;
 
-		/*
-		 * Refetching sync_sched_expedited_started allows later
-		 * callers to piggyback on our grace period.  We retry
-		 * after they started, so our grace period works for them,
-		 * and they started after our first try, so their grace
-		 * period works for us.
-		 */
-		if (!try_get_online_cpus()) {
-			/* CPU hotplug operation in flight, use normal GP. */
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-		snap = atomic_long_read(&rsp->expedited_start);
-		smp_mb(); /* ensure read is before try_stop_cpus(). */
+		/* and any idle CPUs. */
+		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
+
+		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
 	}
 	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
-all_cpus_idle:
-	free_cpumask_var(cm);
-
 	/*
 	 * Everyone up to our most recent fetch is covered by our grace
 	 * period.  Update the counter, but only if our work is still
@@ -3435,6 +3389,8 @@ void synchronize_sched_expedited(void)
 		}
 	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
 	atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+	mutex_unlock(&rsp->expedited_mutex);
 
 	put_online_cpus();
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..10348c081e8e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
+	struct mutex  expedited_mutex;		/* Serializes expediting. */
 	atomic_long_t expedited_start;		/* Starting ticket. */
 	atomic_long_t expedited_done;		/* Done ticket. */
 	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */

^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  7:35                   ` Peter Zijlstra
@ 2015-06-24  8:42                     ` Ingo Molnar
  2015-06-24 13:39                       ` Paul E. McKenney
  2015-06-24 14:50                     ` Paul E. McKenney
  1 sibling, 1 reply; 106+ messages in thread
From: Ingo Molnar @ 2015-06-24  8:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Oleg Nesterov, tj, mingo, linux-kernel,
	der.herr, dave, riel, viro, torvalds


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > >
> > > I really think you're making that expedited nonsense far too accessible.
> > 
> > This has nothing to do with accessibility and everything to do with 
> > robustness.  And with me not becoming the triage center for too many non-RCU 
> > bugs.
> 
> But by making it so you're rewarding abuse instead of flagging it :-(

Btw., being a 'triage center' is the bane of APIs that are overly successful,
so we should take that burden with pride! :-)

Lockdep (and the scheduler APIs as well) frequently got into such situations as 
well, and we mostly solved it by being more informative with debug splats.

I don't think a kernel API should (ever!) stay artificially silent, just for fear 
of flagging too many problems in other code.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-23 19:36             ` Peter Zijlstra
@ 2015-06-24  8:46               ` Ingo Molnar
  2015-06-24  9:01                 ` Peter Zijlstra
  2015-06-24  9:18                 ` Daniel Wagner
  0 siblings, 2 replies; 106+ messages in thread
From: Ingo Molnar @ 2015-06-24  8:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Daniel Wagner, oleg, paulmck, tj, mingo, linux-kernel, der.herr,
	dave, riel, viro, torvalds, jlayton


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jun 23, 2015 at 07:50:12PM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 23, 2015 at 04:56:39PM +0200, Daniel Wagner wrote:
> > > flock02
> > >                              mean   variance      sigma        max        min
> > >                     tip-1    11.8994     0.5874     0.7664    13.2022     8.6324
> > >                     tip-2    11.7394     0.5252     0.7247    13.2540     9.7513
> > >                     tip-3    11.8155     0.5288     0.7272    13.2700     9.9480
> > >        tip+percpu-rswem-1    15.3601     0.8981     0.9477    16.8116    12.6910
> > >        tip+percpu-rswem-2    15.2558     0.8442     0.9188    17.0199    12.9586
> > >        tip+percpu-rswem-3    15.5297     0.6386     0.7991    17.4392    12.7992
> > 
> > I did indeed manage to get flock02 down to a usable level and found:
> 
> Aside from the flock_lock_file function moving up, we also get an
> increase in _raw_spin_lock.
> 
> Before:
> 
>      5.17%     5.17%  flock02       [kernel.vmlinux]            [k] _raw_spin_lock
>                  |
>                  ---_raw_spin_lock
>                     |          
>                     |--99.75%-- flock_lock_file_wait
>                     |          sys_flock
>                     |          entry_SYSCALL_64_fastpath
>                     |          flock
>                      --0.25%-- [...]
> 
> 
> After:
> 
>      7.20%     7.20%  flock02       [kernel.vmlinux]            [k] _raw_spin_lock
>                  |
>                  ---_raw_spin_lock
>                     |          
>                     |--52.23%-- flock_lock_file_wait
>                     |          sys_flock
>                     |          entry_SYSCALL_64_fastpath
>                     |          flock
>                     |          
>                     |--25.92%-- flock_lock_file
>                     |          flock_lock_file_wait
>                     |          sys_flock
>                     |          entry_SYSCALL_64_fastpath
>                     |          flock
>                     |          
>                     |--21.42%-- locks_delete_lock_ctx
>                     |          flock_lock_file
>                     |          flock_lock_file_wait
>                     |          sys_flock
>                     |          entry_SYSCALL_64_fastpath
>                     |          flock
>                      --0.43%-- [...]
> 
> 
> And it's not at all clear to me why this would be. It looks like
> FILE_LOCK_DEFERRED is happening, but I've not yet figured out why that
> would be.

So I'd suggest first comparing preemption behavior: does the workload 
context-switch heavily, is the context-switch rate exactly the same, and are 
the points of preemption the same between the two kernels?

[ Such high variance is often caused by (dynamically) unstable load balancing and 
  the workload never finding a good equilibrium. Any observable locking overhead 
  is usually just a second order concern or a symptom. Assuming the workload 
  context switches heavily. ]

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-24  8:46               ` Ingo Molnar
@ 2015-06-24  9:01                 ` Peter Zijlstra
  2015-06-24  9:18                 ` Daniel Wagner
  1 sibling, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24  9:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Daniel Wagner, oleg, paulmck, tj, mingo, linux-kernel, der.herr,
	dave, riel, viro, torvalds, jlayton

On Wed, Jun 24, 2015 at 10:46:48AM +0200, Ingo Molnar wrote:
> > > > flock02
> > > >                              mean   variance      sigma        max        min
> > > >                     tip-1    11.8994     0.5874     0.7664    13.2022     8.6324
> > > >                     tip-2    11.7394     0.5252     0.7247    13.2540     9.7513
> > > >                     tip-3    11.8155     0.5288     0.7272    13.2700     9.9480
> > > >        tip+percpu-rswem-1    15.3601     0.8981     0.9477    16.8116    12.6910
> > > >        tip+percpu-rswem-2    15.2558     0.8442     0.9188    17.0199    12.9586
> > > >        tip+percpu-rswem-3    15.5297     0.6386     0.7991    17.4392    12.7992

> [ Such high variance is often caused by (dynamically) unstable load balancing and 
>   the workload never finding a good equilibrium. Any observable locking overhead 
>   is usually just a second order concern or a symptom. Assuming the workload 
>   context switches heavily. ]

flock02 is a relatively stable benchmark -- unlike some of the others
where the variance is orders of magnitude higher than the avg.

But yes, I'll go poke at it more. I just need to hunt down unrelated
fail before continuing with this.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-24  8:46               ` Ingo Molnar
  2015-06-24  9:01                 ` Peter Zijlstra
@ 2015-06-24  9:18                 ` Daniel Wagner
  2015-07-01  5:57                   ` Daniel Wagner
  1 sibling, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-06-24  9:18 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, jlayton

On 06/24/2015 10:46 AM, Ingo Molnar wrote:
> So I'd suggest first comparing preemption behavior: does the workload 
> context-switch heavily, is the context-switch rate exactly the same, and are 
> the points of preemption the same between the two kernels?

If I read this correctly, the answer is yes.

First the 'stable' flock02 test:

perf stat --repeat 5  --pre 'rm -rf /tmp/a' ~/src/lockperf/flock02 -n 128 -l 64 /tmp/a
0.008793148
0.008784990
0.008587804
0.008693641
0.008776946

 Performance counter stats for '/home/wagi/src/lockperf/flock02 -n 128 -l 64 /tmp/a' (5 runs):

         76.509634      task-clock (msec)         #    3.312 CPUs utilized            ( +-  0.67% )
                 2      context-switches          #    0.029 K/sec                    ( +- 26.50% )
               128      cpu-migrations            #    0.002 M/sec                    ( +-  0.31% )
             5,295      page-faults               #    0.069 M/sec                    ( +-  0.49% )
        89,944,154      cycles                    #    1.176 GHz                      ( +-  0.66% )
        58,670,259      stalled-cycles-frontend   #   65.23% frontend cycles idle     ( +-  0.88% )
                 0      stalled-cycles-backend    #    0.00% backend  cycles idle   
        76,991,414      instructions              #    0.86  insns per cycle        
                                                  #    0.76  stalled cycles per insn  ( +-  0.19% )
        15,239,720      branches                  #  199.187 M/sec                    ( +-  0.20% )
           103,418      branch-misses             #    0.68% of all branches          ( +-  6.68% )

       0.023102895 seconds time elapsed                                          ( +-  1.09% )


And here posix01 which shows high variance:

perf stat --repeat 5  --pre 'rm -rf /tmp/a' ~/src/lockperf/posix01 -n 128 -l 64 /tmp/a
0.006020402
32.510838421
55.516466069
46.794470223
5.097701438

 Performance counter stats for '/home/wagi/src/lockperf/posix01 -n 128 -l 64 /tmp/a' (5 runs):

       4177.932106      task-clock (msec)         #   14.162 CPUs utilized            ( +- 34.59% )
            70,646      context-switches          #    0.017 M/sec                    ( +- 31.56% )
            28,009      cpu-migrations            #    0.007 M/sec                    ( +- 33.55% )
             4,834      page-faults               #    0.001 M/sec                    ( +-  0.98% )
     7,291,160,968      cycles                    #    1.745 GHz                      ( +- 32.17% )
     5,216,204,262      stalled-cycles-frontend   #   71.54% frontend cycles idle     ( +- 32.13% )
                 0      stalled-cycles-backend    #    0.00% backend  cycles idle   
     1,901,289,780      instructions              #    0.26  insns per cycle        
                                                  #    2.74  stalled cycles per insn  ( +- 30.80% )
       440,415,914      branches                  #  105.415 M/sec                    ( +- 31.06% )
         1,347,021      branch-misses             #    0.31% of all branches          ( +- 29.17% )

       0.295016987 seconds time elapsed                                          ( +- 32.01% )


BTW, thanks for the perf stat tip. Really handy!

cheers,
daniel

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  8:32                       ` Peter Zijlstra
@ 2015-06-24  9:31                         ` Peter Zijlstra
  2015-06-24 13:48                           ` Paul E. McKenney
  2015-06-24 15:01                         ` Paul E. McKenney
  1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24  9:31 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> +	s = atomic_long_read(&rsp->expedited_done);
> +	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> +		/* ensure test happens before caller kfree */
> +		smp_mb__before_atomic(); /* ^^^ */

FWIW isn't that guaranteed by the control dep?

> +		atomic_long_inc(&rsp->expedited_workdone1);
> +		goto unlock;
> +	}

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  8:42                     ` Ingo Molnar
@ 2015-06-24 13:39                       ` Paul E. McKenney
  2015-06-24 13:43                         ` Ingo Molnar
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 13:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Oleg Nesterov, tj, mingo, linux-kernel, der.herr,
	dave, riel, viro, torvalds

On Wed, Jun 24, 2015 at 10:42:48AM +0200, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > >
> > > > I really think you're making that expedited nonsense far too accessible.
> > > 
> > > This has nothing to do with accessibility and everything to do with 
> > > robustness.  And with me not becoming the triage center for too many non-RCU 
> > > bugs.
> > 
> > But by making it so you're rewarding abuse instead of flagging it :-(
> 
> Btw., being a 'triage center' is the bane of APIs that are overly successful,
> so we should take that burden with pride! :-)

I will gladly accept that compliment.

And the burden.  But, lazy as I am, I intend to automate it.  ;-)

> Lockdep (and the scheduler APIs as well) frequently got into such situations as 
> well, and we mostly solved it by being more informative with debug splats.
> 
> I don't think a kernel API should (ever!) stay artificially silent, just for fear 
> of flagging too many problems in other code.

I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
RCU checks, and the object-debug-based checks for double call_rcu().
That said, in all of these cases, including your example of lockdep,
the diagnostic is a debug splat rather than a mutex-contention meltdown.
And it is the mutex-contention meltdown that I will continue making
synchronize_sched_expedited() avoid.

But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
IPIs, it would not be hard to splat if a given CPU didn't come back fast
enough.  The latency tracer would of course provide better information,
but synchronize_sched_expedited() could do a coarse-grained job with
less setup required.

My first guess for the timeout would be something like 500 milliseconds.
Thoughts?

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 13:39                       ` Paul E. McKenney
@ 2015-06-24 13:43                         ` Ingo Molnar
  2015-06-24 14:03                           ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Ingo Molnar @ 2015-06-24 13:43 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Oleg Nesterov, tj, mingo, linux-kernel, der.herr,
	dave, riel, viro, torvalds


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> On Wed, Jun 24, 2015 at 10:42:48AM +0200, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > > >
> > > > > I really think you're making that expedited nonsense far too accessible.
> > > > 
> > > > This has nothing to do with accessibility and everything to do with 
> > > > robustness.  And with me not becoming the triage center for too many non-RCU 
> > > > bugs.
> > > 
> > > But by making it so you're rewarding abuse instead of flagging it :-(
> > 
> > Btw., being a 'triage center' is the bane of APIs that are overly successful,
> > so we should take that burden with pride! :-)
> 
> I will gladly accept that compliment.
> 
> And the burden.  But, lazy as I am, I intend to automate it.  ;-)

lol :)

> > Lockdep (and the scheduler APIs as well) frequently got into such situations as 
> > well, and we mostly solved it by being more informative with debug splats.
> > 
> > I don't think a kernel API should (ever!) stay artificially silent, just for fear 
> > of flagging too many problems in other code.
> 
> I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
> RCU checks, and the object-debug-based checks for double call_rcu().
> That said, in all of these cases, including your example of lockdep,
> the diagnostic is a debug splat rather than a mutex-contention meltdown.
> And it is the mutex-contention meltdown that I will continue making
> synchronize_sched_expedited() avoid.
> 
> But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
> IPIs, it would not be hard to splat if a given CPU didn't come back fast
> enough.  The latency tracer would of course provide better information,
> but synchronize_sched_expedited() could do a coarse-grained job with
> less setup required.
> 
> My first guess for the timeout would be something like 500 milliseconds. 
> Thoughts?

So I'd start with 5,000 milliseconds and observe the results first ...

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  9:31                         ` Peter Zijlstra
@ 2015-06-24 13:48                           ` Paul E. McKenney
  0 siblings, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 13:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 11:31:02AM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> > +	s = atomic_long_read(&rsp->expedited_done);
> > +	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> > +		/* ensure test happens before caller kfree */
> > +		smp_mb__before_atomic(); /* ^^^ */
> 
> FWIW isn't that guaranteed by the control dep?

For trailing stores, yes, but not for trailing loads.  Of course,
trailing loads don't matter in the pure kfree case, but do matter in
other situations.  And this isn't anywhere near a fastpath, so I
am not all that worried about the extra memory barrier.
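
A quick illustration of the distinction (kernel-style sketch, not from any
patch in this thread):

	/*
	 * A control dependency orders the load of *flag against the later
	 * store to *a, because CPUs do not speculate stores.  It does NOT
	 * order that load against the later load of *b, which the CPU is
	 * free to speculate -- hence the explicit smp_mb__before_atomic()
	 * in the patch above.
	 */
	void control_dep_example(int *flag, int *a, int *b, int *r)
	{
		if (READ_ONCE(*flag)) {
			WRITE_ONCE(*a, 1);	/* ordered after the *flag load */
			*r = READ_ONCE(*b);	/* may be speculated before it */
		}
	}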

							Thanx, Paul

> > +		atomic_long_inc(&rsp->expedited_workdone1);
> > +		goto unlock;
> > +	}
> 


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-23 17:53         ` Peter Zijlstra
@ 2015-06-24 13:50           ` Oleg Nesterov
  2015-06-24 14:13             ` Peter Zijlstra
  2015-06-28 23:56             ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
  0 siblings, 2 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-24 13:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/23, Peter Zijlstra wrote:
>
> On Tue, Jun 23, 2015 at 07:01:22PM +0200, Oleg Nesterov wrote:
> > On 06/23, Peter Zijlstra wrote:
> > >
> > > On Tue, Jun 23, 2015 at 12:57:39AM +0200, Oleg Nesterov wrote:
> > > > > +
> > > > > +	lock_map_acquire_read(&cpu_hotplug.rwsem.rw_sem.dep_map);
> > > > > +	_percpu_down_read(&cpu_hotplug.rwsem);
> > > > >  }
> > > >
> > > > Confused... Why do we need _percpu_down_read()? Can't get_online_cpus()
> > > > just use percpu_down_read() ?
> > > >
> > > > Yes, percpu_down_read() is not recursive, like the normal down_read().
> > > > But this does not matter because we rely on ->cpuhp_ref anyway?
> > >
> > > While we will not call the actual lock, lockdep will still get confused
> > > by the inconsistent locking order observed.
> > >
> > > Change it and boot, you'll find lockdep output pretty quickly.
> >
> > Hmm. and I simply can't understand why...
>
> If in one callchain we do:
>
> 	get_online_cpus();
> 	lock(A);
>
> in another we do:
>
> 	lock(A);
> 	get_online_cpus();
>
> lockdep will complain about the inverted lock order, however this is not
> a problem at all for recursive locks.

Ah, but in this case lockdep is right. This is deadlockable because
with the new implementation percpu_down_write() blocks the new readers.
So this change just hides the valid warning.

Just suppose that the 3rd CPU does percpu_down_write()->down_write()
right after the 2nd CPU (above) takes lock(A).

I have to admit that I didn't realize that the code above is currently
correct... but it is.
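
Roughly, the scenario is (an illustrative sketch only -- the lock and function
names are made up to mirror your example, assuming the writer-blocks-new-readers
semantics of this series):

	#include <linux/cpu.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(A);

	void first_callchain(void)		/* CPU 1 */
	{
		get_online_cpus();		/* read section entered first */
		mutex_lock(&A);			/* blocks: CPU 2 holds A */
		mutex_unlock(&A);
		put_online_cpus();
	}

	void second_callchain(void)		/* CPU 2 */
	{
		mutex_lock(&A);
		get_online_cpus();		/* blocks: CPU 3's pending writer
						 * stops new readers */
		put_online_cpus();
		mutex_unlock(&A);
	}

	void hotplug_writer(void)		/* CPU 3 */
	{
		cpu_hotplug_begin();		/* percpu_down_write(): waits for
						 * CPU 1's read section, which
						 * never ends -> deadlock */
		cpu_hotplug_done();
	}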

So we need percpu_down_write_dont_block_readers(). I already thought
about this before; I'll try to make the patch tomorrow on top of your
changes.

This means that we do not need task_struct->cpuhp_ref, but we can't
avoid the livelock we currently have: cpu_hotplug_begin() can never succeed
if the new readers come fast enough.

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 13:43                         ` Ingo Molnar
@ 2015-06-24 14:03                           ` Paul E. McKenney
  0 siblings, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 14:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Oleg Nesterov, tj, mingo, linux-kernel, der.herr,
	dave, riel, viro, torvalds

On Wed, Jun 24, 2015 at 03:43:37PM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Wed, Jun 24, 2015 at 10:42:48AM +0200, Ingo Molnar wrote:
> > > 
> > > * Peter Zijlstra <peterz@infradead.org> wrote:
> > > 
> > > > On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > > > >
> > > > > > I really think you're making that expedited nonsense far too accessible.
> > > > > 
> > > > > This has nothing to do with accessibility and everything to do with 
> > > > > robustness.  And with me not becoming the triage center for too many non-RCU 
> > > > > bugs.
> > > > 
> > > > But by making it so you're rewarding abuse instead of flagging it :-(
> > > 
> > > Btw., being a 'triage center' is the bane of APIs that are overly successful,
> > > so we should take that burden with pride! :-)
> > 
> > I will gladly accept that compliment.
> > 
> > And the burden.  But, lazy as I am, I intend to automate it.  ;-)
> 
> lol :)
> 
> > > Lockdep (and the scheduler APIs as well) frequently got into such situations as 
> > > well, and we mostly solved it by being more informative with debug splats.
> > > 
> > > I don't think a kernel API should (ever!) stay artificially silent, just for fear 
> > > of flagging too many problems in other code.
> > 
> > I agree, as attested by RCU CPU stall warnings, lockdep-RCU, sparse-based
> > RCU checks, and the object-debug-based checks for double call_rcu().
> > That said, in all of these cases, including your example of lockdep,
> > the diagnostic is a debug splat rather than a mutex-contention meltdown.
> > And it is the mutex-contention meltdown that I will continue making
> > synchronize_sched_expedited() avoid.
> > 
> > But given the change from bulk try_stop_cpus() to either stop_one_cpu() or
> > IPIs, it would not be hard to splat if a given CPU didn't come back fast
> > enough.  The latency tracer would of course provide better information,
> > but synchronize_sched_expedited() could do a coarse-grained job with
> > less setup required.
> > 
> > My first guess for the timeout would be something like 500 milliseconds. 
> > Thoughts?
> 
> So I'd start with 5,000 milliseconds and observe the results first ...

Sounds good, especially when I recall that the default RCU CPU stall
warning timeout is 21,000 milliseconds...  ;-)
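
For concreteness, such a coarse-grained check might look something like this
(an untested sketch only, not a posted patch; the helper name is invented and
the 5000 ms value just follows the discussion above; assumes it sits next to
synchronize_sched_expedited_cpu_stop() in kernel/rcu/tree.c):

	static void synchronize_sched_expedited_stop_cpu(int cpu)
	{
		unsigned long j = jiffies;

		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);

		/* Splat if the victim CPU took suspiciously long to stop. */
		if (time_after(jiffies, j + msecs_to_jiffies(5000)))
			pr_warn("%s: CPU %d took %u ms to reach a quiescent state\n",
				__func__, cpu, jiffies_to_msecs(jiffies - j));
	}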

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-24 13:50           ` Oleg Nesterov
@ 2015-06-24 14:13             ` Peter Zijlstra
  2015-06-24 15:12               ` Oleg Nesterov
  2015-06-28 23:56             ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
  1 sibling, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 14:13 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Wed, Jun 24, 2015 at 03:50:49PM +0200, Oleg Nesterov wrote:
> On 06/23, Peter Zijlstra wrote:
> > If in one callchain we do:
> >
> > 	get_online_cpus();
> > 	lock(A);
> >
> > in another we do:
> >
> > 	lock(A);
> > 	get_online_cpus();
> >
> > lockdep will complain about the inverted lock order, however this is not
> > a problem at all for recursive locks.
> 
> Ah, but in this case lockdep is right. This is deadlockable because
> with the new implementation percpu_down_write() blocks the new readers.
> So this change just hides the valid warning.
> 
> Just suppose that the 3rd CPU does percpu_down_write()->down_write()
> right after the 2nd CPU (above) takes lock(A).
> 
> I have to admit that I didn't realize that the code above is currently
> correct... but it is.
> 
> So we need percpu_down_write_dont_block_readers(). I already thought
> about this before, I'll try to make the patch tomorrow on top of your
> changes.
> 
> This means that we do not need task_struct->cpuhp_ref, but we can't
> avoid livelock we currently have: cpu_hotplug_begin() can never succeed
> if the new readers come fast enough.

I'm confused.. why isn't the read-in-read recursion good enough?

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  7:35                   ` Peter Zijlstra
  2015-06-24  8:42                     ` Ingo Molnar
@ 2015-06-24 14:50                     ` Paul E. McKenney
  2015-06-24 15:01                       ` Peter Zijlstra
  1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 14:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 09:35:03AM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 11:26:26AM -0700, Paul E. McKenney wrote:
> > > I really think you're making that expedited nonsense far too accessible.
> > 
> > This has nothing to do with accessibility and everything to do with
> > robustness.  And with me not becoming the triage center for too many
> > non-RCU bugs.
> 
> But by making it so you're rewarding abuse instead of flagging it :-(

As discussed in the thread with Ingo, I will do both.

Alternatively, RCU -is- abuse.  Anyone who tries to tell you
otherwise simply lacks proper respect for and adoration of traditional
synchronization mechanisms.  ;-)

> > > > And we still need to be able to drop back to synchronize_sched()
> > > > (AKA wait_rcu_gp(call_rcu_sched) in this case) in case we have both a
> > > > creative user and a long-running RCU-sched read-side critical section.
> > > 
> > > No, a long-running RCU-sched read-side is a bug and we should fix that,
> > > its called a preemption-latency, we don't like those.
> > 
> > Yes, we should fix them.  No, they absolutely must not result in a
> > meltdown of some unrelated portion of the kernel (like RCU), particularly
> > if this situation occurs on some system running a production workload
> > that doesn't happen to care about preemption latency.
> 
> I still don't see a problem here though; the stop_one_cpu() invocation
> for the CPU that's suffering its preemption latency will take longer,
> but so what?
> 
> How does polling and dropping back to sync_rcu() generate better
> behaviour than simply waiting for the completion?

Because if there is too much delay, synchronize_rcu() is no slower
than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
more efficient.

That said, it appears that I have not given any particular thought to the
polling code since about 2008 or so, and it could use quite an upgrade...

> > > > > +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
> > > > 
> > > > My thought was to use smp_call_function_single(), and to have the function
> > > > called recheck dyntick-idle state, avoiding doing a set_tsk_need_resched()
> > > > if so.
> > > 
> > > set_tsk_need_resched() is buggy and should not be used.
> > 
> > OK, what API is used for this purpose?
> 
> As a special exception, you (rcu) already have access to resched_cpu(); use
> that -- if it doesn't do what you need it to, we'll fix it, you're the
> only consumer of it.

Color me slow and stupid!

And it looks like resched_cpu() does just fine on the local CPU, so it
should be just fine as is.  Thank you for the reminder.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24  8:32                       ` Peter Zijlstra
  2015-06-24  9:31                         ` Peter Zijlstra
@ 2015-06-24 15:01                         ` Paul E. McKenney
  2015-06-24 15:34                           ` Peter Zijlstra
  1 sibling, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 15:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> On Tue, Jun 23, 2015 at 07:23:44PM -0700, Paul E. McKenney wrote:
> > And here is an untested patch that applies the gist of your approach,
> > the series of stop_one_cpu() calls, but without undoing the rest.
> > I forged your Signed-off-by, please let me know if that doesn't work
> > for you.  There are a number of simplifications that can be made, but
> > the basic approach gets a good testing first.
> 
> So I really do not get the point of the trylock. It doesn't make sense.
> 
> Why would you poll the mutex instead of just wait for it and then
> recheck if someone did the work while you were waiting for it?
> 
> What's wrong with the below?

Various delays can cause tasks to queue on the mutex out of order.
This can cause a given task not only to have been delayed between
sampling ->expedited_start and the mutex_lock(), but also to be further delayed
because tasks granted the mutex earlier will wait on grace periods that
the delayed task doesn't need to wait on.  These extra waits are simply
not consistent with the "expedited" in synchronize_sched_expedited().

That said, my polling code can most definitely be improved -- as I
mentioned earlier, it is from 2008 or so, back when a lot of things
worked differently.  My first thought is to apply something sort of
like force_quiescent_state()'s funnel locking, but with unconditional
mutex_lock() instead of the raw_spin_trylock().  That way, when a given
task is awakened, there is a high probability that a grace period it
can use has already elapsed, allowing it to break out of the loop and go
on its way.  This can probably be further improved, but it is a decent
place for me to start.

							Thanx, Paul

> ---
>  kernel/rcu/tree.c | 100 +++++++++++++++---------------------------------------
>  kernel/rcu/tree.h |   1 +
>  2 files changed, 29 insertions(+), 72 deletions(-)
> 
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index add042926a66..b39a5672a7ac 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
>  	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
>  	.orphan_donetail = &sname##_state.orphan_donelist, \
>  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
> +	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
>  	.name = RCU_STATE_NAME(sname), \
>  	.abbr = sabbr, \
>  }
> @@ -3304,12 +3305,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
>   */
>  void synchronize_sched_expedited(void)
>  {
> -	cpumask_var_t cm;
> -	bool cma = false;
> -	int cpu;
> -	long firstsnap, s, snap;
> -	int trycount = 0;
>  	struct rcu_state *rsp = &rcu_sched_state;
> +	long s, snap;
> +	int cpu;
> 
>  	/*
>  	 * If we are in danger of counter wrap, just do synchronize_sched().
> @@ -3332,7 +3330,6 @@ void synchronize_sched_expedited(void)
>  	 * full memory barrier.
>  	 */
>  	snap = atomic_long_inc_return(&rsp->expedited_start);
> -	firstsnap = snap;
>  	if (!try_get_online_cpus()) {
>  		/* CPU hotplug operation in flight, fall back to normal GP. */
>  		wait_rcu_gp(call_rcu_sched);
> @@ -3341,83 +3338,40 @@ void synchronize_sched_expedited(void)
>  	}
>  	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
> 
> -	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> -	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> -	if (cma) {
> -		cpumask_copy(cm, cpu_online_mask);
> -		cpumask_clear_cpu(raw_smp_processor_id(), cm);
> -		for_each_cpu(cpu, cm) {
> -			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> -
> -			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> -				cpumask_clear_cpu(cpu, cm);
> -		}
> -		if (cpumask_weight(cm) == 0)
> -			goto all_cpus_idle;
> -	}
> -
>  	/*
>  	 * Each pass through the following loop attempts to force a
>  	 * context switch on each CPU.
>  	 */
> -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> -			     synchronize_sched_expedited_cpu_stop,
> -			     NULL) == -EAGAIN) {
> -		put_online_cpus();
> -		atomic_long_inc(&rsp->expedited_tryfail);
> +	mutex_lock(&rsp->expedited_mutex);
> 
> -		/* Check to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone1);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +	/*
> +	 * Check to see if someone else did our work for us, while we were
> +	 * waiting for the mutex.
> +	 */
> +	s = atomic_long_read(&rsp->expedited_done);
> +	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> +		/* ensure test happens before caller kfree */
> +		smp_mb__before_atomic(); /* ^^^ */
> +		atomic_long_inc(&rsp->expedited_workdone1);
> +		goto unlock;
> +	}
> 
> -		/* No joy, try again later.  Or just synchronize_sched(). */
> -		if (trycount++ < 10) {
> -			udelay(trycount * num_online_cpus());
> -		} else {
> -			wait_rcu_gp(call_rcu_sched);
> -			atomic_long_inc(&rsp->expedited_normal);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +	/* Stop each CPU that is online, non-idle, and not us. */
> +	for_each_online_cpu(cpu) {
> +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> 
> -		/* Recheck to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone2);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +		/* Skip our CPU, */
> +		if (raw_smp_processor_id() == cpu)
> +			continue;
> 
> -		/*
> -		 * Refetching sync_sched_expedited_started allows later
> -		 * callers to piggyback on our grace period.  We retry
> -		 * after they started, so our grace period works for them,
> -		 * and they started after our first try, so their grace
> -		 * period works for us.
> -		 */
> -		if (!try_get_online_cpus()) {
> -			/* CPU hotplug operation in flight, use normal GP. */
> -			wait_rcu_gp(call_rcu_sched);
> -			atomic_long_inc(&rsp->expedited_normal);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> -		snap = atomic_long_read(&rsp->expedited_start);
> -		smp_mb(); /* ensure read is before try_stop_cpus(). */
> +		/* and any idle CPUs. */
> +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> +			continue;
> +
> +		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
>  	}
>  	atomic_long_inc(&rsp->expedited_stoppedcpus);
> 
> -all_cpus_idle:
> -	free_cpumask_var(cm);
> -
>  	/*
>  	 * Everyone up to our most recent fetch is covered by our grace
>  	 * period.  Update the counter, but only if our work is still
> @@ -3435,6 +3389,8 @@ void synchronize_sched_expedited(void)
>  		}
>  	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
>  	atomic_long_inc(&rsp->expedited_done_exit);
> +unlock:
> +	mutex_unlock(&rsp->expedited_mutex);
> 
>  	put_online_cpus();
>  }
> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
> index 4adb7ca0bf47..10348c081e8e 100644
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -483,6 +483,7 @@ struct rcu_state {
>  						/*  _rcu_barrier(). */
>  	/* End of fields guarded by barrier_mutex. */
> 
> +	struct mutex  expedited_mutex;		/* Serializes expediting. */
>  	atomic_long_t expedited_start;		/* Starting ticket. */
>  	atomic_long_t expedited_done;		/* Done ticket. */
>  	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */
> 


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 14:50                     ` Paul E. McKenney
@ 2015-06-24 15:01                       ` Peter Zijlstra
  2015-06-24 15:27                         ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 15:01 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 07:50:42AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 24, 2015 at 09:35:03AM +0200, Peter Zijlstra wrote:

> > I still don't see a problem here though; the stop_one_cpu() invocation
> > for the CPU that's suffering its preemption latency will take longer,
> > but so what?
> > 
> > How does polling and dropping back to sync_rcu() generate better
> > behaviour than simply waiting for the completion?
> 
> Because if there is too much delay, synchronize_rcu() is no slower
> than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
> more efficient.

Still confused.. How is polling and then blocking more efficient than
just blocking in the first place? I'm seeing the polling as a waste of
cpu time.

The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
is equally stalled. The sync_rcu() cannot wait more efficiently than we're
already waiting either.



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-24 14:13             ` Peter Zijlstra
@ 2015-06-24 15:12               ` Oleg Nesterov
  2015-06-24 16:15                 ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-24 15:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/24, Peter Zijlstra wrote:
>
> On Wed, Jun 24, 2015 at 03:50:49PM +0200, Oleg Nesterov wrote:
> > On 06/23, Peter Zijlstra wrote:
> > > If in one callchain we do:
> > >
> > > 	get_online_cpus();
> > > 	lock(A);
> > >
> > > in another we do:
> > >
> > > 	lock(A);
> > > 	get_online_cpus();
> > >
> > > lockdep will complain about the inverted lock order, however this is not
> > > a problem at all for recursive locks.
> >
> > Ah, but in this case lockdep is right. This is deadlockable because
> > with the new implementation percpu_down_write() blocks the new readers.
> > So this change just hides the valid warning.
> >
> > Just suppose that the 3rd CPU does percpu_down_write()->down_write()
> > right after the 2nd CPU (above) takes lock(A).
> >
> > I have to admit that I didn't realize that the code above is currently
> > correct... but it is.
> >
> > So we need percpu_down_write_dont_block_readers(). I already thought
> > about this before, I'll try to make the patch tomorrow on top of your
> > changes.
> >
> > This means that we do not need task_struct->cpuhp_ref, but we can't
> > avoid the livelock we currently have: cpu_hotplug_begin() can never succeed
> > if the new readers come fast enough.
>
> I'm confused.. why isn't the read-in-read recursion good enough?

Because the code above can actually deadlock if 2 CPUs do this at
the same time?

task_struct->cpuhp_ref only makes read-in-read work, but
percpu_down_write() blocks the new readers.

Suppose that ->cpuhp_ref == 0 on CPUs 0 and 1, and suppose that CPU 2
does percpu_down_write() and "sem->state = readers_block" is already
visible to CPU 1 when it calls get_online_cpus().

	CPU_0			CPU_1		CPU_2

	get_online_cpus();	lock(A);

	// waits for CPU_1
	lock(A)	

						// waits for CPU_0
						percpu_down_write();

				// waits for CPU_2
				get_online_cpus();
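
For concreteness, a throwaway user-space model of that interleaving; the
names are made up, the writer-preferring rwlock stands in for the hotplug
percpu-rwsem (a pending writer blocks new readers, like
percpu_down_write() above) and the plain mutex stands in for lock(A).
Run it and all three threads wedge:

#define _GNU_SOURCE
#include <pthread.h>
#include <unistd.h>

static pthread_rwlock_t hotplug;			/* the hotplug lock */
static pthread_mutex_t A = PTHREAD_MUTEX_INITIALIZER;	/* lock(A) */

static void *cpu0(void *arg)	/* get_online_cpus(); lock(A); */
{
	pthread_rwlock_rdlock(&hotplug);
	sleep(1);				/* open the race window */
	pthread_mutex_lock(&A);			/* waits for CPU_1 */
	pthread_mutex_unlock(&A);
	pthread_rwlock_unlock(&hotplug);
	return NULL;
}

static void *cpu1(void *arg)	/* lock(A); get_online_cpus(); */
{
	pthread_mutex_lock(&A);
	sleep(2);				/* let CPU_2 queue as writer */
	pthread_rwlock_rdlock(&hotplug);	/* blocked behind the writer */
	pthread_rwlock_unlock(&hotplug);
	pthread_mutex_unlock(&A);
	return NULL;
}

static void *cpu2(void *arg)	/* percpu_down_write(); */
{
	sleep(1);
	pthread_rwlock_wrlock(&hotplug);	/* waits for CPU_0's reader */
	pthread_rwlock_unlock(&hotplug);
	return NULL;
}

int main(void)
{
	pthread_rwlockattr_t attr;
	pthread_t t0, t1, t2;

	pthread_rwlockattr_init(&attr);
	/* glibc-specific: a waiting writer blocks new readers */
	pthread_rwlockattr_setkind_np(&attr,
			PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
	pthread_rwlock_init(&hotplug, &attr);

	pthread_create(&t0, NULL, cpu0, NULL);
	pthread_create(&t1, NULL, cpu1, NULL);
	pthread_create(&t2, NULL, cpu2, NULL);
	pthread_join(t0, NULL);		/* never returns: three-way deadlock */
	return 0;
}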


Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 15:01                       ` Peter Zijlstra
@ 2015-06-24 15:27                         ` Paul E. McKenney
  2015-06-24 15:40                           ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 15:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 05:01:51PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 07:50:42AM -0700, Paul E. McKenney wrote:
> > On Wed, Jun 24, 2015 at 09:35:03AM +0200, Peter Zijlstra wrote:
> 
> > > I still don't see a problem here though; the stop_one_cpu() invocation
> > > for the CPU that's suffering its preemption latency will take longer,
> > > but so what?
> > > 
> > > How does polling and dropping back to sync_rcu() generate better
> > > behaviour than simply waiting for the completion?
> > 
> > Because if there is too much delay, synchronize_rcu() is no slower
> > than is synchronize_rcu_expedited(), plus synchronize_rcu() is much
> > more efficient.
> 
> Still confused.. How is polling and then blocking more efficient than
> just blocking in the first place? I'm seeing the polling as a waste of
> cpu time.

As I said, the current code is quite old and will get a facelift.

> The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
> is equally stalled. The sync_rcu() cannot wait more efficiently than we're
> already waiting either.

Ah, but synchronize_rcu() doesn't force waiting on more than one extra
grace period.  With a strictly queued mutex, you can end up waiting on
several.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 15:01                         ` Paul E. McKenney
@ 2015-06-24 15:34                           ` Peter Zijlstra
  0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 15:34 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 08:01:29AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 24, 2015 at 10:32:57AM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 23, 2015 at 07:23:44PM -0700, Paul E. McKenney wrote:
> > > And here is an untested patch that applies the gist of your approach,
> > > the series of stop_one_cpu() calls, but without undoing the rest.
> > > I forged your Signed-off-by, please let me know if that doesn't work
> > > for you.  There are a number of simplifications that can be made, but
> > > the basic approach gets a good testing first.
> > 
> > So I really do not get the point of the trylock. It doesn't make sense.
> > 
> > Why would you poll the mutex instead of just wait for it and then
> > recheck if someone did the work while you were waiting for it?
> > 
> > What's wrong with the below?
> 
> Various delays can cause tasks to queue on the mutex out of order.

If the mutex owner sleeps, mutexes are FIFO, otherwise things can get
iffy indeed.

> This can cause a given task not only to have been delayed between
> sampling ->expedited_start and the mutex_lock(), but be further delayed
> because tasks granted the mutex earlier will wait on grace periods that
> the delayed task doesn't need to wait on.  These extra waits are simply
> not consistent with the "expedited" in synchronize_sched_expedited().

Feh, I really do not know if its worth optimizing the concurrent
expedited case, but we could just make it an open-coded mutex that's
strictly FIFO. A waitqueue on the done variable might be sufficient.

That's still tons better than polling.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 15:27                         ` Paul E. McKenney
@ 2015-06-24 15:40                           ` Peter Zijlstra
  2015-06-24 16:09                             ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 15:40 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 08:27:19AM -0700, Paul E. McKenney wrote:
> > The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
> > is equally stalled. The sync_rcu() cannot wait more efficiently than we're
> > already waiting either.
> 
> Ah, but synchronize_rcu() doesn't force waiting on more than one extra
> grace period.  With a strictly queued mutex, you can end up waiting on
> several.

But you could fix that by replacing/augmenting the expedited ticket with
gpnum/completed as used in get_state_synchronize_rcu()/cond_synchronize_rcu().
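
For reference, the usage pattern of those two existing helpers, which the
ticket check would mimic (just a sketch, not against any patch in this
thread; gp_snap is a made-up local):

	unsigned long gp_snap;

	gp_snap = get_state_synchronize_rcu();	/* snapshot the current GP */

	/* ... potentially long wait, e.g. queueing on the mutex ... */

	/*
	 * No-op if a full grace period already elapsed since the snapshot,
	 * otherwise waits for one; an expedited caller would bail at this
	 * point instead of stopping any CPUs.
	 */
	cond_synchronize_rcu(gp_snap);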



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 15:40                           ` Peter Zijlstra
@ 2015-06-24 16:09                             ` Paul E. McKenney
  2015-06-24 16:42                               ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 16:09 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 05:40:10PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 08:27:19AM -0700, Paul E. McKenney wrote:
> > > The thing is, if we're stalled on a stop_one_cpu() call, the sync_rcu()
> > > is equally stalled. The sync_rcu() cannot wait more efficiently than we're
> > > already waiting either.
> > 
> > Ah, but synchronize_rcu() doesn't force waiting on more than one extra
> > grace period.  With a strictly queued mutex, you can end up waiting on
> > several.
> 
> But you could fix that by replacing/augmenting the expedited ticket with
> gpnum/completed as used in get_state_synchronize_rcu()/cond_synchronize_rcu().

Yes, good point, that would be a way of speeding the existing polling
loop up in the case where the polling loop took longer than a normal
grace period.  Might also be a way to speed up the new "polling" regime,
but I am still beating up the counters.  ;-)

But if the mutex serializes everything unconditionally, then you have
already potentially waited for several grace periods worth of time
before you get a chance to check the ticket, so the check doesn't help.
Or am I missing something subtle here?

It looks like I do need to use smp_call_function_single() and your
resched_cpu() because calling stop_one_cpu() sequentially is about
twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
But either way, your point about not stopping all the CPUs does hold.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem
  2015-06-24 15:12               ` Oleg Nesterov
@ 2015-06-24 16:15                 ` Peter Zijlstra
  0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 16:15 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Wed, Jun 24, 2015 at 05:12:12PM +0200, Oleg Nesterov wrote:
> On 06/24, Peter Zijlstra wrote:

> > I'm confused.. why isn't the read-in-read recursion good enough?
> 
> Because the code above can actually deadlock if 2 CPUs do this at
> the same time?

Hmm yes.. this makes the hotplug locking worse than I feared it was, but
alas.

FYI, the actual splat.

---

[    7.399737] ======================================================
[    7.406640] [ INFO: possible circular locking dependency detected ]
[    7.413643] 4.1.0-02756-ge3d06bd-dirty #185 Not tainted
[    7.419481] -------------------------------------------------------
[    7.426483] kworker/0:1/215 is trying to acquire lock:
[    7.432221]  (&cpu_hotplug.rwsem){++++++}, at: [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[    7.442564] 
[    7.442564] but task is already holding lock:
[    7.449079]  (&item->mutex){+.+.+.}, at: [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[    7.458455] 
[    7.458455] which lock already depends on the new lock.
[    7.458455] 
[    7.467591] 
[    7.467591] the existing dependency chain (in reverse order) is:
[    7.475949] 
-> #3 (&item->mutex){+.+.+.}:
[    7.480662]        [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[    7.487280]        [<ffffffff818ea777>] mutex_lock_nested+0x47/0x3c0
[    7.494390]        [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[    7.501596]        [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[    7.508514]        [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[    7.515916]        [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[    7.522922]        [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[    7.529840]        [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[    7.536463]        [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[    7.543283]        [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[    7.550106]        [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[    7.557214]        [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[    7.564029]        [<ffffffff810f05b6>] kthread+0xf6/0x110
[    7.570166]        [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[    7.576792] 
-> #2 (drm_global_mutex){+.+.+.}:
[    7.581891]        [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[    7.588514]        [<ffffffff818ea777>] mutex_lock_nested+0x47/0x3c0
[    7.595622]        [<ffffffff815b1406>] drm_dev_register+0x26/0x100
[    7.602632]        [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[    7.609547]        [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[    7.616170]        [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[    7.622987]        [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[    7.629806]        [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[    7.636913]        [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[    7.643727]        [<ffffffff810f05b6>] kthread+0xf6/0x110
[    7.649866]        [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[    7.656490] 
-> #1 ((&wfc.work)){+.+.+.}:
[    7.661104]        [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[    7.667727]        [<ffffffff810e737d>] flush_work+0x3d/0x260
[    7.674155]        [<ffffffff810e9822>] work_on_cpu+0x82/0x90
[    7.680584]        [<ffffffff814bf2a2>] pci_device_probe+0x112/0x120
[    7.687692]        [<ffffffff815e685f>] driver_probe_device+0x17f/0x2e0
[    7.695094]        [<ffffffff815e6a94>] __driver_attach+0x94/0xa0
[    7.701910]        [<ffffffff815e4786>] bus_for_each_dev+0x66/0xa0
[    7.708824]        [<ffffffff815e626e>] driver_attach+0x1e/0x20
[    7.715447]        [<ffffffff815e5ed8>] bus_add_driver+0x168/0x210
[    7.722361]        [<ffffffff815e7880>] driver_register+0x60/0xe0
[    7.729180]        [<ffffffff814bd754>] __pci_register_driver+0x64/0x70
[    7.736580]        [<ffffffff81f9a10d>] pcie_portdrv_init+0x66/0x79
[    7.743593]        [<ffffffff810002c8>] do_one_initcall+0x88/0x1c0
[    7.750508]        [<ffffffff81f5f169>] kernel_init_freeable+0x1f5/0x282
[    7.758005]        [<ffffffff818da36e>] kernel_init+0xe/0xe0
[    7.764338]        [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[    7.770961] 
-> #0 (&cpu_hotplug.rwsem){++++++}:
[    7.776255]        [<ffffffff81122817>] __lock_acquire+0x2207/0x2240
[    7.783363]        [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[    7.789986]        [<ffffffff810cb6e2>] get_online_cpus+0x62/0xb0
[    7.796805]        [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[    7.804398]        [<ffffffff810ed7bc>] __alloc_workqueue_key+0x2ec/0x560
[    7.811992]        [<ffffffff815cbefa>] ttm_mem_global_init+0x5a/0x310
[    7.819295]        [<ffffffff815dcbb2>] mgag200_ttm_mem_global_init+0x12/0x20
[    7.827277]        [<ffffffff815c4df5>] drm_global_item_ref+0x65/0xe0
[    7.834481]        [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[    7.841395]        [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[    7.848793]        [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[    7.855804]        [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[    7.862715]        [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[    7.869338]        [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[    7.876159]        [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[    7.882979]        [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[    7.890087]        [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[    7.896907]        [<ffffffff810f05b6>] kthread+0xf6/0x110
[    7.903043]        [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[    7.909673] 
[    7.909673] other info that might help us debug this:
[    7.909673] 
[    7.918616] Chain exists of:
  &cpu_hotplug.rwsem --> drm_global_mutex --> &item->mutex

[    7.927907]  Possible unsafe locking scenario:
[    7.927907] 
[    7.934521]        CPU0                    CPU1
[    7.939580]        ----                    ----
[    7.944639]   lock(&item->mutex);
[    7.948359]                                lock(drm_global_mutex);
[    7.955292]                                lock(&item->mutex);
[    7.961855]   lock(&cpu_hotplug.rwsem);
[    7.966158] 
[    7.966158]  *** DEADLOCK ***
[    7.966158] 
[    7.972771] 4 locks held by kworker/0:1/215:
[    7.977539]  #0:  ("events"){.+.+.+}, at: [<ffffffff810e9cc6>] process_one_work+0x156/0x7e0
[    7.986929]  #1:  ((&wfc.work)){+.+.+.}, at: [<ffffffff810e9cc6>] process_one_work+0x156/0x7e0
[    7.996600]  #2:  (drm_global_mutex){+.+.+.}, at: [<ffffffff815b1406>] drm_dev_register+0x26/0x100
[    8.006690]  #3:  (&item->mutex){+.+.+.}, at: [<ffffffff815c4dc3>] drm_global_item_ref+0x33/0xe0
[    8.016559] 
[    8.016559] stack backtrace:
[    8.021427] CPU: 0 PID: 215 Comm: kworker/0:1 Not tainted 4.1.0-02756-ge3d06bd-dirty #185
[    8.030565] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
[    8.042034] Workqueue: events work_for_cpu_fn
[    8.046909]  ffffffff82857e30 ffff88042b3437c8 ffffffff818e5189 0000000000000011
[    8.055216]  ffffffff8282aa40 ffff88042b343818 ffffffff8111ee76 0000000000000004
[    8.063522]  ffff88042b343888 ffff88042b33f040 0000000000000004 ffff88042b33f040
[    8.071827] Call Trace:
[    8.074559]  [<ffffffff818e5189>] dump_stack+0x4c/0x6e
[    8.080300]  [<ffffffff8111ee76>] print_circular_bug+0x1c6/0x220
[    8.087011]  [<ffffffff81122817>] __lock_acquire+0x2207/0x2240
[    8.093528]  [<ffffffff811232b1>] lock_acquire+0xd1/0x290
[    8.099559]  [<ffffffff810ebd63>] ? apply_workqueue_attrs+0x183/0x4b0
[    8.106755]  [<ffffffff810cb6e2>] get_online_cpus+0x62/0xb0
[    8.112981]  [<ffffffff810ebd63>] ? apply_workqueue_attrs+0x183/0x4b0
[    8.120176]  [<ffffffff810ead27>] ? alloc_workqueue_attrs+0x27/0x80
[    8.127178]  [<ffffffff810ebd63>] apply_workqueue_attrs+0x183/0x4b0
[    8.134182]  [<ffffffff8111cc21>] ? debug_mutex_init+0x31/0x40
[    8.140690]  [<ffffffff810ed7bc>] __alloc_workqueue_key+0x2ec/0x560
[    8.147691]  [<ffffffff815cbefa>] ttm_mem_global_init+0x5a/0x310
[    8.154405]  [<ffffffff8122b050>] ? __kmalloc+0x5e0/0x630
[    8.160435]  [<ffffffff815c4de2>] ? drm_global_item_ref+0x52/0xe0
[    8.167243]  [<ffffffff815dcbb2>] mgag200_ttm_mem_global_init+0x12/0x20
[    8.174631]  [<ffffffff815c4df5>] drm_global_item_ref+0x65/0xe0
[    8.181245]  [<ffffffff815dcd90>] mgag200_mm_init+0x50/0x1c0
[    8.187570]  [<ffffffff815d757f>] mgag200_driver_load+0x30f/0x500
[    8.194383]  [<ffffffff815b1491>] drm_dev_register+0xb1/0x100
[    8.200802]  [<ffffffff815b428d>] drm_get_pci_dev+0x8d/0x1e0
[    8.207125]  [<ffffffff818ebf9e>] ? mutex_unlock+0xe/0x10
[    8.213156]  [<ffffffff815dbd3f>] mga_pci_probe+0x9f/0xc0
[    8.219187]  [<ffffffff814bde92>] local_pci_probe+0x42/0xa0
[    8.225412]  [<ffffffff8111db81>] ? __lock_is_held+0x51/0x80
[    8.231736]  [<ffffffff810e54e8>] work_for_cpu_fn+0x18/0x30
[    8.237962]  [<ffffffff810e9d57>] process_one_work+0x1e7/0x7e0
[    8.244477]  [<ffffffff810e9cc6>] ? process_one_work+0x156/0x7e0
[    8.251187]  [<ffffffff810ea518>] worker_thread+0x1c8/0x460
[    8.257410]  [<ffffffff810ea350>] ? process_one_work+0x7e0/0x7e0
[    8.264120]  [<ffffffff810ea350>] ? process_one_work+0x7e0/0x7e0
[    8.270829]  [<ffffffff810f05b6>] kthread+0xf6/0x110
[    8.276375]  [<ffffffff818ee230>] ? _raw_spin_unlock_irq+0x30/0x60
[    8.283282]  [<ffffffff810f04c0>] ? kthread_create_on_node+0x220/0x220
[    8.290566]  [<ffffffff818eefdf>] ret_from_fork+0x3f/0x70
[    8.296597]  [<ffffffff810f04c0>] ? kthread_create_on_node+0x220/0x220

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 16:09                             ` Paul E. McKenney
@ 2015-06-24 16:42                               ` Peter Zijlstra
  2015-06-24 17:10                                 ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 16:42 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 09:09:04AM -0700, Paul E. McKenney wrote:
> Yes, good point, that would be a way of speeding the existing polling
> loop up in the case where the polling loop took longer than a normal
> grace period.  Might also be a way to speed up the new "polling" regime,
> but I am still beating up the counters.  ;-)
> 
> But if the mutex serializes everything unconditionally, then you have
> already potentially waited for several grace periods worth of time
> before you get a chance to check the ticket, so the check doesn't help.
> Or am I missing something subtle here?

Observe gpnum before you acquire the mutex; once you get it, check it
against completed, and if you've waited long enough, bail.

The thing is, once you start bailing on this condition your 'queue'
drains very fast and this is around the same time sync_rcu() would've
released the waiters too.

Furthermore, until this point we can have 'slow' progress by kicking the
CPUs.

That said, the all-CPUs-concurrent sync_rcu_expedited scenario is
absolutely horrid; it's everyone spraying everyone else.

> It looks like I do need to use smp_call_function_single() and your
> resched_cpu() because calling stop_one_cpu() sequentially is about
> twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
> But either way, your point about not stopping all the CPUs does hold.

Bah, I was afraid of that, the problem is that we wait for the
individual stop_work to complete before sending another.

The below is getting a little out of hand, but should avoid the problem
and might be easier than getting the IPI thing going, but who knows.

---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
 	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
 	.orphan_donetail = &sname##_state.orphan_donelist, \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
 }
@@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+struct exp_stop_state {
+	wait_queue_head_t	*wq;
+	atomic_t		count;
+};
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
+	struct exp_stop_state *ess = data;
+
 	/*
 	 * There must be a full memory barrier on each affected CPU
 	 * between the time that try_stop_cpus() is called and the
 	 * time that it returns.
-	 *
-	 * In the current initial implementation of cpu_stop, the
-	 * above condition is already met when the control reaches
-	 * this point and the following smp_mb() is not strictly
-	 * necessary.  Do smp_mb() anyway for documentation and
-	 * robustness against future implementation changes.
 	 */
-	smp_mb(); /* See above comment block. */
+	if (atomic_dec_and_test(&ess->count))
+		wake_up(ess->wq);
+
 	return 0;
 }
 
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
-	int cpu;
-	long firstsnap, s, snap;
-	int trycount = 0;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+	struct exp_stop_state ess = { .wq = &stop_wait, };
 	struct rcu_state *rsp = &rcu_sched_state;
+	long s, snap;
+	int cpu;
 
 	/*
 	 * If we are in danger of counter wrap, just do synchronize_sched().
@@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
 	 * full memory barrier.
 	 */
 	snap = atomic_long_inc_return(&rsp->expedited_start);
-	firstsnap = snap;
 	if (!try_get_online_cpus()) {
 		/* CPU hotplug operation in flight, fall back to normal GP. */
 		wait_rcu_gp(call_rcu_sched);
@@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
-		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
-	}
-
 	/*
 	 * Each pass through the following loop attempts to force a
 	 * context switch on each CPU.
 	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		atomic_long_inc(&rsp->expedited_tryfail);
+	mutex_lock(&rsp->expedited_mutex);
 
-		/* Check to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
-			return;
-		}
+	/*
+	 * Check to see if someone else did our work for us, while we were
+	 * waiting for the mutex.
+	 */
+	s = atomic_long_read(&rsp->expedited_done);
+	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
+		/* ensure test happens before caller kfree */
+		smp_mb__before_atomic(); /* ^^^ */
+		atomic_long_inc(&rsp->expedited_workdone1);
+		goto unlock;
+	}
 
-		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10) {
-			udelay(trycount * num_online_cpus());
-		} else {
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
+	/* Stop each CPU that is online, non-idle, and not us. */
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
-		/* Recheck to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
-			return;
-		}
+		/* Skip our CPU, */
+		if (raw_smp_processor_id() == cpu)
+			continue;
 
-		/*
-		 * Refetching sync_sched_expedited_started allows later
-		 * callers to piggyback on our grace period.  We retry
-		 * after they started, so our grace period works for them,
-		 * and they started after our first try, so their grace
-		 * period works for us.
-		 */
-		if (!try_get_online_cpus()) {
-			/* CPU hotplug operation in flight, use normal GP. */
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-		snap = atomic_long_read(&rsp->expedited_start);
-		smp_mb(); /* ensure read is before try_stop_cpus(). */
+		/* and any idle CPUs. */
+		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
+
+		atomic_inc(&ess.count);
+		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
+				&per_cpu(exp_stop_work, cpu));
 	}
-	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
-all_cpus_idle:
-	free_cpumask_var(cm);
+	wait_event(stop_wait, !atomic_read(&ess.count));
+
+	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
 	/*
 	 * Everyone up to our most recent fetch is covered by our grace
@@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
 		}
 	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
 	atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+	mutex_unlock(&rsp->expedited_mutex);
 
 	put_online_cpus();
 }
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,6 +483,7 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
+	struct mutex  expedited_mutex;		/* Serializes expediting. */
 	atomic_long_t expedited_start;		/* Starting ticket. */
 	atomic_long_t expedited_done;		/* Done ticket. */
 	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 16:42                               ` Peter Zijlstra
@ 2015-06-24 17:10                                 ` Paul E. McKenney
  2015-06-24 17:20                                   ` Paul E. McKenney
                                                     ` (2 more replies)
  0 siblings, 3 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 17:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 06:42:00PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 09:09:04AM -0700, Paul E. McKenney wrote:
> > Yes, good point, that would be a way of speeding the existing polling
> > loop up in the case where the polling loop took longer than a normal
> > grace period.  Might also be a way to speed up the new "polling" regime,
> > but I am still beating up the counters.  ;-)
> > 
> > But if the mutex serializes everything unconditionally, then you have
> > already potentially waited for several grace periods worth of time
> > before you get a chance to check the ticket, so the check doesn't help.
> > Or am I missing something subtle here?
> 
> Observe gpnum before you acquire the mutex; once you get it, check it
> against completed, and if you've waited long enough, bail.
> 
> The thing is, once you start bailing on this condition your 'queue'
> drains very fast and this is around the same time sync_rcu() would've
> released the waiters too.

In my experience, this sort of thing simply melts down on large systems.
I am reworking this with multiple locks so as to keep the large-system
contention down to a dull roar.
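
The generic shape of that trick, as an illustrative sketch only (the real
rework will differ; all names below are made up, user-space pthreads for
brevity): hash callers across an array of leaf locks so that at most one
caller per leaf ever contends on the global lock, and re-check "did
someone already do my grace period" at each level so late arrivals bail
early.

#include <pthread.h>

#define EXP_FUNNEL_LOCKS 16	/* arbitrary fan-in for the sketch */

static pthread_mutex_t exp_funnel[EXP_FUNNEL_LOCKS] = {
	[0 ... EXP_FUNNEL_LOCKS - 1] = PTHREAD_MUTEX_INITIALIZER, /* GCC range init */
};
static pthread_mutex_t exp_global = PTHREAD_MUTEX_INITIALIZER;

static void expedited_sketch(int cpu)
{
	pthread_mutex_t *leaf = &exp_funnel[cpu % EXP_FUNNEL_LOCKS];

	pthread_mutex_lock(leaf);	/* contend with ~1/16th of the callers */
	/* re-check the done ticket / gp state here and bail if covered */
	pthread_mutex_lock(&exp_global);
	/* ... do the actual expedited grace-period work ... */
	pthread_mutex_unlock(&exp_global);
	pthread_mutex_unlock(leaf);
}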

> Furthermore, until this point we can have 'slow' progress by kicking the
> CPUs.
> 
> That said, the all-CPUs-concurrent sync_rcu_expedited scenario is
> absolutely horrid; it's everyone spraying everyone else.

Agreed, but we really need a system in this state to remain responsive
enough to allow reasonable debugging to proceed rather than just silently
hanging.  Ergo, I will be providing multiple locks to keep contention
within the realm of reason.  It really isn't complex enough to be worth
arguing about.  Maybe 20 lines of straightforward code.  (Yeah, yeah,
Murphy says otherwise, but he will have to prove it.)

> > It looks like I do need to use smp_call_function_single() and your
> > resched_cpu() because calling stop_one_cpu() sequentially is about
> > twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
> > But either way, your point about not stopping all the CPUs does hold.
> 
> Bah, I was afraid of that, the problem is that we wait for the
> individual stop_work to complete before sending another.
> 
> The below is getting a little out of hand, but should avoid the problem
> and might be easier than getting the IPI thing going, but who knows.

OK, I will give this a try.  Of course, the counter needs to be
initialized to 1 rather than zero, and it needs to be atomically
decremented after all stop_one_cpu_nowait() invocations, otherwise you
can get an early wakeup due to the usual race conditions.
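
In user-space terms, the pattern being fixed looks like this (sketch only;
all names are stand-ins and a plain spin-wait stands in for the waitqueue):

#include <stdatomic.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define NWORK 4

static atomic_int pending;

static void *worker(void *arg)
{
	/* ... the per-CPU stop work would run here ... */
	if (atomic_fetch_sub(&pending, 1) == 1) {
		/* last one out: this is where the kernel side does wake_up() */
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NWORK];
	int i;

	/*
	 * Bias the count to 1 so it cannot hit zero while workers are
	 * still being dispatched.
	 */
	atomic_store(&pending, 1);

	for (i = 0; i < NWORK; i++) {
		atomic_fetch_add(&pending, 1);
		pthread_create(&t[i], NULL, worker, NULL);
	}

	/* drop the initial bias only after all dispatches are done */
	atomic_fetch_sub(&pending, 1);

	while (atomic_load(&pending))	/* wait_event() stand-in */
		sched_yield();

	printf("all workers completed\n");
	for (i = 0; i < NWORK; i++)
		pthread_join(t[i], NULL);
	return 0;
}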

							Thanx, Paul

> ---
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
>  	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
>  	.orphan_donetail = &sname##_state.orphan_donelist, \
>  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
> +	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
>  	.name = RCU_STATE_NAME(sname), \
>  	.abbr = sabbr, \
>  }
> @@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
>  }
>  EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
> 
> +struct exp_stop_state {
> +	wait_queue_head_t	*wq;
> +	atomic_t		count;
> +};
> +
>  static int synchronize_sched_expedited_cpu_stop(void *data)
>  {
> +	struct exp_stop_state *ess = data;
> +
>  	/*
>  	 * There must be a full memory barrier on each affected CPU
>  	 * between the time that try_stop_cpus() is called and the
>  	 * time that it returns.
> -	 *
> -	 * In the current initial implementation of cpu_stop, the
> -	 * above condition is already met when the control reaches
> -	 * this point and the following smp_mb() is not strictly
> -	 * necessary.  Do smp_mb() anyway for documentation and
> -	 * robustness against future implementation changes.
>  	 */
> -	smp_mb(); /* See above comment block. */
> +	if (atomic_dec_and_test(&ess->count))
> +		wake_up(ess->wq);
> +
>  	return 0;
>  }
> 
> +static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
> +
>  /**
>   * synchronize_sched_expedited - Brute-force RCU-sched grace period
>   *
> @@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
>   */
>  void synchronize_sched_expedited(void)
>  {
> -	cpumask_var_t cm;
> -	bool cma = false;
> -	int cpu;
> -	long firstsnap, s, snap;
> -	int trycount = 0;
> +	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
> +	struct exp_stop_state ess = { .wq = &stop_wait, };
>  	struct rcu_state *rsp = &rcu_sched_state;
> +	long s, snap;
> +	int cpu;
> 
>  	/*
>  	 * If we are in danger of counter wrap, just do synchronize_sched().
> @@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
>  	 * full memory barrier.
>  	 */
>  	snap = atomic_long_inc_return(&rsp->expedited_start);
> -	firstsnap = snap;
>  	if (!try_get_online_cpus()) {
>  		/* CPU hotplug operation in flight, fall back to normal GP. */
>  		wait_rcu_gp(call_rcu_sched);
> @@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
>  	}
>  	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
> 
> -	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> -	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> -	if (cma) {
> -		cpumask_copy(cm, cpu_online_mask);
> -		cpumask_clear_cpu(raw_smp_processor_id(), cm);
> -		for_each_cpu(cpu, cm) {
> -			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> -
> -			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> -				cpumask_clear_cpu(cpu, cm);
> -		}
> -		if (cpumask_weight(cm) == 0)
> -			goto all_cpus_idle;
> -	}
> -
>  	/*
>  	 * Each pass through the following loop attempts to force a
>  	 * context switch on each CPU.
>  	 */
> -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> -			     synchronize_sched_expedited_cpu_stop,
> -			     NULL) == -EAGAIN) {
> -		put_online_cpus();
> -		atomic_long_inc(&rsp->expedited_tryfail);
> +	mutex_lock(&rsp->expedited_mutex);
> 
> -		/* Check to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone1);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +	/*
> +	 * Check to see if someone else did our work for us, while we were
> +	 * waiting for the mutex.
> +	 */
> +	s = atomic_long_read(&rsp->expedited_done);
> +	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> +		/* ensure test happens before caller kfree */
> +		smp_mb__before_atomic(); /* ^^^ */
> +		atomic_long_inc(&rsp->expedited_workdone1);
> +		goto unlock;
> +	}
> 
> -		/* No joy, try again later.  Or just synchronize_sched(). */
> -		if (trycount++ < 10) {
> -			udelay(trycount * num_online_cpus());
> -		} else {
> -			wait_rcu_gp(call_rcu_sched);
> -			atomic_long_inc(&rsp->expedited_normal);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +	/* Stop each CPU that is online, non-idle, and not us. */
> +	for_each_online_cpu(cpu) {
> +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> 
> -		/* Recheck to see if someone else did our work for us. */
> -		s = atomic_long_read(&rsp->expedited_done);
> -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> -			/* ensure test happens before caller kfree */
> -			smp_mb__before_atomic(); /* ^^^ */
> -			atomic_long_inc(&rsp->expedited_workdone2);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> +		/* Skip our CPU, */
> +		if (raw_smp_processor_id() == cpu)
> +			continue;
> 
> -		/*
> -		 * Refetching sync_sched_expedited_started allows later
> -		 * callers to piggyback on our grace period.  We retry
> -		 * after they started, so our grace period works for them,
> -		 * and they started after our first try, so their grace
> -		 * period works for us.
> -		 */
> -		if (!try_get_online_cpus()) {
> -			/* CPU hotplug operation in flight, use normal GP. */
> -			wait_rcu_gp(call_rcu_sched);
> -			atomic_long_inc(&rsp->expedited_normal);
> -			free_cpumask_var(cm);
> -			return;
> -		}
> -		snap = atomic_long_read(&rsp->expedited_start);
> -		smp_mb(); /* ensure read is before try_stop_cpus(). */
> +		/* and any idle CPUs. */
> +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> +			continue;
> +
> +		atomic_inc(&ess.count);
> +		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
> +				&per_cpu(exp_stop_work, cpu));
>  	}
> -	atomic_long_inc(&rsp->expedited_stoppedcpus);
> 
> -all_cpus_idle:
> -	free_cpumask_var(cm);
> +	wait_event(stop_wait, !atomic_read(&ess.count));
> +
> +	atomic_long_inc(&rsp->expedited_stoppedcpus);
> 
>  	/*
>  	 * Everyone up to our most recent fetch is covered by our grace
> @@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
>  		}
>  	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
>  	atomic_long_inc(&rsp->expedited_done_exit);
> +unlock:
> +	mutex_unlock(&rsp->expedited_mutex);
> 
>  	put_online_cpus();
>  }
> --- a/kernel/rcu/tree.h
> +++ b/kernel/rcu/tree.h
> @@ -483,6 +483,7 @@ struct rcu_state {
>  						/*  _rcu_barrier(). */
>  	/* End of fields guarded by barrier_mutex. */
> 
> +	struct mutex  expedited_mutex;		/* Serializes expediting. */
>  	atomic_long_t expedited_start;		/* Starting ticket. */
>  	atomic_long_t expedited_done;		/* Done ticket. */
>  	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */
> 


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:10                                 ` Paul E. McKenney
@ 2015-06-24 17:20                                   ` Paul E. McKenney
  2015-06-24 17:29                                     ` Peter Zijlstra
  2015-06-24 17:28                                   ` Peter Zijlstra
  2015-06-24 17:58                                   ` Peter Zijlstra
  2 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-24 17:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 10:10:04AM -0700, Paul E. McKenney wrote:
> On Wed, Jun 24, 2015 at 06:42:00PM +0200, Peter Zijlstra wrote:
> > On Wed, Jun 24, 2015 at 09:09:04AM -0700, Paul E. McKenney wrote:

[ . . . ]

> > > It looks like I do need to use smp_call_function_single() and your
> > > resched_cpu() because calling stop_one_cpu() sequentially is about
> > > twice as slow as try_stop_cpus() in rcutorture runs of up to 16 CPUs.
> > > But either way, your point about not stopping all the CPUs does hold.
> > 
> > Bah, I was afraid of that, the problem is that we wait for the
> > individual stop_work to complete before sending another.
> > 
> > The below is getting a little out of hand, but should avoid the problem
> > and might be easier than getting the IPI thing going, but who knows.
> 
> OK, I will give this a try.  Of course, the counter needs to be
> initialized to 1 rather than zero, and it needs to be atomically
> decremented after all stop_one_cpu_nowait() invocations, otherwise you
> can get an early wakeup due to the usual race conditions.

Except that I promised Ingo I would check for CPUs failing to schedule
quickly enough, which means that I must track them individually rather
than via a single counter...
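
Roughly, a flag per CPU instead of one counter, so the waiter can name the
stragglers after a timeout (sketch only, made-up names, not the real patch):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH 8

static atomic_bool cpu_done[NR_CPUS_SKETCH];

/* called from each stopper / IPI handler */
void mark_cpu_done(int cpu)
{
	atomic_store(&cpu_done[cpu], true);
}

/* called by the waiter once the timeout expires */
void report_stragglers(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
		if (!atomic_load(&cpu_done[cpu]))
			printf("cpu %d has not scheduled yet\n", cpu);
}

int main(void)
{
	mark_cpu_done(0);
	mark_cpu_done(3);
	report_stragglers();	/* the rest show up as stragglers */
	return 0;
}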

You did have me going for a bit, though!  ;-)

							Thanx, Paul

> > ---
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
> >  	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
> >  	.orphan_donetail = &sname##_state.orphan_donelist, \
> >  	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
> > +	.expedited_mutex = __MUTEX_INITIALIZER(sname##_state.expedited_mutex), \
> >  	.name = RCU_STATE_NAME(sname), \
> >  	.abbr = sabbr, \
> >  }
> > @@ -3253,23 +3254,28 @@ void cond_synchronize_rcu(unsigned long
> >  }
> >  EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
> > 
> > +struct exp_stop_state {
> > +	wait_queue_head_t	*wq;
> > +	atomic_t		count;
> > +};
> > +
> >  static int synchronize_sched_expedited_cpu_stop(void *data)
> >  {
> > +	struct exp_stop_state *ess = data;
> > +
> >  	/*
> >  	 * There must be a full memory barrier on each affected CPU
> >  	 * between the time that try_stop_cpus() is called and the
> >  	 * time that it returns.
> > -	 *
> > -	 * In the current initial implementation of cpu_stop, the
> > -	 * above condition is already met when the control reaches
> > -	 * this point and the following smp_mb() is not strictly
> > -	 * necessary.  Do smp_mb() anyway for documentation and
> > -	 * robustness against future implementation changes.
> >  	 */
> > -	smp_mb(); /* See above comment block. */
> > +	if (atomic_dec_and_test(&ess->count))
> > +		wake_up(ess->wq);
> > +
> >  	return 0;
> >  }
> > 
> > +static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
> > +
> >  /**
> >   * synchronize_sched_expedited - Brute-force RCU-sched grace period
> >   *
> > @@ -3304,12 +3310,11 @@ static int synchronize_sched_expedited_c
> >   */
> >  void synchronize_sched_expedited(void)
> >  {
> > -	cpumask_var_t cm;
> > -	bool cma = false;
> > -	int cpu;
> > -	long firstsnap, s, snap;
> > -	int trycount = 0;
> > +	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
> > +	struct exp_stop_state ess = { .wq = &stop_wait, };
> >  	struct rcu_state *rsp = &rcu_sched_state;
> > +	long s, snap;
> > +	int cpu;
> > 
> >  	/*
> >  	 * If we are in danger of counter wrap, just do synchronize_sched().
> > @@ -3332,7 +3337,6 @@ void synchronize_sched_expedited(void)
> >  	 * full memory barrier.
> >  	 */
> >  	snap = atomic_long_inc_return(&rsp->expedited_start);
> > -	firstsnap = snap;
> >  	if (!try_get_online_cpus()) {
> >  		/* CPU hotplug operation in flight, fall back to normal GP. */
> >  		wait_rcu_gp(call_rcu_sched);
> > @@ -3341,82 +3345,44 @@ void synchronize_sched_expedited(void)
> >  	}
> >  	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
> > 
> > -	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
> > -	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
> > -	if (cma) {
> > -		cpumask_copy(cm, cpu_online_mask);
> > -		cpumask_clear_cpu(raw_smp_processor_id(), cm);
> > -		for_each_cpu(cpu, cm) {
> > -			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > -
> > -			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > -				cpumask_clear_cpu(cpu, cm);
> > -		}
> > -		if (cpumask_weight(cm) == 0)
> > -			goto all_cpus_idle;
> > -	}
> > -
> >  	/*
> >  	 * Each pass through the following loop attempts to force a
> >  	 * context switch on each CPU.
> >  	 */
> > -	while (try_stop_cpus(cma ? cm : cpu_online_mask,
> > -			     synchronize_sched_expedited_cpu_stop,
> > -			     NULL) == -EAGAIN) {
> > -		put_online_cpus();
> > -		atomic_long_inc(&rsp->expedited_tryfail);
> > +	mutex_lock(&rsp->expedited_mutex);
> > 
> > -		/* Check to see if someone else did our work for us. */
> > -		s = atomic_long_read(&rsp->expedited_done);
> > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > -			/* ensure test happens before caller kfree */
> > -			smp_mb__before_atomic(); /* ^^^ */
> > -			atomic_long_inc(&rsp->expedited_workdone1);
> > -			free_cpumask_var(cm);
> > -			return;
> > -		}
> > +	/*
> > +	 * Check to see if someone else did our work for us, while we were
> > +	 * waiting for the mutex.
> > +	 */
> > +	s = atomic_long_read(&rsp->expedited_done);
> > +	if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
> > +		/* ensure test happens before caller kfree */
> > +		smp_mb__before_atomic(); /* ^^^ */
> > +		atomic_long_inc(&rsp->expedited_workdone1);
> > +		goto unlock;
> > +	}
> > 
> > -		/* No joy, try again later.  Or just synchronize_sched(). */
> > -		if (trycount++ < 10) {
> > -			udelay(trycount * num_online_cpus());
> > -		} else {
> > -			wait_rcu_gp(call_rcu_sched);
> > -			atomic_long_inc(&rsp->expedited_normal);
> > -			free_cpumask_var(cm);
> > -			return;
> > -		}
> > +	/* Stop each CPU that is online, non-idle, and not us. */
> > +	for_each_online_cpu(cpu) {
> > +		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
> > 
> > -		/* Recheck to see if someone else did our work for us. */
> > -		s = atomic_long_read(&rsp->expedited_done);
> > -		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
> > -			/* ensure test happens before caller kfree */
> > -			smp_mb__before_atomic(); /* ^^^ */
> > -			atomic_long_inc(&rsp->expedited_workdone2);
> > -			free_cpumask_var(cm);
> > -			return;
> > -		}
> > +		/* Skip our CPU, */
> > +		if (raw_smp_processor_id() == cpu)
> > +			continue;
> > 
> > -		/*
> > -		 * Refetching sync_sched_expedited_started allows later
> > -		 * callers to piggyback on our grace period.  We retry
> > -		 * after they started, so our grace period works for them,
> > -		 * and they started after our first try, so their grace
> > -		 * period works for us.
> > -		 */
> > -		if (!try_get_online_cpus()) {
> > -			/* CPU hotplug operation in flight, use normal GP. */
> > -			wait_rcu_gp(call_rcu_sched);
> > -			atomic_long_inc(&rsp->expedited_normal);
> > -			free_cpumask_var(cm);
> > -			return;
> > -		}
> > -		snap = atomic_long_read(&rsp->expedited_start);
> > -		smp_mb(); /* ensure read is before try_stop_cpus(). */
> > +		/* and any idle CPUs. */
> > +		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
> > +			continue;
> > +
> > +		atomic_inc(&ess.count);
> > +		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, &ess,
> > +				&per_cpu(exp_stop_work, cpu));
> >  	}
> > -	atomic_long_inc(&rsp->expedited_stoppedcpus);
> > 
> > -all_cpus_idle:
> > -	free_cpumask_var(cm);
> > +	wait_event(stop_wait, !atomic_read(&ess.count));
> > +
> > +	atomic_long_inc(&rsp->expedited_stoppedcpus);
> > 
> >  	/*
> >  	 * Everyone up to our most recent fetch is covered by our grace
> > @@ -3435,6 +3401,8 @@ void synchronize_sched_expedited(void)
> >  		}
> >  	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
> >  	atomic_long_inc(&rsp->expedited_done_exit);
> > +unlock:
> > +	mutex_unlock(&rsp->expedited_mutex);
> > 
> >  	put_online_cpus();
> >  }
> > --- a/kernel/rcu/tree.h
> > +++ b/kernel/rcu/tree.h
> > @@ -483,6 +483,7 @@ struct rcu_state {
> >  						/*  _rcu_barrier(). */
> >  	/* End of fields guarded by barrier_mutex. */
> > 
> > +	struct mutex  expedited_mutex;		/* Serializes expediting. */
> >  	atomic_long_t expedited_start;		/* Starting ticket. */
> >  	atomic_long_t expedited_done;		/* Done ticket. */
> >  	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */
> > 


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:10                                 ` Paul E. McKenney
  2015-06-24 17:20                                   ` Paul E. McKenney
@ 2015-06-24 17:28                                   ` Peter Zijlstra
  2015-06-24 17:32                                     ` Peter Zijlstra
  2015-06-24 18:14                                     ` Peter Zijlstra
  2015-06-24 17:58                                   ` Peter Zijlstra
  2 siblings, 2 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:28 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 10:10:17AM -0700, Paul E. McKenney wrote:
> OK, I will give this a try.  Of course, the counter needs to be
> initialized to 1 rather than zero, and it needs to be atomically
> decremented after all stop_one_cpu_nowait() invocations, otherwise you
> can get an early wakeup due to the usual race conditions.

Clever that.

How about something like this: it replaces the mutex and the start/done ticket
thing with an MCS-style lockless FIFO queue.

It further uses the gpnum/completed thing to short-circuit things if
we've waited long enough.
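
For reference, the MCS idea in isolation: each waiter spins (here) or
sleeps (in the patch below) on its own stack-allocated node, and the tail
pointer is the only globally shared word.  User-space sketch with C11
atomics, made-up names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	struct mcs_node *_Atomic next;
	atomic_bool done;
};

static struct mcs_node *_Atomic queue_tail;

void mcs_lock(struct mcs_node *self)
{
	struct mcs_node *prev;

	atomic_init(&self->next, NULL);
	atomic_init(&self->done, false);

	/* xchg() on the tail makes arrival order the service order */
	prev = atomic_exchange(&queue_tail, self);
	if (prev) {
		atomic_store(&prev->next, self);
		while (!atomic_load_explicit(&self->done, memory_order_acquire))
			;	/* the patch does set_current_state()/schedule() here */
	}
}

void mcs_unlock(struct mcs_node *self)
{
	struct mcs_node *next = atomic_load(&self->next);

	if (!next) {
		struct mcs_node *old = self;

		/* no successor visible: try to reset the tail to empty */
		if (atomic_compare_exchange_strong(&queue_tail, &old, NULL))
			return;
		/* a successor is arriving; wait for it to link itself in */
		while (!(next = atomic_load(&self->next)))
			;
	}
	atomic_store_explicit(&next->done, true, memory_order_release);
}

In the patch the node lives on the caller's stack for the duration of
synchronize_sched_expedited(), and the gpnum/completed check right after
taking the 'lock' is what lets late arrivals bail without stopping any
CPUs.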

---
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,23 +3253,28 @@ void cond_synchronize_rcu(unsigned long
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+struct exp_stop_state {
+	wait_queue_head_t	*wq;
+	atomic_t		count;
+};
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
+	struct exp_stop_state *ess = data;
+
 	/*
 	 * There must be a full memory barrier on each affected CPU
 	 * between the time that try_stop_cpus() is called and the
 	 * time that it returns.
-	 *
-	 * In the current initial implementation of cpu_stop, the
-	 * above condition is already met when the control reaches
-	 * this point and the following smp_mb() is not strictly
-	 * necessary.  Do smp_mb() anyway for documentation and
-	 * robustness against future implementation changes.
 	 */
-	smp_mb(); /* See above comment block. */
+	if (atomic_dec_and_test(&ess->count))
+		wake_up(ess->wq);
+
 	return 0;
 }
 
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3304,138 +3309,84 @@ static int synchronize_sched_expedited_c
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
-	int cpu;
-	long firstsnap, s, snap;
-	int trycount = 0;
+	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(stop_wait);
+	struct exp_stop_state ess = {
+		.wq = &stop_wait,
+		.count = ATOMIC_INIT(1),
+	};
 	struct rcu_state *rsp = &rcu_sched_state;
+	struct expedited_queue_task {
+		struct expedited_queue_task *next;
+		struct task_struct *task;
+		int done;
+	} *prev, *next, entry = {
+		.task = current,
+	};
+	long gpnum;
+	int cpu;
 
-	/*
-	 * If we are in danger of counter wrap, just do synchronize_sched().
-	 * By allowing sync_sched_expedited_started to advance no more than
-	 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
-	 * that more than 3.5 billion CPUs would be required to force a
-	 * counter wrap on a 32-bit system.  Quite a few more CPUs would of
-	 * course be required on a 64-bit system.
-	 */
-	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
-			 (ulong)atomic_long_read(&rsp->expedited_done) +
-			 ULONG_MAX / 8)) {
-		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_wrap);
-		return;
-	}
-
-	/*
-	 * Take a ticket.  Note that atomic_inc_return() implies a
-	 * full memory barrier.
-	 */
-	snap = atomic_long_inc_return(&rsp->expedited_start);
-	firstsnap = snap;
 	if (!try_get_online_cpus()) {
 		/* CPU hotplug operation in flight, fall back to normal GP. */
 		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_normal);
 		return;
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+	smp_mb();
+	gpnum = smp_load_acquire(&rsp->gpnum);
 
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
+	/* MCS style queue 'lock' */
+	prev = xchg(&rsp->expedited_queue, &entry);
+	if (prev) {
+		WRITE_ONCE(prev->next, &entry);
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (smp_load_acquire(&entry.done))
+				break;
+			schedule();
 		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
+		__set_current_state(TASK_RUNNING);
 	}
 
 	/*
-	 * Each pass through the following loop attempts to force a
-	 * context switch on each CPU.
+	 * Check to see if someone else did our work for us, while we were
+	 * waiting on the queue.
 	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		atomic_long_inc(&rsp->expedited_tryfail);
-
-		/* Check to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
-			return;
-		}
-
-		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10) {
-			udelay(trycount * num_online_cpus());
-		} else {
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-
-		/* Recheck to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
-			return;
-		}
+	if (ULONG_CMP_LT(gpnum, smp_load_acquire(&rsp->completed)))
+		goto unlock;
 
-		/*
-		 * Refetching sync_sched_expedited_started allows later
-		 * callers to piggyback on our grace period.  We retry
-		 * after they started, so our grace period works for them,
-		 * and they started after our first try, so their grace
-		 * period works for us.
-		 */
-		if (!try_get_online_cpus()) {
-			/* CPU hotplug operation in flight, use normal GP. */
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-		snap = atomic_long_read(&rsp->expedited_start);
-		smp_mb(); /* ensure read is before try_stop_cpus(). */
+	/* Stop each CPU that is online, non-idle, and not us. */
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+		/* Skip our CPU, */
+		if (raw_smp_processor_id() == cpu)
+			continue;
+
+		/* and any idle CPUs. */
+		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
+
+		atomic_inc(&ess.count);
+		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+				&ess, &per_cpu(exp_stop_work, cpu));
 	}
-	atomic_long_inc(&rsp->expedited_stoppedcpus);
+	atomic_dec(&ess.count);
 
-all_cpus_idle:
-	free_cpumask_var(cm);
+	wait_event(stop_wait, !atomic_read(&ess.count));
 
-	/*
-	 * Everyone up to our most recent fetch is covered by our grace
-	 * period.  Update the counter, but only if our work is still
-	 * relevant -- which it won't be if someone who started later
-	 * than we did already did their update.
-	 */
-	do {
-		atomic_long_inc(&rsp->expedited_done_tries);
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_done_lost);
-			break;
-		}
-	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
-	atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+	/* MCS style queue 'unlock' */
+	next = READ_ONCE(entry.next);
+	if (!next) {
+		if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+			goto done;
+		while (!(next = READ_ONCE(entry.next)))
+			cpu_relax();
+	}
+	smp_store_release(&next->done, 1);
 
+done:
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,17 +483,7 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
-	atomic_long_t expedited_start;		/* Starting ticket. */
-	atomic_long_t expedited_done;		/* Done ticket. */
-	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */
-	atomic_long_t expedited_tryfail;	/* # acquisition failures. */
-	atomic_long_t expedited_workdone1;	/* # done by others #1. */
-	atomic_long_t expedited_workdone2;	/* # done by others #2. */
-	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
-	atomic_long_t expedited_stoppedcpus;	/* # successful stop_cpus. */
-	atomic_long_t expedited_done_tries;	/* # tries to update _done. */
-	atomic_long_t expedited_done_lost;	/* # times beaten to _done. */
-	atomic_long_t expedited_done_exit;	/* # times exited _done loop. */
+	void *expedited_queue;
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:20                                   ` Paul E. McKenney
@ 2015-06-24 17:29                                     ` Peter Zijlstra
  0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:29 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 10:20:18AM -0700, Paul E. McKenney wrote:
> Except that I promised Ingo I would check for CPUs failing to schedule
> quickly enough, which means that I must track them individually rather
> than via a single counter...

You can track individual CPUs' timestamps by extending the per-cpu
storage we use for the exp_stop_work.
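
Something like the below, say -- only a sketch, the struct and field
names are made up:

	struct exp_stop_cpu {				/* hypothetical */
		struct cpu_stop_work work;		/* today's exp_stop_work */
		unsigned long queued_jiffies;		/* when the stop work was queued */
	};
	static DEFINE_PER_CPU(struct exp_stop_cpu, exp_stop_cpu);

Record jiffies when queueing the stop work and you can then tell, per
CPU, who failed to schedule quickly enough.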

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:28                                   ` Peter Zijlstra
@ 2015-06-24 17:32                                     ` Peter Zijlstra
  2015-06-24 18:14                                     ` Peter Zijlstra
  1 sibling, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:32 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 07:28:18PM +0200, Peter Zijlstra wrote:
> +unlock:
> +	/* MCS style queue 'unlock' */
> +	next = READ_ONCE(entry.next);
> +	if (!next) {
> +		if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
> +			goto done;
> +		while (!(next = READ_ONCE(entry.next)))
> +			cpu_relax();
> +	}
> +	smp_store_release(&next->done, 1);

Do you suppose:

	wake_up_process(next->task);

would help? :-)

>  
> +done:
>  	put_online_cpus();
>  }
>  EXPORT_SYMBOL_GPL(synchronize_sched_expedited);

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:10                                 ` Paul E. McKenney
  2015-06-24 17:20                                   ` Paul E. McKenney
  2015-06-24 17:28                                   ` Peter Zijlstra
@ 2015-06-24 17:58                                   ` Peter Zijlstra
  2015-06-25  3:23                                     ` Paul E. McKenney
  2 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 17:58 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 10:10:17AM -0700, Paul E. McKenney wrote:
> > The thing is, once you start bailing on this condition your 'queue'
> > drains very fast and this is around the same time sync_rcu() would've
> > released the waiters too.
> 
> In my experience, this sort of thing simply melts down on large systems.
> I am reworking this with multiple locks so as to keep the large-system
> contention down to a dull roar.

So with the MCS queue we've got less global thrashing than you had with
the start/done tickets. Only the queue head on enqueue.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:28                                   ` Peter Zijlstra
  2015-06-24 17:32                                     ` Peter Zijlstra
@ 2015-06-24 18:14                                     ` Peter Zijlstra
  1 sibling, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-24 18:14 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 07:28:18PM +0200, Peter Zijlstra wrote:
> How about something like this, it replaces the mutex and start/done ticket
> thing with an MCS style lockless FIFO queue.
> 
> It further uses the gpnum/completed thing to short circuit things if
> we've waited long enough.

Prettier version

--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3253,23 +3253,41 @@ void cond_synchronize_rcu(unsigned long
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+struct expedited_task_state {
+	struct expedited_task_state *next;
+	struct task_struct *task;
+	atomic_t count;
+	int done;
+};
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
+	struct expedited_task_state *ets = data;
+
 	/*
 	 * There must be a full memory barrier on each affected CPU
 	 * between the time that try_stop_cpus() is called and the
 	 * time that it returns.
-	 *
-	 * In the current initial implementation of cpu_stop, the
-	 * above condition is already met when the control reaches
-	 * this point and the following smp_mb() is not strictly
-	 * necessary.  Do smp_mb() anyway for documentation and
-	 * robustness against future implementation changes.
 	 */
-	smp_mb(); /* See above comment block. */
+	if (atomic_dec_and_test(&ets->count))
+		wake_up_process(ets->task);
+
 	return 0;
 }
 
+static DEFINE_PER_CPU(struct cpu_stop_work, exp_stop_work);
+
+#define current_wait(cond)					\
+do {								\
+	for (;;) {						\
+		set_current_state(TASK_UNINTERRUPTIBLE);	\
+		if (cond)					\
+			break;					\
+		schedule();					\
+	}							\
+	__set_current_state(TASK_RUNNING);			\
+} while (0)
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3304,138 +3322,71 @@ static int synchronize_sched_expedited_c
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
-	int cpu;
-	long firstsnap, s, snap;
-	int trycount = 0;
 	struct rcu_state *rsp = &rcu_sched_state;
+	struct expedited_task_state *prev, *next, entry = {
+		.task = current,
+		.count = ATOMIC_INIT(1), /* avoid spurious wakeups */
+	};
+	long gpnum;
+	int cpu;
 
-	/*
-	 * If we are in danger of counter wrap, just do synchronize_sched().
-	 * By allowing sync_sched_expedited_started to advance no more than
-	 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
-	 * that more than 3.5 billion CPUs would be required to force a
-	 * counter wrap on a 32-bit system.  Quite a few more CPUs would of
-	 * course be required on a 64-bit system.
-	 */
-	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
-			 (ulong)atomic_long_read(&rsp->expedited_done) +
-			 ULONG_MAX / 8)) {
-		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_wrap);
-		return;
-	}
-
-	/*
-	 * Take a ticket.  Note that atomic_inc_return() implies a
-	 * full memory barrier.
-	 */
-	snap = atomic_long_inc_return(&rsp->expedited_start);
-	firstsnap = snap;
 	if (!try_get_online_cpus()) {
 		/* CPU hotplug operation in flight, fall back to normal GP. */
 		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_normal);
 		return;
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
-		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
+	smp_mb();
+	gpnum = smp_load_acquire(&rsp->gpnum);
+
+	/* MCS style queue 'lock' */
+	prev = xchg(&rsp->expedited_queue, &entry);
+	if (prev) {
+		WRITE_ONCE(prev->next, &entry);
+		current_wait(smp_load_acquire(&entry.done));
 	}
 
 	/*
-	 * Each pass through the following loop attempts to force a
-	 * context switch on each CPU.
+	 * Check to see if someone else did our work for us, while we were
+	 * waiting on the queue.
 	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		atomic_long_inc(&rsp->expedited_tryfail);
-
-		/* Check to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
-			return;
-		}
-
-		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10) {
-			udelay(trycount * num_online_cpus());
-		} else {
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-
-		/* Recheck to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
-			return;
-		}
-
-		/*
-		 * Refetching sync_sched_expedited_started allows later
-		 * callers to piggyback on our grace period.  We retry
-		 * after they started, so our grace period works for them,
-		 * and they started after our first try, so their grace
-		 * period works for us.
-		 */
-		if (!try_get_online_cpus()) {
-			/* CPU hotplug operation in flight, use normal GP. */
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-		snap = atomic_long_read(&rsp->expedited_start);
-		smp_mb(); /* ensure read is before try_stop_cpus(). */
+	if (ULONG_CMP_LT(gpnum, smp_load_acquire(&rsp->completed)))
+		goto unlock;
+
+	/* Stop each CPU that is online, non-idle, and not us. */
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+		/* Skip our CPU, */
+		if (raw_smp_processor_id() == cpu)
+			continue;
+
+		/* and any idle CPUs. */
+		if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
+
+		atomic_inc(&entry.count);
+		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+				&entry, &per_cpu(exp_stop_work, cpu));
 	}
-	atomic_long_inc(&rsp->expedited_stoppedcpus);
 
-all_cpus_idle:
-	free_cpumask_var(cm);
+	atomic_dec(&entry.count); /* let the wakeups in */
+	current_wait(!atomic_read(&entry.count));
 
-	/*
-	 * Everyone up to our most recent fetch is covered by our grace
-	 * period.  Update the counter, but only if our work is still
-	 * relevant -- which it won't be if someone who started later
-	 * than we did already did their update.
-	 */
-	do {
-		atomic_long_inc(&rsp->expedited_done_tries);
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_done_lost);
-			break;
-		}
-	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
-	atomic_long_inc(&rsp->expedited_done_exit);
+unlock:
+	/* MCS style queue 'unlock' */
+	next = READ_ONCE(entry.next);
+	if (!next) {
+		if (cmpxchg(&rsp->expedited_queue, &entry, NULL) == &entry)
+			goto done;
+		while (!(next = READ_ONCE(entry.next)))
+			cpu_relax();
+	}
+	smp_store_release(&next->done, 1);
+	wake_up_process(next->task);
 
+done:
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -483,17 +483,7 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
-	atomic_long_t expedited_start;		/* Starting ticket. */
-	atomic_long_t expedited_done;		/* Done ticket. */
-	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */
-	atomic_long_t expedited_tryfail;	/* # acquisition failures. */
-	atomic_long_t expedited_workdone1;	/* # done by others #1. */
-	atomic_long_t expedited_workdone2;	/* # done by others #2. */
-	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
-	atomic_long_t expedited_stoppedcpus;	/* # successful stop_cpus. */
-	atomic_long_t expedited_done_tries;	/* # tries to update _done. */
-	atomic_long_t expedited_done_lost;	/* # times beaten to _done. */
-	atomic_long_t expedited_done_exit;	/* # times exited _done loop. */
+	void *expedited_queue;
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-24 17:58                                   ` Peter Zijlstra
@ 2015-06-25  3:23                                     ` Paul E. McKenney
  2015-06-25 11:07                                       ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-25  3:23 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 07:58:30PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 10:10:17AM -0700, Paul E. McKenney wrote:
> > > The thing is, once you start bailing on this condition your 'queue'
> > > drains very fast and this is around the same time sync_rcu() would've
> > > released the waiters too.
> > 
> > In my experience, this sort of thing simply melts down on large systems.
> > I am reworking this with multiple locks so as to keep the large-system
> > contention down to a dull roar.
> 
> So with the MCS queue we've got less global thrashing than you had with
> the start/done tickets. Only the queue head on enqueue.

Here is what I had in mind, where you don't have any global thrashing
except when the ->expedited_sequence gets updated.  Passes mild rcutorture
testing.

Still needs asynchronous CPU stoppage and stall warnings and trace
documentation updates.  Plus fixes for whatever bugs show up.

							Thanx, Paul

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78d0a87ff354..887370b7e52a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,7 @@ MODULE_ALIAS("rcutree");
 
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
 
 /*
  * In order to export the rcu_state name to the tracing tools, it
@@ -3323,6 +3324,22 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	return 0;
 }
 
+/* Common code for synchronize_sched_expedited() work-done checking. */
+static bool sync_sched_exp_wd(struct rcu_state *rsp, struct rcu_node *rnp,
+			      atomic_long_t *stat, unsigned long s)
+{
+	if (ULONG_CMP_GE(READ_ONCE(rsp->expedited_sequence), s)) {
+		if (rnp)
+			mutex_unlock(&rnp->exp_funnel_mutex);
+		/* Ensure test happens before caller kfree(). */
+		smp_mb__before_atomic(); /* ^^^ */
+		atomic_long_inc(stat);
+		put_online_cpus();
+		return true;
+	}
+	return false;
+}
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3334,58 +3351,24 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * restructure your code to batch your updates, and then use a single
  * synchronize_sched() instead.
  *
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word.  Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs.  If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period.  We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done.  If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot.  In this case, our work is
- * done for us, and we can simply return.  Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
  */
 void synchronize_sched_expedited(void)
 {
-	cpumask_var_t cm;
-	bool cma = false;
 	int cpu;
-	long firstsnap, s, snap;
-	int trycount = 0;
+	long s;
 	struct rcu_state *rsp = &rcu_sched_state;
+	struct rcu_node *rnp0;
+	struct rcu_node *rnp1 = NULL;
 
-	/*
-	 * If we are in danger of counter wrap, just do synchronize_sched().
-	 * By allowing sync_sched_expedited_started to advance no more than
-	 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
-	 * that more than 3.5 billion CPUs would be required to force a
-	 * counter wrap on a 32-bit system.  Quite a few more CPUs would of
-	 * course be required on a 64-bit system.
-	 */
-	if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
-			 (ulong)atomic_long_read(&rsp->expedited_done) +
-			 ULONG_MAX / 8)) {
-		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_wrap);
-		return;
-	}
+	/* Take a snapshot of the sequence number.  */
+	smp_mb(); /* Caller's modifications seen first by other CPUs. */
+	s = (READ_ONCE(rsp->expedited_sequence) + 3) & ~0x1;
+	smp_mb(); /* Above access must not bleed into critical section. */
 
-	/*
-	 * Take a ticket.  Note that atomic_inc_return() implies a
-	 * full memory barrier.
-	 */
-	snap = atomic_long_inc_return(&rsp->expedited_start);
-	firstsnap = snap;
 	if (!try_get_online_cpus()) {
 		/* CPU hotplug operation in flight, fall back to normal GP. */
 		wait_rcu_gp(call_rcu_sched);
@@ -3394,100 +3377,47 @@ void synchronize_sched_expedited(void)
 	}
 	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
 
-	/* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
-	cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
-	if (cma) {
-		cpumask_copy(cm, cpu_online_mask);
-		cpumask_clear_cpu(raw_smp_processor_id(), cm);
-		for_each_cpu(cpu, cm) {
-			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-			if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-				cpumask_clear_cpu(cpu, cm);
-		}
-		if (cpumask_weight(cm) == 0)
-			goto all_cpus_idle;
-	}
-
 	/*
-	 * Each pass through the following loop attempts to force a
-	 * context switch on each CPU.
+	 * Each pass through the following loop works its way
+	 * up the rcu_node tree, returning if others have done the
+	 * work or otherwise falls through holding the root rnp's
+	 * ->exp_funnel_mutex.  The mapping from CPU to rcu_node structure
+	 * can be inexact, as it is just promoting locality and is not
+	 * strictly needed for correctness.
 	 */
-	while (try_stop_cpus(cma ? cm : cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		atomic_long_inc(&rsp->expedited_tryfail);
-
-		/* Check to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone1);
-			free_cpumask_var(cm);
+	rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+		if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
 			return;
-		}
+		mutex_lock(&rnp0->exp_funnel_mutex);
+		if (rnp1)
+			mutex_unlock(&rnp1->exp_funnel_mutex);
+		rnp1 = rnp0;
+	}
+	rnp0 = rnp1;  /* rcu_get_root(rsp), AKA root rcu_node structure. */
+	if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
+		return;
 
-		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10) {
-			udelay(trycount * num_online_cpus());
-		} else {
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
+	WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
+	smp_mb(); /* Ensure expedited GP seen after counter increment. */
+	WARN_ON_ONCE(!(rsp->expedited_sequence & 0x1));
 
-		/* Recheck to see if someone else did our work for us. */
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_workdone2);
-			free_cpumask_var(cm);
-			return;
-		}
+	/* Stop each CPU that is online, non-idle, and not us. */
+	for_each_online_cpu(cpu) {
+		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
 
-		/*
-		 * Refetching sync_sched_expedited_started allows later
-		 * callers to piggyback on our grace period.  We retry
-		 * after they started, so our grace period works for them,
-		 * and they started after our first try, so their grace
-		 * period works for us.
-		 */
-		if (!try_get_online_cpus()) {
-			/* CPU hotplug operation in flight, use normal GP. */
-			wait_rcu_gp(call_rcu_sched);
-			atomic_long_inc(&rsp->expedited_normal);
-			free_cpumask_var(cm);
-			return;
-		}
-		snap = atomic_long_read(&rsp->expedited_start);
-		smp_mb(); /* ensure read is before try_stop_cpus(). */
+		/* Skip our CPU and any idle CPUs. */
+		if (raw_smp_processor_id() == cpu ||
+		    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+			continue;
+		stop_one_cpu(cpu, synchronize_sched_expedited_cpu_stop, NULL);
 	}
-	atomic_long_inc(&rsp->expedited_stoppedcpus);
-
-all_cpus_idle:
-	free_cpumask_var(cm);
 
-	/*
-	 * Everyone up to our most recent fetch is covered by our grace
-	 * period.  Update the counter, but only if our work is still
-	 * relevant -- which it won't be if someone who started later
-	 * than we did already did their update.
-	 */
-	do {
-		atomic_long_inc(&rsp->expedited_done_tries);
-		s = atomic_long_read(&rsp->expedited_done);
-		if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
-			/* ensure test happens before caller kfree */
-			smp_mb__before_atomic(); /* ^^^ */
-			atomic_long_inc(&rsp->expedited_done_lost);
-			break;
-		}
-	} while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
-	atomic_long_inc(&rsp->expedited_done_exit);
+	smp_mb(); /* Ensure expedited GP seen before counter increment. */
+	WRITE_ONCE(rsp->expedited_sequence, rsp->expedited_sequence + 1);
+	WARN_ON_ONCE(rsp->expedited_sequence & 0x1);
+	mutex_unlock(&rnp0->exp_funnel_mutex);
+	smp_mb(); /* ensure subsequent action seen after grace period. */
 
 	put_online_cpus();
 }
@@ -4043,6 +3973,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 {
 	static const char * const buf[] = RCU_NODE_NAME_INIT;
 	static const char * const fqs[] = RCU_FQS_NAME_INIT;
+	static const char * const exp[] = RCU_EXP_NAME_INIT;
 	static u8 fl_mask = 0x1;
 
 	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */
@@ -4101,6 +4032,9 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 			rnp->level = i;
 			INIT_LIST_HEAD(&rnp->blkd_tasks);
 			rcu_init_one_nocb(rnp);
+			mutex_init(&rnp->exp_funnel_mutex);
+			lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+						   &rcu_exp_class[i], exp[i]);
 		}
 	}
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index de22d6d06bf9..f0f4dd96dd73 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -68,6 +68,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0" }
 #elif NR_CPUS <= RCU_FANOUT_2
 #  define RCU_NUM_LVLS	      2
 #  define NUM_RCU_LVL_0	      1
@@ -76,6 +77,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1" }
 #elif NR_CPUS <= RCU_FANOUT_3
 #  define RCU_NUM_LVLS	      3
 #  define NUM_RCU_LVL_0	      1
@@ -85,6 +87,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
 #elif NR_CPUS <= RCU_FANOUT_4
 #  define RCU_NUM_LVLS	      4
 #  define NUM_RCU_LVL_0	      1
@@ -95,6 +98,7 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
+#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
 #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -237,6 +241,8 @@ struct rcu_node {
 	int need_future_gp[2];
 				/* Counts of upcoming no-CB GP requests. */
 	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
+
+	struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -478,17 +484,11 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
-	atomic_long_t expedited_start;		/* Starting ticket. */
-	atomic_long_t expedited_done;		/* Done ticket. */
-	atomic_long_t expedited_wrap;		/* # near-wrap incidents. */
+	unsigned long expedited_sequence;	/* Take a ticket. */
 	atomic_long_t expedited_tryfail;	/* # acquisition failures. */
 	atomic_long_t expedited_workdone1;	/* # done by others #1. */
 	atomic_long_t expedited_workdone2;	/* # done by others #2. */
 	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
-	atomic_long_t expedited_stoppedcpus;	/* # successful stop_cpus. */
-	atomic_long_t expedited_done_tries;	/* # tries to update _done. */
-	atomic_long_t expedited_done_lost;	/* # times beaten to _done. */
-	atomic_long_t expedited_done_exit;	/* # times exited _done loop. */
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..d2aab8dcd58e 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,18 +185,13 @@ static int show_rcuexp(struct seq_file *m, void *v)
 {
 	struct rcu_state *rsp = (struct rcu_state *)m->private;
 
-	seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n",
-		   atomic_long_read(&rsp->expedited_start),
-		   atomic_long_read(&rsp->expedited_done),
-		   atomic_long_read(&rsp->expedited_wrap),
+	seq_printf(m, "t=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu\n",
+		   rsp->expedited_sequence,
 		   atomic_long_read(&rsp->expedited_tryfail),
 		   atomic_long_read(&rsp->expedited_workdone1),
 		   atomic_long_read(&rsp->expedited_workdone2),
 		   atomic_long_read(&rsp->expedited_normal),
-		   atomic_long_read(&rsp->expedited_stoppedcpus),
-		   atomic_long_read(&rsp->expedited_done_tries),
-		   atomic_long_read(&rsp->expedited_done_lost),
-		   atomic_long_read(&rsp->expedited_done_exit));
+		   rsp->expedited_sequence / 2);
 	return 0;
 }
 


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-25  3:23                                     ` Paul E. McKenney
@ 2015-06-25 11:07                                       ` Peter Zijlstra
  2015-06-25 13:47                                         ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 11:07 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jun 24, 2015 at 08:23:17PM -0700, Paul E. McKenney wrote:
> Here is what I had in mind, where you don't have any global thrashing
> except when the ->expedited_sequence gets updated.  Passes mild rcutorture
> testing.

>  	/*
> +	 * Each pass through the following loop works its way
> +	 * up the rcu_node tree, returning if others have done the
> +	 * work or otherwise falls through holding the root rnp's
> +	 * ->exp_funnel_mutex.  The mapping from CPU to rcu_node structure
> +	 * can be inexact, as it is just promoting locality and is not
> +	 * strictly needed for correctness.
>  	 */
> +	rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
> +	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
> +		if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
>  			return;
> +		mutex_lock(&rnp0->exp_funnel_mutex);
> +		if (rnp1)
> +			mutex_unlock(&rnp1->exp_funnel_mutex);
> +		rnp1 = rnp0;
> +	}
> +	rnp0 = rnp1;  /* rcu_get_root(rsp), AKA root rcu_node structure. */
> +	if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
> +		return;

I'm still somewhat confused by the whole strict order sequence vs this
non ordered 'polling' of global state.

This funnel thing basically waits random times depending on the
contention of these mutexes and tries again. Ultimately serializing on
the root funnel thing.

So on the one hand you have to strictly order these expedited caller,
but then you don't want to actually process them in order. If 'by magic'
you manage to process the 3rd in queue, you can drop the 2nd because it
will have waited long enough. OTOH the 2nd will have waited too long.

You also do not take the actual RCU state machine into account -- this
is a parallel state.

Can't we integrate the force quiescent state machinery with the
expedited machinery -- that is instead of building a parallel state, use
the expedited thing to push the regular machine forward?

We can use the stop_machine calls to force the local RCU state forward,
after all, we _know_ we just made a context switch into the stopper
thread. All we need to do is disable interrupts to hold off the tick
(which normally drives the state machine) and just unconditionally
advance our state.
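
Concretely, something along these lines -- a sketch only, where
rcu_force_local_qs() is a made-up name for whatever hook ends up
reporting the quiescent state:

	static int rcu_exp_advance_cpu_stop(void *data)
	{
		unsigned long flags;

		/*
		 * IRQs off holds off the tick; the context switch into
		 * the stopper thread already is a quiescent state, so
		 * just report it unconditionally.
		 */
		local_irq_save(flags);
		rcu_force_local_qs();	/* made-up name, see above */
		local_irq_restore(flags);

		return 0;
	}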

If we use the regular GP machinery, you also don't have to strongly
order the callers, just stick them on whatever GP was active when they
came in and let them roll, this allows much better (and more natural)
concurrent processing.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-25 11:07                                       ` Peter Zijlstra
@ 2015-06-25 13:47                                         ` Paul E. McKenney
  2015-06-25 14:20                                           ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-25 13:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Thu, Jun 25, 2015 at 01:07:34PM +0200, Peter Zijlstra wrote:
> On Wed, Jun 24, 2015 at 08:23:17PM -0700, Paul E. McKenney wrote:
> > Here is what I had in mind, where you don't have any global thrashing
> > except when the ->expedited_sequence gets updated.  Passes mild rcutorture
> > testing.
> 
> >  	/*
> > +	 * Each pass through the following loop works its way
> > +	 * up the rcu_node tree, returning if others have done the
> > +	 * work or otherwise falls through holding the root rnp's
> > +	 * ->exp_funnel_mutex.  The mapping from CPU to rcu_node structure
> > +	 * can be inexact, as it is just promoting locality and is not
> > +	 * strictly needed for correctness.
> >  	 */
> > +	rnp0 = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
> > +	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
> > +		if (sync_sched_exp_wd(rsp, rnp1, &rsp->expedited_workdone1, s))
> >  			return;
> > +		mutex_lock(&rnp0->exp_funnel_mutex);
> > +		if (rnp1)
> > +			mutex_unlock(&rnp1->exp_funnel_mutex);
> > +		rnp1 = rnp0;
> > +	}
> > +	rnp0 = rnp1;  /* rcu_get_root(rsp), AKA root rcu_node structure. */
> > +	if (sync_sched_exp_wd(rsp, rnp0, &rsp->expedited_workdone2, s))
> > +		return;
> 
> I'm still somewhat confused by the whole strict order sequence vs this
> non ordered 'polling' of global state.
> 
> This funnel thing basically waits random times depending on the
> contention of these mutexes and tries again. Ultimately serializing on
> the root funnel thing.

Not random at all!

The whole funnel is controlled by the root ->exp_funnel_mutex holder,
who is going to hold the lock for a single expedited grace period, then
release it.  This means that any time a task acquires a lock, there is
very likely to have been a recent state change.  Hence the checks after
each lock acquisition.

So in the heavy-use case, what tends to happen is that there are one
or two expedited grace periods, and then the entire queue of waiters
acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
the expedited grace period whose completion resulted in their acquisition
completing and thus them being awakened.  No fuss, no muss, no unnecessary
contention or cache thrashing.

> So on the one hand you have to strictly order these expedited callers,
> but then you don't want to actually process them in order. If 'by magic'
> you manage to process the 3rd in queue, you can drop the 2nd because it
> will have waited long enough. OTOH the 2nd will have waited too long.

Let's take the example of a 4096-CPU system with default configuration of
CONFIG_RCU_FANOUT=64 and CONFIG_RCU_FANOUT_LEAF=16.  There will then be
256 leaf rcu_node structures, each of which is subordinate to one of four
internal rcu_node structures, each of which is subordinate to the root
rcu_node structure.  There can then be up to 260 tasks waiting on non-root
rcu_node ->exp_funnel_mutex, with an additional task holding the root
rcu_node ->exp_funnel_mutex and carrying out an expedited grace period.
Once that grace period completes, one of the tasks holding an internal
->exp_funnel_mutex acquires the root ->exp_funnel_mutex.  If it can use
the just-completed grace period, it releases its ->exp_funnel_mutex,
and the cycle repeats, until the queue drains.  If not, then it will
carry out another grace period, perhaps making some of the queue wait
unnecessarily -- but that can happen in the strictly queued case as well,
due to delays between snapshotting the counter and getting on the queue.
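
Spelling out the arithmetic behind those numbers (default fanouts as
above, so this is illustrative for that one configuration only):

	leaf rcu_node structures:      4096 CPUs / 16 per leaf   = 256
	internal rcu_node structures:   256 leaves / 64 per node =   4
	root rcu_node structure:                                     1
	non-root ->exp_funnel_mutex waiters:   256 + 4           = 260 max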

The key advantage of the funnel approach is that many tasks can be
concurrently discovering that the grace period they need has already
happened.

Of course, if there are more than 260 tasks queued, the excess tasks will
queue on the leaf ->exp_funnel_mutex mutexes.  But they will eventually
start draining 256 at a time, in parallel.

And nothing comes for free.  In an idle system, the single task wanting
an expedited grace period must work its way up the rcu_node tree.  In
the 4096-CPU case with default configuration, it must acquire three
uncontended mutexes.  But this is way down in the noise compared to
the 4095 cache misses required to determine that all the rest of the
CPUs are idle.  So the funnel approach is a good tradeoff.

> You also do not take the actual RCU state machine into account -- this
> is a parallel state.
> 
> Can't we integrate the force quiescent state machinery with the
> expedited machinery -- that is instead of building a parallel state, use
> the expedited thing to push the regular machine forward?
> 
> We can use the stop_machine calls to force the local RCU state forward,
> after all, we _know_ we just made a context switch into the stopper
> thread. All we need to do is disable interrupts to hold off the tick
> (which normally drives the state machine) and just unconditionally
> advance our state.
> 
> If we use the regular GP machinery, you also don't have to strongly
> order the callers, just stick them on whatever GP was active when they
> came in and let them roll, this allows much better (and more natural)
> concurrent processing.

That gets quite complex, actually.  Lots of races with the normal grace
periods doing one thing or another.

However, it should be quite easy to go the other way and make the normal
grace-period processing take advantage of expedited grace periods that
happened to occur at the right time.  I will look into this, thank you
for the nudge!

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-25 13:47                                         ` Paul E. McKenney
@ 2015-06-25 14:20                                           ` Peter Zijlstra
  2015-06-25 14:51                                             ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 14:20 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Thu, Jun 25, 2015 at 06:47:55AM -0700, Paul E. McKenney wrote:
> On Thu, Jun 25, 2015 at 01:07:34PM +0200, Peter Zijlstra wrote:
> > I'm still somewhat confused by the whole strict order sequence vs this
> > non ordered 'polling' of global state.
> > 
> > This funnel thing basically waits random times depending on the
> > contention of these mutexes and tries again. Ultimately serializing on
> > the root funnel thing.
> 
> Not random at all!

No, they are random per definition: it depends on the amount of
contention, and since that's random, the rest is too.

> The whole funnel is controlled by the root ->exp_funnel_mutex holder,
> who is going to hold the lock for a single expedited grace period, then
> release it.  This means that any time a task acquires a lock, there is
> very likely to have been a recent state change.  Hence the checks after
> each lock acquisition.
> 
> So in the heavy-use case, what tends to happen is that there are one
> or two expedited grace periods, and then the entire queue of waiters
> acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
> the expedited grace period whose completion resulted in their acquisition
> completing and thus them being awakened.  No fuss, no muss, no unnecessary
> contention or cache thrashing.

Plenty of cache thrashing, since your 'tree' is not at all cache aligned
or even remotely coherent with the actual machine topology -- I'll keep
reminding you :-)

But I must admit that the workings of the sequence thing eluded me this
morning. Yes that's much better than the strict ticket order of before.

> > You also do not take the actual RCU state machine into account -- this
> > is a parallel state.
> > 
> > Can't we integrate the force quiescent state machinery with the
> > expedited machinery -- that is instead of building a parallel state, use
> > the expedited thing to push the regular machine forward?
> > 
> > We can use the stop_machine calls to force the local RCU state forward,
> > after all, we _know_ we just made a context switch into the stopper
> > thread. All we need to do is disable interrupts to hold off the tick
> > (which normally drives the state machine) and just unconditionally
> > advance our state.
> > 
> > If we use the regular GP machinery, you also don't have to strongly
> > order the callers, just stick them on whatever GP was active when they
> > came in and let them roll, this allows much better (and more natural)
> > concurrent processing.
> 
> That gets quite complex, actually.  Lots of races with the normal grace
> periods doing one thing or another.

How so? I'm probably missing several years of RCU trickery and detail
again, but since we can advance from the tick, we should be able to
advance from the stop work with IRQs disabled with equal ease.

And since the stop work and the tick are fully serialized, there cannot
be any races there.

And the stop work against other CPUs involves the exact same races you
already had with tick vs tick.

So please humour me and explain how all this is far more complicated ;-)

> However, it should be quite easy to go the other way and make the normal
> grace-period processing take advantage of expedited grace periods that
> happened to occur at the right time.  I will look into this, thank you
> for the nudge!

That should already be happening, right? Since we force context
switches, the tick driven RCU state machine will observe those and make
progress -- assuming it was trying to make progress at all of course.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-25 14:20                                           ` Peter Zijlstra
@ 2015-06-25 14:51                                             ` Paul E. McKenney
  2015-06-26 12:32                                               ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-25 14:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Thu, Jun 25, 2015 at 04:20:11PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 25, 2015 at 06:47:55AM -0700, Paul E. McKenney wrote:
> > On Thu, Jun 25, 2015 at 01:07:34PM +0200, Peter Zijlstra wrote:
> > > I'm still somewhat confused by the whole strict order sequence vs this
> > > non ordered 'polling' of global state.
> > > 
> > > This funnel thing basically waits random times depending on the
> > > contention of these mutexes and tries again. Ultimately serializing on
> > > the root funnel thing.
> > 
> > Not random at all!
> 
> No, they are random per definition: it depends on the amount of
> contention, and since that's random, the rest is too.

Not sure how to parse this one.  ;-)

> > The whole funnel is controlled by the root ->exp_funnel_mutex holder,
> > who is going to hold the lock for a single expedited grace period, then
> > release it.  This means that any time a task acquires a lock, there is
> > very likely to have been a recent state change.  Hence the checks after
> > each lock acquisition.
> > 
> > So in the heavy-use case, what tends to happen is that there are one
> > or two expedited grace periods, and then the entire queue of waiters
> > acquiring ->exp_funnel_mutex simply evaporates -- they can make use of
> > the expedited grace period whose completion resulted in their acquisition
> > completing and thus them being awakened.  No fuss, no muss, no unnecessary
> > contention or cache thrashing.
> 
> Plenty of cache thrashing, since your 'tree' is not at all cache aligned
> or even remotely coherent with the actual machine topology -- I'll keep
> reminding you :-)

And, as I keep reminding you, if you actually show me system-level data
demonstrating that this is a real problem, I might consider taking some
action.  And also reminding you that in the meantime, you can experiment
by setting the fanout sizes to match a given system and see if it makes
any visible difference.  (Yes, I do understand the odd numbering of
hyperthreads, but you can still run a reasonable experiment.)

> But I must admit that the workings of the sequence thing eluded me this
> morning. Yes that's much better than the strict ticket order of before.

OK, good!

> > > You also do not take the actual RCU state machine into account -- this
> > > is a parallel state.
> > > 
> > > Can't we integrate the force quiescent state machinery with the
> > > expedited machinery -- that is instead of building a parallel state, use
> > > the expedited thing to push the regular machine forward?
> > > 
> > > We can use the stop_machine calls to force the local RCU state forward,
> > > after all, we _know_ we just made a context switch into the stopper
> > > thread. All we need to do is disable interrupts to hold off the tick
> > > (which normally drives the state machine) and just unconditionally
> > > advance our state.
> > > 
> > > If we use the regular GP machinery, you also don't have to strongly
> > > order the callers, just stick them on whatever GP was active when they
> > > came in and let them roll, this allows much better (and more natural)
> > > concurrent processing.
> > 
> > That gets quite complex, actually.  Lots of races with the normal grace
> > periods doing one thing or another.
> 
> How so? I'm probably missing several years of RCU trickery and detail
> again, but since we can advance from the tick, we should be able to
> advance from the stop work with IRQs disabled with equal ease.
> 
> And since the stop work and the tick are fully serialized, there cannot
> be any races there.
> 
> And the stop work against other CPUs involves the exact same races you
> already had with tick vs tick.
> 
> So please humour me and explain how all this is far more complicated ;-)

Yeah, I do need to get RCU design/implementation documentation put together.

In the meantime, RCU's normal grace-period machinery is designed to be
quite loosely coupled.  The idea is that almost all actions occur locally,
reducing contention and cache thrashing.  But an expedited grace period
needs tight coupling in order to be able to complete quickly.  Making
something that switches between loose and tight coupling in short order
is not at all simple.

> > However, it should be quite easy to go the other way and make the normal
> > grace-period processing take advantage of expedited grace periods that
> > happened to occur at the right time.  I will look into this, thank you
> > for the nudge!
> 
> That should already be happening, right? Since we force context
> switches, the tick driven RCU state machine will observe those and make
> progress -- assuming it was trying to make progress at all of course.

It is to an extent, but I believe that I can do better.  On the other hand,
it is quite possible that this is a 6AM delusion on my part.  ;-)

If it is not a delusion, the eventual solution will likely be a much more
satisfying answer to your "why not merge into the normal RCU grace period
machinery" question.  But I need to complete reworking the expedited
machinery first!

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-23  7:28   ` Nicholas Mc Guire
@ 2015-06-25 19:08     ` Peter Zijlstra
  2015-06-25 19:17       ` Tejun Heo
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 19:08 UTC (permalink / raw)
  To: Nicholas Mc Guire
  Cc: oleg, paulmck, tj, mingo, linux-kernel, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 09:28:11AM +0200, Nicholas Mc Guire wrote:
> 
> A bit off-topic probably
> but maybe this should not be in kernel/locking/percpu-rwsem.c but in a
> generic percpu location as this construct is present in the core a few times
> atleast in:
>  kernel/irq/irqdesc.c:kstat_irqs

>  kernel/fork.c:nr_processes

That has an odd unsigned long vs int fail, but yes.

>  mm/memcontrol.c:mem_cgroup_read_events
>  mm/memcontrol.c:mem_cgroup_read_stat

Those seem to be hotplug challenged. I'm thinking dropping that
nocpu_base.count[] crap and just iterating all possible CPUs would've
been much easier.

> > +#define per_cpu_sum(var)                                             \
> > +({                                                                   \
> > +     typeof(var) __sum = 0;                                          \
> > +     int cpu;                                                        \
> > +     for_each_possible_cpu(cpu)                                      \
> > +             __sum += per_cpu(var, cpu);                             \
> > +     __sum;                                                          \
> > +})
> > +
> 
> so maybe put it into include/linux/percpu.h ?

Yes I can do that.

We can try and use it more after that; there seem to be loads of places
that could use this: fs/namespace.c, fs/inode.c, etc.
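
E.g. nr_processes() could then become something like the below -- just a
sketch, and it keeps the unsigned long vs int oddity mentioned above,
since process_counts is an unsigned long per-cpu counter:

	int nr_processes(void)
	{
		return per_cpu_sum(process_counts);
	}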

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-25 19:08     ` Peter Zijlstra
@ 2015-06-25 19:17       ` Tejun Heo
  2015-06-29  9:32         ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Tejun Heo @ 2015-06-25 19:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
	riel, viro, torvalds

Hello,

On Thu, Jun 25, 2015 at 09:08:00PM +0200, Peter Zijlstra wrote:
> >  mm/memcontrol.c:mem_cgroup_read_events
> >  mm/memcontrol.c:mem_cgroup_read_stat
> 
> Those seem to be hotplug challenged. I'm thinking dropping that
> nocpu_base.count[] crap and just iterating all possible CPUs would've
> been much easier.

A patch doing that is already queued for this merge window.  IIRC,
it's included as part of cgroup writeback updates.

> > > +#define per_cpu_sum(var)                                             \
> > > +({                                                                   \
> > > +     typeof(var) __sum = 0;                                          \
> > > +     int cpu;                                                        \
> > > +     for_each_possible_cpu(cpu)                                      \
> > > +             __sum += per_cpu(var, cpu);                             \
> > > +     __sum;                                                          \
> > > +})
> > > +
> > 
> > so maybe put it into include/linux/percpu.h ?

percpu-defs.h would be the better place for it.

> Yes I can do that.
> 
> We can try and use it more after that; there seem to be loads of places
> that could use this: fs/namespace.c, fs/inode.c, etc.

Hmmm... the only worry I have about this is people using it on u64 on
32bit machines.  CPU local ops can do split updates on the lower and
upper halves, and the remotely-read value will be surprising.  We have
the same issue w/ regular per_cpu accesses too, but the summing
function / macro is better at giving a false sense of security.
Probably limiting it to ulong size is a good idea?
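
For example, one way to enforce that -- a sketch only, on top of the
macro quoted above:

	#define per_cpu_sum(var)						\
	({									\
		typeof(var) __sum = 0;						\
		int cpu;							\
		BUILD_BUG_ON(sizeof(var) > sizeof(unsigned long));		\
		for_each_possible_cpu(cpu)					\
			__sum += per_cpu(var, cpu);				\
		__sum;								\
	})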

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-23 17:24         ` Oleg Nesterov
@ 2015-06-25 19:18           ` Peter Zijlstra
  0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-25 19:18 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On Tue, Jun 23, 2015 at 07:24:16PM +0200, Oleg Nesterov wrote:
> IOW. Suppose we add ->work_mutex into struct cpu_stopper. Btw,
> I think we should move all per-cpu variables there...
> 
> Now,
> 
> 	lock_stop_cpus_works(cpumask)
> 	{
> 		for_each_cpu(cpu, cpumask)
> 			mutex_lock(per_cpu(cpu_stopper_task, cpu).work_mutex);
> 	}
> 
> 	unlock_stop_cpus_works(cpumask)
> 	{
> 		for_each_cpu(cpu, cpumask)
> 			mutex_unlock(...);
> 	}
> 
> which should be used instead of stop_cpus_mutex. After this change
> stop_two_cpus() can just use stop_cpus().

Right, lockdep annotating that will be 'interesting' though. And
stop_two_cpus() then has the problem of allocating a cpumask. Simpler to
let it keep 'abusing' the queueing spinlock in there.

> Off-topic. Can't we make __stop_machine() static? The only caller,
> _cpu_down() can safely call stop_machine(), get_online_cpus() is
> fine under cpu_hotplug_begin().

Can do I think.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-25 14:51                                             ` Paul E. McKenney
@ 2015-06-26 12:32                                               ` Peter Zijlstra
  2015-06-26 16:14                                                 ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-26 12:32 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Thu, Jun 25, 2015 at 07:51:46AM -0700, Paul E. McKenney wrote:
> > So please humour me and explain how all this is far more complicated ;-)
> 
> Yeah, I do need to get RCU design/implementation documentation put together.
> 
> In the meantime, RCU's normal grace-period machinery is designed to be
> quite loosely coupled.  The idea is that almost all actions occur locally,
> reducing contention and cache thrashing.  But an expedited grace period
> needs tight coupling in order to be able to complete quickly.  Making
> something that switches between loose and tight coupling in short order
> is not at all simple.

But expedited just means faster, we never promised that
sync_rcu_expedited is the absolute fastest primitive ever.

So I really should go read the RCU code I suppose, but I don't get
what's wrong with starting a forced quiescent state, then doing the
stop_work spray, where each work will run the regular RCU tick thing to
push it forwards.

From my feeble memories, what I remember is that the last cpu to
complete a GP on a leaf node will push the completion up to the next
level, until at last we've reached the root of your tree and we can
complete the GP globally.

To me it just makes more sense to have a single RCU state machine. With
expedited we'll push it as fast as we can, but no faster.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-26 12:32                                               ` Peter Zijlstra
@ 2015-06-26 16:14                                                 ` Paul E. McKenney
  2015-06-29  7:56                                                   ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-26 16:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Fri, Jun 26, 2015 at 02:32:07PM +0200, Peter Zijlstra wrote:
> On Thu, Jun 25, 2015 at 07:51:46AM -0700, Paul E. McKenney wrote:
> > > So please humour me and explain how all this is far more complicated ;-)
> > 
> > Yeah, I do need to get RCU design/implementation documentation put together.
> > 
> > In the meantime, RCU's normal grace-period machinery is designed to be
> > quite loosely coupled.  The idea is that almost all actions occur locally,
> > reducing contention and cache thrashing.  But an expedited grace period
> > needs tight coupling in order to be able to complete quickly.  Making
> > something that switches between loose and tight coupling in short order
> > is not at all simple.
> 
> But expedited just means faster, we never promised that
> sync_rcu_expedited is the absolute fastest primitive ever.

Which is good, because given that it is doing something to each and
every CPU, it most assuredly won't in any way resemble the absolute
fastest primitive ever.  ;-)

> So I really should go read the RCU code I suppose, but I don't get
> what's wrong with starting a forced quiescent state, then doing the
> stop_work spray, where each work will run the regular RCU tick thing to
> push it forwards.
> 
> From my feeble memories, what I remember is that the last cpu to
> complete a GP on a leaf node will push the completion up to the next
> level, until at last we've reached the root of your tree and we can
> complete the GP globally.

That is true, the task that notices the last required quiescent state
will push up the tree and notice that the grace period has ended.
If that task is not the grace-period kthread, it will then awaken
the grace-period kthread.

> To me it just makes more sense to have a single RCU state machine. With
> expedited we'll push it as fast as we can, but no faster.

Suppose that someone invokes synchronize_sched_expedited(), but there
is no normal grace period in flight.  Then each CPU will note its own
quiescent state, but when it later might have tried to push it up the
tree, it will see that there is no grace period in effect, and will
therefore not bother.

OK, we could have synchronize_sched_expedited() tell the grace-period
kthread to start a grace period if one was not already in progress.
But that still isn't good enough, because the grace-period kthread will
take some time to initialize the new grace period, and if we hammer all
the CPUs before the initialization is complete, the resulting quiescent
states cannot be counted against the new grace period.  (The reason for
this is that there is some delay between the actual quiescent state
and the time that it is reported, so we have to be very careful not
to incorrectly report a quiescent state from an earlier grace period
against the current grace period.)

OK, the grace-period kthread could tell synchronize_sched_expedited()
when it has finished initializing the grace period, though this is
starting to get a bit on the Rube Goldberg side.  But this -still- is
not good enough, because even though the grace-period kthread has fully
initialized the new grace period, the individual CPUs are unaware of it.
And they will therefore continue to ignore any quiescent state that they
encounter, because they cannot prove that it actually happened after
the start of the current grace period.

OK, we could have some sort of indication when all CPUs become aware
of the new grace period by having them atomically manipulate a global
counter.  Presumably we have some flag indicating when this is and is
not needed so that we avoid the killer memory contention in the common
case where it is not needed.  But this -still- isn't good enough, because
idle CPUs never will become aware of the new grace period -- by design,
as they are supposed to be able to sleep through an arbitrary number of
grace periods.

OK, so we could have some sort of indication when all non-idle CPUs
become aware of the new grace period.  But there could be races where
an idle CPU suddenly becomes non-idle just after it was reported that
all non-idle CPUs were aware of the grace period.  This would result
in a hang, because this newly non-idle CPU might not have noticed
the new grace period at the time that synchronize_sched_expedited()
hammers it, which would mean that this newly non-idle CPU would refuse
to report the resulting quiescent state.

OK, so the grace-period kthread could track and report the set of CPUs
that had ever been idle since synchronize_sched_expedited() contacted it.
But holy overhead Batman!!!

And that is just one of the possible interactions with the grace-period
kthread.  It might be in the middle of setting up a new grace period.
It might be in the middle of cleaning up after the last grace period.
It might be waiting for a grace period to complete, and the last quiescent
state was just reported, but hasn't propagated all the way up yet.  All
of these would need to be handled correctly, and a number of them would
be as messy as the above scenario.  Some might be even more messy.

I feel like there is a much easier way, but cannot yet articulate it.
I came across a couple of complications and a blind alley with it thus
far, but it still looks promising.  I expect to be able to generate
actual code for it within a few days, but right now it is just weird
abstract shapes in my head.  (Sorry, if I knew how to describe them,
I could just write the code!  When I do write the code, it will probably
seem obvious and trivial, that being the usual outcome...)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode
  2015-06-24 13:50           ` Oleg Nesterov
  2015-06-24 14:13             ` Peter Zijlstra
@ 2015-06-28 23:56             ` Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
                                 ` (2 more replies)
  1 sibling, 3 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, riel, viro, torvalds

On 06/24, Oleg Nesterov wrote:
>
> So we need percpu_down_write_dont_block_readers(). I already thought
> about this before, I'll try to make the patch tomorrow on top of your
> changes.

Never say tomorrow...

> This means that we do not need task_struct->cpuhp_ref, but we can't
> avoid livelock we currently have: cpu_hotplug_begin() can never succeed
> if the new readers come fast enough.

Like with any other "recursive" lock.

Peter, I know you don't like the 1st patch. And yes, we could add another
mutex into percpu_rw_semaphore instead. But I think it would be better
to rely on rcu_sync_enter(). As for completion, we can remove it later.
Nevermind, the actual change is 3/3 and it looks simple.

Oleg.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode
  2015-06-28 23:56             ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
@ 2015-06-28 23:56               ` Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
  2 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, der.herr, dave, riel, viro, torvalds, linux-kernel

Add rcu_sync_struct->exclusive boolean set by rcu_sync_init(), it
obviously controls the exclusiveness of rcu_sync_enter(). This is
what percpu_down_write() actually wants.

We turn ->gp_wait into "struct completion gp_comp", it is used as
a resource counter in "exclusive" mode. Otherwise we only use its
completion->wait member for wait_event/wake_up_all. We never mix
the completion/wait_queue_head_t operations.

TODO: we can cleanup this logic and avoid "struct completion", but
this needs a bit more changes.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/percpu-rwsem.h  |    2 +-
 include/linux/rcusync.h       |   29 ++++++++++++++++-------------
 kernel/locking/percpu-rwsem.c |    2 +-
 kernel/rcu/sync.c             |   25 ++++++++++++++++++++-----
 4 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index e12ce86..9202e73 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -21,7 +21,7 @@ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name);	\
 static struct percpu_rw_semaphore name = {				\
 	.refcount = &__percpu_rwsem_refcount_##name,			\
 	.state = 0,							\
-	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC),	\
+	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC, 1),	\
 	.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),		\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 }
diff --git a/include/linux/rcusync.h b/include/linux/rcusync.h
index 0135838..aaea86a 100644
--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -1,7 +1,7 @@
 #ifndef _LINUX_RCUSYNC_H_
 #define _LINUX_RCUSYNC_H_
 
-#include <linux/wait.h>
+#include <linux/completion.h>
 #include <linux/rcupdate.h>
 
 enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
@@ -9,11 +9,12 @@ enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
 struct rcu_sync_struct {
 	int			gp_state;
 	int			gp_count;
-	wait_queue_head_t	gp_wait;
+	struct completion	gp_comp;
 
 	int			cb_state;
 	struct rcu_head		cb_head;
 
+	bool			exclusive;
 	enum rcu_sync_type	gp_type;
 };
 
@@ -28,30 +29,32 @@ static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
 #endif
 }
 
-extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_init(struct rcu_sync_struct *,
+				enum rcu_sync_type, bool excl);
 extern void rcu_sync_enter(struct rcu_sync_struct *);
 extern void rcu_sync_exit(struct rcu_sync_struct *);
 extern void rcu_sync_dtor(struct rcu_sync_struct *);
 
-#define __RCU_SYNC_INITIALIZER(name, type) {				\
+#define __RCU_SYNC_INITIALIZER(name, type, excl) {			\
 		.gp_state = 0,						\
 		.gp_count = 0,						\
-		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
+		.gp_comp = COMPLETION_INITIALIZER(name.gp_comp),	\
 		.cb_state = 0,						\
+		.exclusive = excl,					\
 		.gp_type = type,					\
 	}
 
-#define	__DEFINE_RCU_SYNC(name, type)	\
-	struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+#define	__DEFINE_RCU_SYNC(name, type, excl)	\
+	struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type, excl)
 
-#define DEFINE_RCU_SYNC(name)		\
-	__DEFINE_RCU_SYNC(name, RCU_SYNC)
+#define DEFINE_RCU_SYNC(name, excl)		\
+	__DEFINE_RCU_SYNC(name, RCU_SYNC, excl)
 
-#define DEFINE_RCU_SCHED_SYNC(name)	\
-	__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+#define DEFINE_RCU_SCHED_SYNC(name, excl)	\
+	__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC, excl)
 
-#define DEFINE_RCU_BH_SYNC(name)	\
-	__DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+#define DEFINE_RCU_BH_SYNC(name, excl)	\
+	__DEFINE_RCU_SYNC(name, RCU_BH_SYNC, excl)
 
 #endif /* _LINUX_RCUSYNC_H_ */
 
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 915646c..014d2f4 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -18,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 		return -ENOMEM;
 
 	sem->state = readers_slow;
-	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC, true);
 	init_waitqueue_head(&sem->writer);
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 8835ad1..03ddc61 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -38,7 +38,8 @@ static const struct {
 enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
 enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
 
-#define	rss_lock	gp_wait.lock
+#define	rss_lock	gp_comp.wait.lock
+#define	gp_wait		gp_comp.wait
 
 #ifdef CONFIG_PROVE_RCU
 bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
@@ -49,10 +50,12 @@ bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
 EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
 #endif
 
-void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+void rcu_sync_init(struct rcu_sync_struct *rss,
+			enum rcu_sync_type type, bool excl)
 {
 	memset(rss, 0, sizeof(*rss));
-	init_waitqueue_head(&rss->gp_wait);
+	init_completion(&rss->gp_comp);
+	rss->exclusive = excl;
 	rss->gp_type = type;
 }
 
@@ -72,9 +75,13 @@ void rcu_sync_enter(struct rcu_sync_struct *rss)
 	if (need_sync) {
 		gp_ops[rss->gp_type].sync();
 		rss->gp_state = GP_PASSED;
-		wake_up_all(&rss->gp_wait);
+		if (!rss->exclusive)
+			wake_up_all(&rss->gp_wait);
 	} else if (need_wait) {
-		wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+		if (!rss->exclusive)
+			wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+		else
+			wait_for_completion(&rss->gp_comp);
 	} else {
 		/*
 		 * Possible when there's a pending CB from a rcu_sync_exit().
@@ -119,6 +126,12 @@ static void rcu_sync_func(struct rcu_head *rcu)
 	spin_unlock_irqrestore(&rss->rss_lock, flags);
 }
 
+static inline void __complete_locked(struct completion *x)
+{
+	x->done++;
+	__wake_up_locked(&x->wait, TASK_NORMAL, 1);
+}
+
 void rcu_sync_exit(struct rcu_sync_struct *rss)
 {
 	spin_lock_irq(&rss->rss_lock);
@@ -129,6 +142,8 @@ void rcu_sync_exit(struct rcu_sync_struct *rss)
 		} else if (rss->cb_state == CB_PENDING) {
 			rss->cb_state = CB_REPLAY;
 		}
+	} else if (rss->exclusive) {
+		__complete_locked(&rss->gp_comp);
 	}
 	spin_unlock_irq(&rss->rss_lock);
 }
-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers
  2015-06-28 23:56             ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
@ 2015-06-28 23:56               ` Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
  2 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, der.herr, dave, riel, viro, torvalds, linux-kernel

percpu_down_write() does down_write() to exclude both the readers and
other writers. We can rely on rcu_sync_enter() in exclusive mode and
take ->rw_sem right before wait_event().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/locking/percpu-rwsem.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 014d2f4..609c13b 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -139,8 +139,6 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
 
 void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
-	down_write(&sem->rw_sem);
-
 	/* Notify readers to take the slow path. */
 	rcu_sync_enter(&sem->rss);
 
@@ -158,6 +156,7 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	 * therefore will wait for them.
 	 */
 
+	down_write(&sem->rw_sem);
 	/* Wait for all now active readers to complete. */
 	wait_event(sem->writer, readers_active_check(sem));
 }
-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode
  2015-06-28 23:56             ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
  2015-06-28 23:56               ` [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers Oleg Nesterov
@ 2015-06-28 23:56               ` Oleg Nesterov
  2 siblings, 0 replies; 106+ messages in thread
From: Oleg Nesterov @ 2015-06-28 23:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, tj, mingo, der.herr, dave, riel, viro, torvalds, linux-kernel

Add percpu_rw_semaphore->recursive boolean. If it is true then the
recursive percpu_down_read() is safe, percpu_down_write() doesn't
exclude the new readers, like cpu_hotplug_begin().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 include/linux/percpu-rwsem.h  |   15 ++++++++++-----
 kernel/events/uprobes.c       |    2 +-
 kernel/locking/percpu-rwsem.c |   15 +++++++++++----
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 9202e73..9441abd 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,16 +13,18 @@ struct percpu_rw_semaphore {
 	int			state;
 	struct rcu_sync_struct	rss;
 	wait_queue_head_t	writer;
+	bool			recursive;
 	struct rw_semaphore	rw_sem;
 };
 
-#define DEFINE_STATIC_PERCPU_RWSEM(name)				\
+#define DEFINE_STATIC_PERCPU_RWSEM(name, rec)				\
 static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_refcount_##name);	\
 static struct percpu_rw_semaphore name = {				\
 	.refcount = &__percpu_rwsem_refcount_##name,			\
 	.state = 0,							\
 	.rss = __RCU_SYNC_INITIALIZER(name.rss, RCU_SCHED_SYNC, 1),	\
 	.writer = __WAIT_QUEUE_HEAD_INITIALIZER(name.writer),		\
+	.recursive = rec,						\
 	.rw_sem = __RWSEM_INITIALIZER(name.rw_sem),			\
 }
 
@@ -37,7 +39,10 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
 {
 	might_sleep();
 
-	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	if (sem->recursive)
+		rwlock_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+	else
+		rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
 
 	preempt_disable();
 	/*
@@ -97,14 +102,14 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
 
-extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
+extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, bool,
 				const char *, struct lock_class_key *);
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
-#define percpu_init_rwsem(sem)					\
+#define percpu_init_rwsem(sem, recursive)			\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
+	__percpu_init_rwsem(sem, recursive, #sem, &rwsem_key);	\
 })
 
 #endif
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f2..a4813a1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1985,7 +1985,7 @@ static int __init init_uprobes(void)
 	for (i = 0; i < UPROBES_HASH_SZ; i++)
 		mutex_init(&uprobes_mmap_mutex[i]);
 
-	if (percpu_init_rwsem(&dup_mmap_sem))
+	if (percpu_init_rwsem(&dup_mmap_sem, false))
 		return -ENOMEM;
 
 	return register_die_notifier(&uprobe_exception_nb);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 609c13b..3db7c45 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -10,7 +10,7 @@
 
 enum { readers_slow, readers_block };
 
-int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, bool recursive,
 			const char *name, struct lock_class_key *rwsem_key)
 {
 	sem->refcount = alloc_percpu(unsigned int);
@@ -20,6 +20,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 	sem->state = readers_slow;
 	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC, true);
 	init_waitqueue_head(&sem->writer);
+	sem->recursive = recursive;
 	__init_rwsem(&sem->rw_sem, name, rwsem_key);
 
 	return 0;
@@ -124,9 +125,15 @@ void __percpu_up_read(struct percpu_rw_semaphore *sem)
  */
 static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
-	if (per_cpu_sum(*sem->refcount) != 0)
+	if (sem->recursive && !down_write_trylock(&sem->rw_sem))
 		return false;
 
+	if (per_cpu_sum(*sem->refcount) != 0) {
+		if (sem->recursive)
+			up_write(&sem->rw_sem);
+		return false;
+	}
+
 	/*
 	 * If we observed the decrement; ensure we see the entire critical
 	 * section.
@@ -155,8 +162,8 @@ void percpu_down_write(struct percpu_rw_semaphore *sem)
 	 * then we are guaranteed to see their sem->refcount increment, and
 	 * therefore will wait for them.
 	 */
-
-	down_write(&sem->rw_sem);
+	if (!sem->recursive)
+		down_write(&sem->rw_sem);
 	/* Wait for all now active readers to complete. */
 	wait_event(sem->writer, readers_active_check(sem));
 }
-- 
1.5.5.1


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-26 16:14                                                 ` Paul E. McKenney
@ 2015-06-29  7:56                                                   ` Peter Zijlstra
  2015-06-30 21:32                                                     ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-29  7:56 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Fri, Jun 26, 2015 at 09:14:28AM -0700, Paul E. McKenney wrote:
> > To me it just makes more sense to have a single RCU state machine. With
> > expedited we'll push it as fast as we can, but no faster.
> 
> Suppose that someone invokes synchronize_sched_expedited(), but there
> is no normal grace period in flight.  Then each CPU will note its own
> quiescent state, but when it later might have tried to push it up the
> tree, it will see that there is no grace period in effect, and will
> therefore not bother.

Right, I did mention the force grace period machinery to make sure we
start one before poking :-)

> OK, we could have synchronize_sched_expedited() tell the grace-period
> kthread to start a grace period if one was not already in progress.

I had indeed forgotten that got farmed out to the kthread; on which, my
poor desktop seems to have spent ~140 minutes of its (most recent)
existence poking RCU things.

    7 root      20   0       0      0      0 S   0.0  0.0  56:34.66 rcu_sched
    8 root      20   0       0      0      0 S   0.0  0.0  20:58.19 rcuos/0
    9 root      20   0       0      0      0 S   0.0  0.0  18:50.75 rcuos/1
   10 root      20   0       0      0      0 S   0.0  0.0  18:30.62 rcuos/2
   11 root      20   0       0      0      0 S   0.0  0.0  17:33.24 rcuos/3
   12 root      20   0       0      0      0 S   0.0  0.0   2:43.54 rcuos/4
   13 root      20   0       0      0      0 S   0.0  0.0   3:00.31 rcuos/5
   14 root      20   0       0      0      0 S   0.0  0.0   3:09.27 rcuos/6
   15 root      20   0       0      0      0 S   0.0  0.0   2:52.98 rcuos/7

Which is almost as much time as my konsole:

 2853 peterz    20   0  586240 103664  41848 S   1.0  0.3 147:39.50 konsole

Which seems somewhat excessive. But who knows.

> OK, the grace-period kthread could tell synchronize_sched_expedited()
> when it has finished initializing the grace period, though this is
> starting to get a bit on the Rube Goldberg side.  But this -still- is
> not good enough, because even though the grace-period kthread has fully
> initialized the new grace period, the individual CPUs are unaware of it.

Right, so over the weekend -- I had postponed reading this rather long
email for I was knackered -- I had figured that because we trickle the
GP completion up, you probably equally trickle the GP start down of
sorts and there might be 'interesting' things there.

> And they will therefore continue to ignore any quiescent state that they
> encounter, because they cannot prove that it actually happened after
> the start of the current grace period.

Right, badness :-)

Although here I'll once again go ahead and say something ignorant; how
come that's a problem? Surely if we know the kthread thing has finished
starting a GP, any one CPU issuing a full memory barrier (as would be
implied by switching to the stop worker) must then indeed observe that
global state? due to that transitivity thing.

That is, I'm having a wee bit of bother seeing how you'd need
manipulation of global variables as you allude to below.

> But this -still- isn't good enough, because
> idle CPUs never will become aware of the new grace period -- by design,
> as they are supposed to be able to sleep through an arbitrary number of
> grace periods.

Yes, I'm sure. Waking up seems like a serializing experience though; but
I suppose that's not good enough if we wake up right before we force
start the GP.

> I feel like there is a much easier way, but cannot yet articulate it.
> I came across a couple of complications and a blind alley with it thus
> far, but it still looks promising.  I expect to be able to generate
> actual code for it within a few days, but right now it is just weird
> abstract shapes in my head.  (Sorry, if I knew how to describe them,
> I could just write the code!  When I do write the code, it will probably
> seem obvious and trivial, that being the usual outcome...)

Hehe, glad to have been of help :-)

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-25 19:17       ` Tejun Heo
@ 2015-06-29  9:32         ` Peter Zijlstra
  2015-06-29 15:12           ` Tejun Heo
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-29  9:32 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
	riel, viro, torvalds

On Thu, Jun 25, 2015 at 03:17:01PM -0400, Tejun Heo wrote:

> Hmmm... the only worry I have about this is people using it on u64 on
> 32bit machines.  CPU local ops can do split updates on lower and upper
> halves and the remotely-read value will be surprising.  We have the
> same issues w/ regular per_cpu accesses too, but the summing function /
> macro is better at giving the false sense of security.  Prolly
> limiting it up to ulong size is a good idea?

Agreed, luckily we already have the infrastructure for this, something
like so?

--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -287,6 +287,16 @@ do {									\
 	preempt_enable();						\
 } while (0)
 
+#define per_cpu_sum(var)						\
+({									\
+	typeof(var) __sum = 0;						\
+	int cpu;							\
+	compiletime_assert_atomic_type(__sum);				\
+	for_each_possible_cpu(cpu)					\
+		__sum += per_cpu(var, cpu);				\
+	__sum;								\
+})
+
 /*
  * Branching function to split up a function into a set of functions that
  * are called for different scalar sizes of the objects handled.
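
FWIW, a minimal usage sketch of the helper (the counter name below is
made up for illustration, it is not from any patch in this series):

static DEFINE_PER_CPU(unsigned int, nr_active_foo);	/* made-up counter */

static bool any_foo_active(void)
{
	/*
	 * Sums the per-cpu counters; the compiletime_assert_atomic_type()
	 * in per_cpu_sum() rejects types wider than a native word, so a
	 * u64 on 32-bit fails to build rather than silently tearing.
	 */
	return per_cpu_sum(nr_active_foo) != 0;
}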

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-29  9:32         ` Peter Zijlstra
@ 2015-06-29 15:12           ` Tejun Heo
  2015-06-29 15:14             ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Tejun Heo @ 2015-06-29 15:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
	riel, viro, torvalds

Hello, Peter.

On Mon, Jun 29, 2015 at 11:32:19AM +0200, Peter Zijlstra wrote:
> Agreed, luckily we already have the infrastructure for this, something
> like so?
> 
> --- a/include/linux/percpu-defs.h
> +++ b/include/linux/percpu-defs.h
> @@ -287,6 +287,16 @@ do {									\
>  	preempt_enable();						\
>  } while (0)
>  
> +#define per_cpu_sum(var)						\
> +({									\
> +	typeof(var) __sum = 0;						\
> +	int cpu;							\

Why not __cpu?

> +	compiletime_assert_atomic_type(__sum);				\
> +	for_each_possible_cpu(cpu)					\
> +		__sum += per_cpu(var, cpu);				\
> +	__sum;								\
> +})

But other than that, looks good to me.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact
  2015-06-29 15:12           ` Tejun Heo
@ 2015-06-29 15:14             ` Peter Zijlstra
  0 siblings, 0 replies; 106+ messages in thread
From: Peter Zijlstra @ 2015-06-29 15:14 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Nicholas Mc Guire, oleg, paulmck, mingo, linux-kernel, dave,
	riel, viro, torvalds

On Mon, Jun 29, 2015 at 11:12:20AM -0400, Tejun Heo wrote:
> Hello, Peter.
> 
> On Mon, Jun 29, 2015 at 11:32:19AM +0200, Peter Zijlstra wrote:
> > Agreed, luckily we already have the infrastructure for this, something
> > like so?
> > 
> > --- a/include/linux/percpu-defs.h
> > +++ b/include/linux/percpu-defs.h
> > @@ -287,6 +287,16 @@ do {									\
> >  	preempt_enable();						\
> >  } while (0)
> >  
> > +#define per_cpu_sum(var)						\
> > +({									\
> > +	typeof(var) __sum = 0;						\
> > +	int cpu;							\
> 
> Why not __cpu?

I've no idea, __cpu is indeed more consistent, consider it changed.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-29  7:56                                                   ` Peter Zijlstra
@ 2015-06-30 21:32                                                     ` Paul E. McKenney
  2015-07-01 11:56                                                       ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-06-30 21:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Mon, Jun 29, 2015 at 09:56:46AM +0200, Peter Zijlstra wrote:
> On Fri, Jun 26, 2015 at 09:14:28AM -0700, Paul E. McKenney wrote:
> > > To me it just makes more sense to have a single RCU state machine. With
> > > expedited we'll push it as fast as we can, but no faster.
> > 
> > Suppose that someone invokes synchronize_sched_expedited(), but there
> > is no normal grace period in flight.  Then each CPU will note its own
> > quiescent state, but when it later might have tried to push it up the
> > tree, it will see that there is no grace period in effect, and will
> > therefore not bother.
> 
> Right, I did mention the force grace period machinery to make sure we
> start one before poking :-)

Fair enough...

> > OK, we could have synchronize_sched_expedited() tell the grace-period
> > kthread to start a grace period if one was not already in progress.
> 
> I had indeed forgotten that got farmed out to the kthread; on which, my
> poor desktop seems to have spent ~140 minutes of its (most recent)
> existence poking RCU things.
> 
>     7 root      20   0       0      0      0 S   0.0  0.0  56:34.66 rcu_sched
>     8 root      20   0       0      0      0 S   0.0  0.0  20:58.19 rcuos/0
>     9 root      20   0       0      0      0 S   0.0  0.0  18:50.75 rcuos/1
>    10 root      20   0       0      0      0 S   0.0  0.0  18:30.62 rcuos/2
>    11 root      20   0       0      0      0 S   0.0  0.0  17:33.24 rcuos/3
>    12 root      20   0       0      0      0 S   0.0  0.0   2:43.54 rcuos/4
>    13 root      20   0       0      0      0 S   0.0  0.0   3:00.31 rcuos/5
>    14 root      20   0       0      0      0 S   0.0  0.0   3:09.27 rcuos/6
>    15 root      20   0       0      0      0 S   0.0  0.0   2:52.98 rcuos/7
> 
> Which is almost as much time as my konsole:
> 
>  2853 peterz    20   0  586240 103664  41848 S   1.0  0.3 147:39.50 konsole
> 
> Which seems somewhat excessive. But who knows.

No idea.  How long has that system been up?  What has it been doing?

The rcu_sched overhead is expected behavior if the system has run between
ten and one hundred million grace periods, give or take an order of
magnitude depending on the number of idle CPUs and so on.

The overhead for the RCU offload kthreads is what it is.  A kfree() takes
as much time as a kfree does, and they are all nicely counted up for you.

> > OK, the grace-period kthread could tell synchronize_sched_expedited()
> > when it has finished initializing the grace period, though this is
> > starting to get a bit on the Rube Goldberg side.  But this -still- is
> > not good enough, because even though the grace-period kthread has fully
> > initialized the new grace period, the individual CPUs are unaware of it.
> 
> Right, so over the weekend -- I had postponed reading this rather long
> email for I was knackered -- I had figured that because we trickle the
> GP completion up, you probably equally trickle the GP start down of
> sorts and there might be 'interesting' things there.

The GP completion trickles both up and down, though the down part shouldn't
matter in this case.

> > And they will therefore continue to ignore any quiescent state that they
> > encounter, because they cannot prove that it actually happened after
> > the start of the current grace period.
> 
> Right, badness :-)
> 
> Although here I'll once again go ahead and say something ignorant; how
> come that's a problem? Surely if we know the kthread thing has finished
> starting a GP, any one CPU issuing a full memory barrier (as would be
> implied by switching to the stop worker) must then indeed observe that
> global state? due to that transitivity thing.
> 
> > That is, I'm having a wee bit of bother seeing how you'd need
> > manipulation of global variables as you allude to below.

Well, I thought that you wanted to leverage the combining tree to
determine when the grace period had completed.  If a given CPU isn't
pushing its quiescent states up the combining tree, then the combining
tree can't do much for you.

> > But this -still- isn't good enough, because
> > idle CPUs never will become aware of the new grace period -- by design,
> > as they are supposed to be able to sleep through an arbitrary number of
> > grace periods.
> 
> Yes, I'm sure. Waking up seems like a serializing experience though; but
> I suppose that's not good enough if we wake up right before we force
> start the GP.

That would indeed be one of the problems that could occur.  ;-)

> > I feel like there is a much easier way, but cannot yet articulate it.
> > I came across a couple of complications and a blind alley with it thus
> > far, but it still looks promising.  I expect to be able to generate
> > actual code for it within a few days, but right now it is just weird
> > abstract shapes in my head.  (Sorry, if I knew how to describe them,
> > I could just write the code!  When I do write the code, it will probably
> > seem obvious and trivial, that being the usual outcome...)
> 
> Hehe, glad to have been of help :-)

Well, I do have something that seems reasonably straightforward.  Sending
the patches along separately.  Not sure that it is worth its weight.

The idea is that we keep the expedited grace periods working as they do
now, independently of the normal grace period.  The normal grace period
takes a sequence number just after initialization, and checks to see
if an expedited grace period happened in the meantime at the beginning
of each quiescent-state forcing episode.  This saves the last one or
two quiescent-state forcing scans in the case where an expedited grace
period really did happen.
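
To be concrete about the shape of the check (a hedged sketch only, not
the actual RCU code -- every name below is invented for illustration,
assuming <linux/atomic.h>):

/* Bumped once per completed expedited grace period. */
static atomic_long_t expedited_seq;

static void note_expedited_gp_done(void)
{
	atomic_long_inc(&expedited_seq);
}

/* Snapshot taken just after normal grace-period initialization. */
static long gp_exp_snap;

static void normal_gp_take_snapshot(void)
{
	gp_exp_snap = atomic_long_read(&expedited_seq);
}

/* Checked at the start of each quiescent-state forcing episode. */
static bool expedited_gp_happened_since_snapshot(void)
{
	return atomic_long_read(&expedited_seq) != gp_exp_snap;
}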

It is possible for the expedited grace period to help things along by
waking up the grace-period kthread, but of course doing this too much
further increases the time consumed by your rcu_sched kthread.  It is
possible to compromise by only doing the wakeup every so many grace
periods or only once in a given period of time, which is the approach
the last patch in the series takes.

I will be sending the series shortly, followed by a series for the
other portions of the expedited grace-period upgrade.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-06-24  9:18                 ` Daniel Wagner
@ 2015-07-01  5:57                   ` Daniel Wagner
  2015-07-01 21:54                     ` Linus Torvalds
  0 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-07-01  5:57 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra
  Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds, jlayton

Hi,

I did a sweep over the parameters for posix01. The parameters are number
of processes and number of locks taken per process. In contrast to the
other test, it looks like there is no parameter set which yields a nice
stable result (read: low variance). I have tried several things including
pinning down all processes to CPUs to avoid migration. The results
improved slightly but there was still a high variance.

Anyway I have collected some data and I'd like to share it. Maybe it is
still useful. All numbers here are without the above mentioned pinning.
There are some runs missing (don't know the reason yet) and I didn't let
it run till the end. So take these numbers with a pinch of salt.

The test script and raw data can be found here:

http://monom.org/posix01/

The tables read:
  nproc: number of processes started
  columns: number of locks taken per process

Hardware
  4x E5-4610; for this test all processes are scheduled on one socket

First the numbers for tip 4.1.0-02756-ge3d06bd.

nproc 8
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.075449     0.210547     0.340658     0.464083     0.590400
std       0.015550     0.024989     0.032080     0.043803     0.055003
min       0.021643     0.067456     0.211779     0.279643     0.327628
25%       0.065337     0.195664     0.318114     0.430040     0.546488
50%       0.075345     0.209411     0.338512     0.461397     0.591433
75%       0.084725     0.226517     0.364190     0.494638     0.626532
max       0.127050     0.281836     0.454558     0.607559     0.762149


nproc 16
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      1.023660     2.463384     3.891954     5.312716     6.752857
std       0.105065     0.124916     0.136476     0.172906     0.207449
min       0.351199     1.527379     3.106403     4.157478     5.519601
25%       0.961098     2.397597     3.807098     5.201875     6.633034
50%       1.031460     2.467317     3.895824     5.321227     6.757502
75%       1.093412     2.539284     3.985122     5.432336     6.889859
max       1.278603     2.785901     4.369434     5.798982     7.324263


nproc 24
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      3.460166     7.942193    11.898540    11.150066    11.060036
std       0.191564     0.232989     0.612868     0.680323     0.465967
min       2.748545     6.575510     9.977165     9.209685     8.937682
25%       3.325521     7.806847    11.440580    10.774070    10.912302
50%       3.493138     7.951859    11.852556    11.163595    11.074910
75%       3.596927     8.088036    12.443429    11.365197    11.243125
max       3.974884     8.589840    13.079780    16.341043    14.244954


nproc 32
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      6.797286    13.943421    14.373278    15.857103    20.047039
std       0.366013     0.417859     0.625967     0.377463     0.302939
min       3.323312    12.266006    12.492706    14.451931    17.496059
25%       6.649401    13.719397    14.186790    15.738348    19.958001
50%       6.868362    13.862458    14.312992    15.870438    20.083564
75%       6.995801    14.027167    14.429383    15.984881    20.215722
max       7.369007    15.631300    21.587450    19.364991    20.755793


nproc 40
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     11.156514    16.936808    18.930412    25.605206    32.334239
std       0.613158     0.614545     0.485336     0.344226     0.398747
min       5.609261    13.147398    16.930261    23.448985    28.992899
25%      10.999876    16.740775    18.788180    25.481274    32.188020
50%      11.251502    16.883100    18.946506    25.648879    32.369347
75%      11.439205    17.032133    19.105678    25.806715    32.565019
max      12.155905    24.116348    26.152117    26.502637    33.263763


nproc 48
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     16.523705    18.214558    27.877811    37.703763    47.655792
std       0.974732     1.118383     0.357481     0.435081     0.472945
min       7.909358    16.279568    25.989797    35.308061    45.279940
25%      16.385582    17.960832    27.729399    37.555420    47.458123
50%      16.692900    18.137635    27.920459    37.767064    47.679325
75%      16.927355    18.311502    28.092018    37.950782    47.926311
max      17.720374    35.810409    28.721941    38.746273    49.333097


nproc 56
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     11.567668    25.100333    38.603884    52.135564    65.716669
std       0.320771     0.369833     0.554834     0.534120     0.612844
min      10.123811    22.598875    35.668780    49.182148    62.504962
25%      11.394438    24.925338    38.389200    51.885988    65.441492
50%      11.593920    25.135043    38.641839    52.206010    65.771692
75%      11.789101    25.328558    38.895343    52.451819    66.068270
max      12.319346    25.948404    46.458428    53.605888    67.270679


nproc 64
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     15.421295    33.254418    51.073912    68.936111    86.919074
std       0.398493     0.411222     0.551629     0.690891     0.694183
min      13.269859    30.900978    48.174802    65.549282    83.099271
25%      15.203732    33.037478    50.821702    68.619365    86.579749
50%      15.467885    33.279869    51.130972    69.001664    86.953804
75%      15.694466    33.514712    51.380860    69.361632    87.341084
max      16.347321    34.475095    52.507292    70.884752    88.807083


nproc 72
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     19.762286    42.488827    65.167763    87.903430   110.666679
std       0.483660     0.480269     0.689872     0.828354     0.892759
min      15.506067    39.937453    61.196633    84.227403   107.014850
25%      19.519194    42.261548    64.834133    87.515837   110.225142
50%      19.809986    42.541263    65.265768    87.974049   110.747980
75%      20.083315    42.792858    65.603762    88.392599   111.223192
max      20.913434    43.830009    66.791452    90.184550   113.062344


nproc 80
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     24.782285    52.853068    80.902314   109.112294   137.441640
std       0.523731     0.639160     0.799033     0.952619     1.091478
min      20.126615    47.813274    77.357915   104.033857   131.978443
25%      24.498501    52.547855    80.509926   108.606293   136.877050
50%      24.835766    52.918841    80.950773   109.197236   137.498470
75%      25.137887    53.244013    81.376380   109.723791   138.101133
max      26.161997    54.372957    83.266046   111.709888   140.419400


nproc 88
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     30.196867    64.467080    98.710365   133.024282   167.330900
std       0.749476     0.691460     0.863908     1.033780     1.240237
min      16.647491    60.034797    94.053510   128.281171   161.778166
25%      29.896764    64.121607    98.290368   132.484092   166.711172
50%      30.271808    64.514222    98.742714   133.089852   167.429483
75%      30.627200    64.903154    99.262584   133.706735   168.086624
max      31.806051    66.343856   101.077264   136.143873   170.449596


nproc 96
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     36.304100    77.194851   117.958001   158.820159   199.868940
std       0.712442     0.718565     1.009163     1.220813     1.462219
min      31.128111    73.850226   112.075970   152.910227   192.977453
25%      35.928427    76.811233   117.466922   158.151278   199.058411
50%      36.378220    77.209148   117.998878   158.879704   199.861157
75%      36.761744    77.636286   118.615380   159.583272   200.701769
max      38.069263    79.445286   120.878239   162.826438   206.826424


nproc 104
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     42.731401    90.887253   138.815476   186.824953   235.055458
std       1.045572     0.742232     0.999065     1.298818     1.554890
min      23.734733    87.384048   133.462821   180.971966   227.475939
25%      42.353032    90.441055   138.213962   186.109237   234.169575
50%      42.861112    90.900274   138.836083   186.835884   235.084204
75%      43.236527    91.382487   139.460129   187.694247   236.011148
max      44.600281    93.394394   141.959512   190.171221   239.491909


nproc 112
              100         200         300         400
count  460.000000  460.000000  460.000000  460.000000
mean    49.782729  105.468739  161.416099  217.385757
std      0.904312    1.011980    1.222772    1.475225
min     45.334285  100.711113  156.087707  210.639527
25%     49.394518  104.971028  160.743875  216.590612
50%     49.906665  105.604756  161.528712  217.437408
75%     50.363428  106.088852  162.187166  218.286111
max     51.800116  108.372299  164.614385  221.788613


And now the same tests for tip+percpu_rwsem:

nproc 8
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      0.285784     0.639623     0.935062     1.165287     1.457565
std       0.040458     0.089317     0.112704     0.094596     0.110337
min       0.118961     0.253775     0.351943     0.869095     1.026194
25%       0.263250     0.600806     0.858630     1.100281     1.376566
50%       0.287019     0.649395     0.930437     1.167166     1.461235
75%       0.312601     0.692013     1.013786     1.228887     1.533511
max       0.407264     0.860837     1.298671     1.460842     1.927867


nproc 16
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      2.338683     5.219408     8.117279    11.050641    14.035433
std       0.146102     0.270400     0.392875     0.510692     0.576044
min       1.836110     4.179970     6.491748     8.998336    11.442838
25%       2.239374     5.042915     7.860587    10.728740    13.667630
50%       2.335801     5.217732     8.125243    11.052183    14.010561
75%       2.443152     5.404223     8.396037    11.404375    14.417740
max       2.798029     5.927344     9.172875    12.203548    15.444552


nproc 24
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      6.399927    13.673487    20.729554    27.316864    34.125202
std       0.558388     1.157996     1.647191     2.066864     2.487975
min       4.961608    10.767524    17.145018    22.441426    28.566438
25%       5.987118    12.849801    19.555979    25.943463    32.399122
50%       6.388215    13.583983    20.533054    27.122120    33.959403
75%       6.915310    14.786835    22.252796    29.187176    36.308254
max       7.405319    15.823960    23.858206    31.754922    38.997955


nproc 32
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     11.973832    24.885823    36.705614    48.036525    57.418669
std       1.270516     2.604583     3.963139     5.283237     6.441122
min       9.395066    19.958662    27.768684    38.247046    46.265231
25%      10.955417    22.708953    33.510437    43.613011    51.901209
50%      11.801515    24.556642    35.805816    47.315635    55.933447
75%      13.294692    27.520679    40.689642    53.139912    63.860584
max      14.217272    29.968337    44.409489    58.246754    71.045867


nproc 40
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     19.307414    39.204462    55.768040    70.808627    83.830246
std       2.189803     3.982241     5.467692     6.737372     8.124025
min      14.450258    30.606836    44.342114    55.520218    64.704178
25%      17.418113    35.968251    51.341042    65.352697    77.744806
50%      19.067713    39.023460    55.548934    70.282785    83.374667
75%      21.479466    42.666118    60.379906    76.604241    91.158904
max      23.687483    47.019928    67.143361    85.084045   100.957011


nproc 48
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     28.386773    55.462523    77.886706    92.579064   104.319703
std       3.231688     6.142373     8.633285    10.950222    12.510504
min      21.703659    42.486864    56.904221    66.605689    76.529646
25%      25.635256    50.575642    71.306694    82.931995    94.222776
50%      28.136694    55.235674    77.298409    91.993559   104.909015
75%      31.484979    60.645302    85.693462   102.195018   114.141212
max      35.713537    68.342796    96.065304   115.926497   130.916876


nproc 56
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     39.037206    74.470404    97.900979   111.320283   135.943281
std       4.594741     8.940246    11.715321    13.823450    16.032080
min      29.532559    55.193557    65.590273    79.580482    98.565733
25%      35.212004    66.990273    88.066459   100.643871   122.864654
50%      38.796902    73.928176    96.771490   110.669216   136.199617
75%      43.154846    82.041731   108.937264   120.727216   147.769269
max      49.215714    92.181542   125.188702   141.113117   170.961264


nproc 64
               100          200          300          400          500
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean     51.099012    93.028015   114.649700   145.944300   178.043572
std       6.310777    12.719401    14.675830    18.019135    21.084448
min      36.770938    54.620852    80.837116    98.765936   126.207980
25%      45.955694    84.078285   103.452854   132.127548   160.746493
50%      50.275929    93.031565   114.333533   144.951788   177.105994
75%      56.955477   104.656181   128.418118   163.865640   197.275452
max      63.369715   120.360706   146.542148   182.482159   218.814651


nproc 72
              100         200         300         400         500
count  506.000000  506.000000  506.000000  506.000000  506.000000
mean    64.905270  108.760098  138.811285  179.277895  222.584001
std      8.784532   16.293281   18.160401   21.203767   25.904456
min     43.035451   64.762288   96.401934  127.995159  162.341026
25%     58.658290   98.438247  126.035692  162.944645  202.228444
50%     64.756854  109.608197  139.190635  181.413255  223.359111
75%     72.488483  123.608470  152.745541  195.549278  245.454358
max     83.424516  139.214509  172.538610  218.677815  270.799895


nproc 80
             100         200         300         400         500
count  61.000000   61.000000   61.000000   61.000000   61.000000
mean   76.727789  124.438489  174.095378  225.855798  272.416390
std     9.757928   18.034325   20.216132   24.868596   29.384832
min    55.988043   83.842137  130.842940  173.596051  208.508169
25%    69.218268  116.679810  162.149179  207.015727  252.194955
50%    75.392969  125.378519  173.117425  225.071270  276.188038
75%    83.748328  136.689138  192.392097  245.019530  296.407232
max    97.004966  165.172805  206.391629  266.751069  318.089290


nproc 88
              100
count  157.000000
mean    90.337638
std     15.239911
min     53.393662
25%     79.648088
50%     91.075065
75%    103.530939
max    120.680507


And an attempt at visualization:

http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png


Let me know if these numbers help or not. I'm starting to get better at
running those tests, though they take quite some time to finish. So if
they are useless I'll sleep well without doing this :)

cheers,
daniel

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-06-30 21:32                                                     ` Paul E. McKenney
@ 2015-07-01 11:56                                                       ` Peter Zijlstra
  2015-07-01 15:56                                                         ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-07-01 11:56 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Tue, Jun 30, 2015 at 02:32:58PM -0700, Paul E. McKenney wrote:

> > I had indeed forgotten that got farmed out to the kthread; on which, my
> > poor desktop seems to have spent ~140 minutes of its (most recent)
> > existence poking RCU things.
> > 
> >     7 root      20   0       0      0      0 S   0.0  0.0  56:34.66 rcu_sched
> >     8 root      20   0       0      0      0 S   0.0  0.0  20:58.19 rcuos/0
> >     9 root      20   0       0      0      0 S   0.0  0.0  18:50.75 rcuos/1
> >    10 root      20   0       0      0      0 S   0.0  0.0  18:30.62 rcuos/2
> >    11 root      20   0       0      0      0 S   0.0  0.0  17:33.24 rcuos/3
> >    12 root      20   0       0      0      0 S   0.0  0.0   2:43.54 rcuos/4
> >    13 root      20   0       0      0      0 S   0.0  0.0   3:00.31 rcuos/5
> >    14 root      20   0       0      0      0 S   0.0  0.0   3:09.27 rcuos/6
> >    15 root      20   0       0      0      0 S   0.0  0.0   2:52.98 rcuos/7
> > 
> > Which is almost as much time as my konsole:
> > 
> >  2853 peterz    20   0  586240 103664  41848 S   1.0  0.3 147:39.50 konsole
> > 
> > Which seems somewhat excessive. But who knows.
> 
> No idea.  How long has that system been up?  What has it been doing?

Some 40-odd days it seems. It's my desktop; I read email (in mutt in
Konsole), I type patches (in vim in Konsole), I compile kernels (in
Konsole), etc.

Now konsole is threaded and each new window/tab is just another thread
in the same process so runtime should accumulate. However I just found
that for some obscure reason there are two konsole processes around, and
the other is the one that I'm using most; it also has significantly more
runtime.

 3264 ?        Sl   452:43          \_ /usr/bin/konsole

Must be some of that brain-damaged desktop shite that confused things --
I see the one is started with some -session argument. Some day I'll
discover how to destroy all that nonsense and make things behave as they
should.

> The rcu_sched overhead is expected behavior if the system has run between
> ten and one hundred million grace periods, give or take an order of
> magnitude depending on the number of idle CPUs and so on.
> 
> The overhead for the RCU offload kthreads is what it is.  A kfree() takes
> as much time as a kfree does, and they are all nicely counted up for you.

Yah, if only we could account it back to whomever caused it :/

> > Although here I'll once again go ahead and say something ignorant; how
> > come that's a problem? Surely if we know the kthread thing has finished
> > starting a GP, any one CPU issuing a full memory barrier (as would be
> > implied by switching to the stop worker) must then indeed observe that
> > global state? due to that transitivity thing.
> > 
> > That is, I'm having a wee bit of bother for seeing how you'd need
> > manipulation of global variables as you elude to below.
> 
> Well, I thought that you wanted to leverage the combining tree to
> determine when the grace period had completed.  If a given CPU isn't
> pushing its quiescent states up the combining tree, then the combining
> tree can't do much for you.

Right that is what I wanted, and sure the combining thing needs to
happen with atomics, but that's not new, it already does that.

What I was talking about was the interaction between the
force-quiescent-state machinery and the poking that detects that a QS
had indeed been started.

> Well, I do have something that seems reasonably straightforward.  Sending
> the patches along separately.  Not sure that it is worth its weight.
> 
> The idea is that we keep the expedited grace periods working as they do
> now, independently of the normal grace period.  The normal grace period
> takes a sequence number just after initialization, and checks to see
> if an expedited grace period happened in the meantime at the beginning
> of each quiescent-state forcing episode.  This saves the last one or
> two quiescent-state forcing scans if the case where an expedited grace
> period really did happen.
> 
> It is possible for the expedited grace period to help things along by
> waking up the grace-period kthread, but of course doing this too much
> further increases the time consumed by your rcu_sched kthread. 

Ah so that is the purpose of that patch. Still, I'm having trouble
seeing how you can do this too much, you would only be waking it if
there was a GP pending completion, right? At which point waking it is
the right thing.

If you wake it unconditionally, even if there's nothing to do, then yes
that'd be a waste of cycles.
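
(To make the "only wake it when there is something pending" idea concrete,
here is a minimal sketch -- purely illustrative, assuming the 4.1-era
struct rcu_state fields gpnum/completed/gp_wq; the helper name is made up:)

	/*
	 * Hypothetical: poke the grace-period kthread only when a grace
	 * period is actually in flight, i.e. gpnum has advanced past
	 * completed; otherwise the wakeup really would be wasted cycles.
	 */
	static void maybe_wake_gp_kthread(struct rcu_state *rsp)
	{
		if (ACCESS_ONCE(rsp->gpnum) != ACCESS_ONCE(rsp->completed))
			wake_up(&rsp->gp_wq);
	}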

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-07-01 11:56                                                       ` Peter Zijlstra
@ 2015-07-01 15:56                                                         ` Paul E. McKenney
  2015-07-01 16:16                                                           ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Paul E. McKenney @ 2015-07-01 15:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jul 01, 2015 at 01:56:42PM +0200, Peter Zijlstra wrote:
> On Tue, Jun 30, 2015 at 02:32:58PM -0700, Paul E. McKenney wrote:
> 
> > > I had indeed forgotten that got farmed out to the kthread; on which, my
> > > poor desktop seems to have spend ~140 minutes of its (most recent)
> > > existence poking RCU things.
> > > 
> > >     7 root      20   0       0      0      0 S   0.0  0.0  56:34.66 rcu_sched
> > >     8 root      20   0       0      0      0 S   0.0  0.0  20:58.19 rcuos/0
> > >     9 root      20   0       0      0      0 S   0.0  0.0  18:50.75 rcuos/1
> > >    10 root      20   0       0      0      0 S   0.0  0.0  18:30.62 rcuos/2
> > >    11 root      20   0       0      0      0 S   0.0  0.0  17:33.24 rcuos/3
> > >    12 root      20   0       0      0      0 S   0.0  0.0   2:43.54 rcuos/4
> > >    13 root      20   0       0      0      0 S   0.0  0.0   3:00.31 rcuos/5
> > >    14 root      20   0       0      0      0 S   0.0  0.0   3:09.27 rcuos/6
> > >    15 root      20   0       0      0      0 S   0.0  0.0   2:52.98 rcuos/7
> > > 
> > > Which is almost as much time as my konsole:
> > > 
> > >  2853 peterz    20   0  586240 103664  41848 S   1.0  0.3 147:39.50 konsole
> > > 
> > > Which seems somewhat excessive. But who knows.
> > 
> > No idea.  How long has that system been up?  What has it been doing?
> 
> Some 40-odd days, it seems. It's my desktop: I read email (in mutt in
> Konsole), I type patches (in vim in Konsole), I compile kernels (in
> Konsole), etc.
> 
> Now konsole is threaded and each new window/tab is just another thread
> in the same process so runtime should accumulate. However I just found
> that for some obscure reason there's two konsole processes around, and
> the other is the one that I'm using most, it also has significantly more
> runtime.
> 
>  3264 ?        Sl   452:43          \_ /usr/bin/konsole
> 
> Must be some of that brain-damaged desktop shite that confused things --
> I see the one is started with some -session argument. Some day I'll
> discover how to destroy all that nonsense and make things behave as they
> should.

Well, you appear to be using about 6% of a CPU, or 0.7% of the entire
8-CPU system for the RCU GP kthread.  That is more than I would like to
see consumed.

Odd that you have four of eight of the rcuos CPUs with higher consumption
than the others.  I would expect three of eight.  Are you by chance running
an eight-core system with hyperthreading disabled in hardware, via boot
parameter, or via explicit offline?  The real question I have is "is
nr_cpu_ids equal to 16 rather than to 8?"

A significant fraction of rcu_sched's CPU overhead is likely due to that
extra wakeup for the fourth leader rcuos kthread.

Also, do you have nohz_full set?  Just wondering why callback offloading
is enabled.  (If you want it enabled, fine, but from what I can see your
workload isn't being helped by it and it does have higher overhead.)

Even if you don't want offloading and do disable it, it would be good to
reduce the penalty.  Is there something I can do to reduce the overhead
of waking several kthreads?  Right now, I just do a series of wake_up()
calls, one for each leader rcuos kthread.

Oh, are you running v3.10 or some such?  If so, there are some more
recent RCU changes that can help with this.  They are called out here:

http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf

> > The rcu_sched overhead is expected behavior if the system has run between
> > ten and one hundred million grace periods, give or take an order of
> > magnitude depending on the number of idle CPUs and so on.
> > 
> > The overhead for the RCU offload kthreads is what it is.  A kfree() takes
> > as much time as a kfree does, and they are all nicely counted up for you.
> 
> Yah, if only we could account it back to whomever caused it :/

It could be done, but would require increasing the size of rcu_head.
And would require costly fine-grained timing of callback execution.
Not something for production systems, I would guess.
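
(For scale: rcu_head today is just two pointer-sized fields, so charging
callbacks back to their enqueuer means growing every single callback and
timing every invocation -- a hypothetical sketch of what that would look
like, not anything that exists:)

	struct rcu_head {
		struct rcu_head *next;			/* next callback */
		void (*func)(struct rcu_head *head);	/* e.g. the kfree wrapper */
	};

	/* hypothetical accounting variant */
	struct rcu_head_accounted {
		struct rcu_head rh;
		void *enqueuer;		/* who to charge the work back to */
		u64 exec_ns;		/* fine-grained timing of the callback */
	};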

> > > Although here I'll once again go ahead and say something ignorant; how
> > > come that's a problem? Surely if we know the kthread thing has finished
> > > starting a GP, any one CPU issuing a full memory barrier (as would be
> > > implied by switching to the stop worker) must then indeed observe that
> > > global state? due to that transitivity thing.
> > > 
> > > That is, I'm having a wee bit of bother for seeing how you'd need
> > > manipulation of global variables as you elude to below.
> > 
> > Well, I thought that you wanted to leverage the combining tree to
> > determine when the grace period had completed.  If a given CPU isn't
> > pushing its quiescent states up the combining tree, then the combining
> > tree can't do much for you.
> 
> Right that is what I wanted, and sure the combining thing needs to
> happen with atomics, but that's not new, it already does that.
> 
> What I was talking about was the interaction between the
> force-quiescent-state machinery and the poking that detects that a QS
> had indeed been started.

It gets worse.

Suppose that a grace period is already in progress.  You cannot leverage
its use of the combining tree because some of the CPUs might have already
indicated a quiescent state, which means that the current grace period
won't necessarily wait for all of the CPUs that the concurrent expedited
grace period needs to wait on.  So you need to kick the current grace
period, wait for it to complete, wait for the next one to start (with
all the fun and exciting issues called out earlier), do the expedited
grace period, then wait for completion.

> > Well, I do have something that seems reasonably straightforward.  Sending
> > the patches along separately.  Not sure that it is worth its weight.
> > 
> > The idea is that we keep the expedited grace periods working as they do
> > now, independently of the normal grace period.  The normal grace period
> > takes a sequence number just after initialization, and checks to see
> > if an expedited grace period happened in the meantime at the beginning
> > of each quiescent-state forcing episode.  This saves the last one or
> > two quiescent-state forcing scans if the case where an expedited grace
> > period really did happen.
> > 
> > It is possible for the expedited grace period to help things along by
> > waking up the grace-period kthread, but of course doing this too much
> > further increases the time consumed by your rcu_sched kthread. 
> 
> Ah so that is the purpose of that patch. Still, I'm having trouble
> seeing how you can do this too much, you would only be waking it if
> there was a GP pending completion, right? At which point waking it is
> the right thing.
> 
> If you wake it unconditionally, even if there's nothing to do, then yes
> that'd be a waste of cycles.

Heh!  You are already complaining about rcu_sched consuming 0.7%
of your system, and rightfully so.  Increasing this overhead still
further therefore cannot be considered a good thing unless there is some
overwhelming benefit.  And I am not seeing that benefit.  Perhaps due
to a failure of imagination, but until someone enlightens me, I have to
throttle the wakeups -- or, perhaps better, omit the wakeups entirely.

Actually, I am not convinced that I should push any of the patches that
leverage expedited grace periods to help out normal grace periods.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-07-01 15:56                                                         ` Paul E. McKenney
@ 2015-07-01 16:16                                                           ` Peter Zijlstra
  2015-07-01 18:45                                                             ` Paul E. McKenney
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-07-01 16:16 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jul 01, 2015 at 08:56:55AM -0700, Paul E. McKenney wrote:
> On Wed, Jul 01, 2015 at 01:56:42PM +0200, Peter Zijlstra wrote:
> Odd that you have four of eight of the rcuos CPUs with higher consumption
> than the others.  I would expect three of eight.  Are you by chance running
> an eight-core system with hyperthreading disabled in hardware, via boot
> parameter, or via explicit offline?  The real question I have is "is
> nr_cpu_ids equal to 16 rather than to 8?"

It should not be, but I'd have to instrument to be sure. It's a regular
4-core + HT part.

model name      : Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz

> Also, do you have nohz_full set?

Nope..

> Just wondering why callback offloading
> is enabled.  (If you want it enabled, fine, but from what I can see your
> workload isn't being helped by it and it does have higher overhead.)

I think this is a distro .config; every time I strip the desktop kernel
I end up needing a driver I hadn't built. Clearly I've not really paid
attention to the RCU options.

> Even if you don't want offloading and do disable it, it would be good to
> reduce the penalty.  Is there something I can do to reduce the overhead
> of waking several kthreads?  Right now, I just do a series of wake_up()
> calls, one for each leader rcuos kthread.
> 
> Oh, are you running v3.10 or some such?  If so, there are some more
> recent RCU changes that can help with this.  They are called out here:

Not that old, but not something recent either. I'll upgrade and see if
it goes away. I really detest rebooting the desktop, but it needs to
happen every so often.

> > Yah, if only we could account it back to whomever caused it :/
> 
> It could be done, but would require increasing the size of rcu_head.
> And would require costly fine-grained timing of callback execution.
> Not something for production systems, I would guess.

Nope :/ I know.

> > What I was talking about was the interaction between the
> > force-quiescent-state machinery and the poking that detects that a QS
> > had indeed been started.
> 
> It gets worse.
> 
> Suppose that a grace period is already in progress.  You cannot leverage
> its use of the combining tree because some of the CPUs might have already
> indicated a quiescent state, which means that the current grace period
> won't necessarily wait for all of the CPUs that the concurrent expedited
> grace period needs to wait on.  So you need to kick the current grace
> period, wait for it to complete, wait for the next one to start (with
> all the fun and exciting issues called out earlier), do the expedited
> grace period, then wait for completion.

Ah yes. You do find the fun cases :-)

> > If you wake it unconditionally, even if there's nothing to do, then yes
> > that'd be a waste of cycles.
> 
> Heh!  You are already complaining about rcu_sched consuming 0.7%
> of your system, and rightfully so.  Increasing this overhead still
> further therefore cannot be considered a good thing unless there is some
> overwhelming benefit.  And I am not seeing that benefit.  Perhaps due
> to a failure of imagination, but until someone enlightens me, I have to
> throttle the wakeups -- or, perhaps better, omit the wakeups entirely.
> 
> Actually, I am not convinced that I should push any of the patches that
> leverage expedited grace periods to help out normal grace periods.

It would seem a shame not to.. I've not yet had time to form a coherent
reply to that thread though.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 12/13] stop_machine: Remove lglock
  2015-07-01 16:16                                                           ` Peter Zijlstra
@ 2015-07-01 18:45                                                             ` Paul E. McKenney
  0 siblings, 0 replies; 106+ messages in thread
From: Paul E. McKenney @ 2015-07-01 18:45 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, tj, mingo, linux-kernel, der.herr, dave, riel,
	viro, torvalds

On Wed, Jul 01, 2015 at 06:16:40PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 01, 2015 at 08:56:55AM -0700, Paul E. McKenney wrote:
> > On Wed, Jul 01, 2015 at 01:56:42PM +0200, Peter Zijlstra wrote:
> > Odd that you have four of eight of the rcuos CPUs with higher consumption
> > than the others.  I would expect three of eight.  Are you by chance running
> > an eight-core system with hyperthreading disabled in hardware, via boot
> > parameter, or via explicit offline?  The real question I have is "is
> > nr_cpu_ids equal to 16 rather than to 8?"
> 
> It should not be, but I'd have to instrument to be sure. It's a regular
> 4-core + HT part.
> 
> model name      : Intel(R) Core(TM) i7-2600K CPU @ 3.40GHz

Well, if nr_cpu_ids is equal to 8, I likely need to recheck my math.

> > Also, do you have nohz_full set?
> 
> Nope..
> 
> > Just wondering why callback offloading
> > is enabled.  (If you want it enabled, fine, but from what I can see your
> > workload isn't being helped by it and it does have higher overhead.)
> 
> I think this is a distro .config; every time I strip the desktop kernel
> I end up needing a driver I hadn't built. Clearly I've not really paid
> attention to the RCU options.

OK, early versions of RHEL definitely would do what you have by default,
and I would need to check with Rik to find out what stuff got backported
when.

> > Even if you don't want offloading and do disable it, it would be good to
> > reduce the penalty.  Is there something I can do to reduce the overhead
> > of waking several kthreads?  Right now, I just do a series of wake_up()
> > calls, one for each leader rcuos kthread.
> > 
> > Oh, are you running v3.10 or some such?  If so, there are some more
> > recent RCU changes that can help with this.  They are called out here:
> 
> Not that old, but not something recent either. I'll upgrade and see if
> it goes away. I really detest rebooting the desktop, but it needs to
> happen every so often.

Feel free to send me the .config, the exact version, and any boot
parameters you have.  That would allow me to tell you whether or not
moving ahead would do you any good.

> > > Yah, if only we could account it back to whomever caused it :/
> > 
> > It could be done, but would require increasing the size of rcu_head.
> > And would require costly fine-grained timing of callback execution.
> > Not something for production systems, I would guess.
> 
> Nope :/ I know.
> 
> > > What I was talking about was the interaction between the
> > > force-quiescent-state machinery and the poking that detects that a QS
> > > had indeed been started.
> > 
> > It gets worse.
> > 
> > Suppose that a grace period is already in progress.  You cannot leverage
> > its use of the combining tree because some of the CPUs might have already
> > indicated a quiescent state, which means that the current grace period
> > won't necessarily wait for all of the CPUs that the concurrent expedited
> > grace period needs to wait on.  So you need to kick the current grace
> > period, wait for it to complete, wait for the next one to start (with
> > all the fun and exciting issues called out earlier), do the expedited
> > grace period, then wait for completion.
> 
> Ah yes. You do find the fun cases :-)

Given that I am RCU maintainer, I had better be able to.  A large
quantity of them rushed into my head when you suggested this, hence my
initial reaction.  That said, Oleg is probably better at finding fun
cases than am I.

> > > If you wake it unconditionally, even if there's nothing to do, then yes
> > > that'd be a waste of cycles.
> > 
> > Heh!  You are already complaining about rcu_sched consuming 0.7%
> > of your system, and rightfully so.  Increasing this overhead still
> > further therefore cannot be considered a good thing unless there is some
> > overwhelming benefit.  And I am not seeing that benefit.  Perhaps due
> > to a failure of imagination, but until someone enlightens me, I have to
> > throttle the wakeups -- or, perhaps better, omit the wakeups entirely.
> > 
> > Actually, I am not convinced that I should push any of the patches that
> > leverage expedited grace periods to help out normal grace periods.
> 
> It would seem a shame not to.. I've not yet had time to form a coherent
> reply to that thread though.

Well, it does increase complexity and coupling, and I don't see that
it provides big-animal benefits to justify this.  Again, might be just
insufficient imagination on my part, but...

							Thanx, Paul


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-07-01  5:57                   ` Daniel Wagner
@ 2015-07-01 21:54                     ` Linus Torvalds
  2015-07-02  9:41                       ` Peter Zijlstra
  0 siblings, 1 reply; 106+ messages in thread
From: Linus Torvalds @ 2015-07-01 21:54 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Ingo Molnar, Peter Zijlstra, Oleg Nesterov, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr,
	Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton

On Tue, Jun 30, 2015 at 10:57 PM, Daniel Wagner <wagi@monom.org> wrote:
>
> And an attempt at visualization:
>
> http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
> http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png

Ugh. The old numbers look (mostly) fairly tight, and then the new ones
are all over the map, and usually much worse.

We've seen this behavior before when switching from a non-sleeping
lock to a sleeping one. The sleeping locks have absolutely horrible
behavior when they get contended, and spend tons of CPU time on the
sleep/wakeup management, based on almost random timing noise. And it
can get orders of magnitude worse if there are any nested locks that
basically trigger trains of that kind of behavior.

In general, sleeping locks are just horribly horribly bad for things
that do small simple operations. Which is what fs/locks.c does.

I'm not convinced it's fixable. Maybe the new rwsem just isn't a good idea.

                      Linus

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-07-01 21:54                     ` Linus Torvalds
@ 2015-07-02  9:41                       ` Peter Zijlstra
  2015-07-20  5:53                         ` Daniel Wagner
  0 siblings, 1 reply; 106+ messages in thread
From: Peter Zijlstra @ 2015-07-02  9:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Daniel Wagner, Ingo Molnar, Oleg Nesterov, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr,
	Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton

On Wed, Jul 01, 2015 at 02:54:59PM -0700, Linus Torvalds wrote:
> On Tue, Jun 30, 2015 at 10:57 PM, Daniel Wagner <wagi@monom.org> wrote:
> >
> > And an attempt at visualization:
> >
> > http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
> > http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png
> 
> Ugh. The old numbers look (mostly) fairly tight, and then the new ones
> are all over the map, and usually much worse.
> 
> We've seen this behavior before when switching from a non-sleeping
> lock to a sleeping one. The sleeping locks have absolutely horrible
> behavior when they get contended, and spend tons of CPU time on the
> sleep/wakeup management, 

Right, I'm just not seeing how any of that would happen here :/ The read
side would only ever block on reading /proc/$something and I'm fairly
sure that benchmark doesn't actually touch that file.

In any case, I will look into this, I've just not had time yet..
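
(For reference, the shape of the fs/locks conversion being benchmarked is
roughly the following -- a simplified sketch with illustrative names and
initialisation elided, not the actual patch:)

	DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);			/* global part */
	static DEFINE_PER_CPU(spinlock_t, file_lock_lock);	/* local part */
	static DEFINE_PER_CPU(struct hlist_head, file_lock_list);

	/* hot path: add a lock to this CPU's list */
	static void locks_insert_global(struct file_lock *fl)
	{
		percpu_down_read(&file_rwsem);	/* per-cpu increment; only ever
						 * blocks while the writer below
						 * holds the sem */
		spin_lock(this_cpu_ptr(&file_lock_lock));
		hlist_add_head(&fl->fl_link, this_cpu_ptr(&file_lock_list));
		spin_unlock(this_cpu_ptr(&file_lock_lock));
		percpu_up_read(&file_rwsem);
	}

	/* cold path: /proc/locks wants a stable view of every CPU's list */
	static void locks_start_dump(void)
	{
		percpu_down_write(&file_rwsem);	/* waits for all readers */
	}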

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-07-02  9:41                       ` Peter Zijlstra
@ 2015-07-20  5:53                         ` Daniel Wagner
  2015-07-20 18:44                           ` Linus Torvalds
  0 siblings, 1 reply; 106+ messages in thread
From: Daniel Wagner @ 2015-07-20  5:53 UTC (permalink / raw)
  To: Peter Zijlstra, Linus Torvalds
  Cc: Ingo Molnar, Oleg Nesterov, Paul McKenney, Tejun Heo,
	Ingo Molnar, Linux Kernel Mailing List, der.herr,
	Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton

On 07/02/2015 11:41 AM, Peter Zijlstra wrote:
> On Wed, Jul 01, 2015 at 02:54:59PM -0700, Linus Torvalds wrote:
>> On Tue, Jun 30, 2015 at 10:57 PM, Daniel Wagner <wagi@monom.org> wrote:
>>>
>>> And an attempt at visualization:
>>>
>>> http://monom.org/posix01/sweep-4.1.0-02756-ge3d06bd.png
>>> http://monom.org/posix01/sweep-4.1.0-02769-g6ce2591.png
>>
>> Ugh. The old numbers look (mostly) fairly tight, and then the new ones
>> are all over the map, and usually much worse.
>>
>> We've seen this behavior before when switching from a non-sleeping
>> lock to a sleeping one. The sleeping locks have absolutely horrible
>> behavior when they get contended, and spend tons of CPU time on the
>> sleep/wakeup management, 
> 
> Right, I'm just not seeing how any of that would happen here :/ The read
> side would only ever block on reading /proc/$something and I'm fairly
> sure that benchmark doesn't actually touch that file.
> 
> In any case, I will look into this, I've just not had time yet..

I did some more testing and found out that the slow path of percpu_down_read()
is never taken (as expected). The only change left is the switch from
per-CPU arch_spinlock_t spinlocks to per-CPU spinlock_t spinlocks.

Turning them back into arch_spinlock_t gives almost the same numbers as
with spinlock_t.

Then Peter suggested to change the code to

	preempt_disable();
	spin_unlock();
	preempt_enable_no_resched();

to verify if arch_spin_lock() is buggy and does not disable preemption
and we see a lock holder preemption on non virt setups.
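
(Annotated, with the preempt count alongside -- assuming it is 1 while the
per-CPU spinlock is held:)

	preempt_disable();		/* count 1 -> 2 */
	spin_unlock();			/* drops the lock; its internal
					 * preempt_enable() goes 2 -> 1, so no
					 * resched check fires here */
	preempt_enable_no_resched();	/* count 1 -> 0, skipping the resched
					 * check entirely */

Mechanically, that releases the lock without opening a preemption point at
the unlock.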

Here all the numbers and plots:

- base line
  http://monom.org/posix01-4/tip-4.1.0-02756-ge3d06bd.png
  http://monom.org/posix01-4/tip-4.1.0-02756-ge3d06bd.txt  

- arch_spinlock_t
  http://monom.org/posix01-4/arch_spintlock_t-4.1.0-02769-g6ce2591-dirty.png
  http://monom.org/posix01-4/arch_spintlock_t-4.1.0-02769-g6ce2591-dirty.txt
  http://monom.org/posix01-4/arch_spintlock_t-4.1.0-02769-g6ce2591-dirty.patch

- no resched
  http://monom.org/posix01-4/no_resched-4.1.0-02770-g4d518cf.png
  http://monom.org/posix01-4/no_resched-4.1.0-02770-g4d518cf.txt
  http://monom.org/posix01-4/no_resched-4.1.0-02770-g4d518cf.patch

cheers,
daniel

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [RFC][PATCH 00/13] percpu rwsem -v2
  2015-07-20  5:53                         ` Daniel Wagner
@ 2015-07-20 18:44                           ` Linus Torvalds
  0 siblings, 0 replies; 106+ messages in thread
From: Linus Torvalds @ 2015-07-20 18:44 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Peter Zijlstra, Ingo Molnar, Oleg Nesterov, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr,
	Davidlohr Bueso, Rik van Riel, Al Viro, Jeff Layton

On Sun, Jul 19, 2015 at 10:53 PM, Daniel Wagner <wagi@monom.org> wrote:
>
> Turning them back into arch_spinlock_t gives almost the same numbers as
> with spinlock_t.
>
> Then Peter suggested to change the code to
>
>         preempt_disable();
>         spin_unlock();
>         preempt_enable_no_resched();
>
> to verify if arch_spin_lock() is buggy and does not disable preemption
> and we see a lock holder preemption on non virt setups.

Hmm. "arch_spin_lock()" isn't _supposed_ to disable preemption. The
caller should do that (possibly by disabling interrupts). See
include/linux/spinlock_api_smp.h for details.

But yes, that's a *very* subtle difference between "arch_spin_lock()"
and "spin_lock()". The former doesn't do lockdep or other debugging
and it doesn't disable preemption. So they are not interchangeable.

The current lglock code uses arch_spin_lock exactly because it does not
*want* lockdep tracking (it does its own) and because it does its own
preemption handling.

So saying "verify if arch_spin_lock() is buggy and does not disable
preemption" is complete BS. If arch_spin_lock() were to disable
preemption, _that_ would be a bug.
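
(For reference, the two paths side by side, simplified from the 4.1-era
include/linux/spinlock_api_smp.h and kernel/locking/lglock.c as I recall
them, debug details elided:)

	/* spin_lock(): preemption and lockdep are handled by the API itself */
	static inline void __raw_spin_lock(raw_spinlock_t *lock)
	{
		preempt_disable();
		spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
		LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
	}

	/* lg_local_lock(): a bare arch_spin_lock(), with lglock doing its own
	 * preempt_disable() and its own lockdep annotation */
	void lg_local_lock(struct lglock *lg)
	{
		arch_spinlock_t *lock;

		preempt_disable();
		lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
		lock = this_cpu_ptr(lg->lock);
		arch_spin_lock(lock);
	}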

                Linus

^ permalink raw reply	[flat|nested] 106+ messages in thread

end of thread, other threads:[~2015-07-20 18:44 UTC | newest]

Thread overview: 106+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-22 12:16 [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 01/13] rcu: Create rcu_sync infrastructure Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 02/13] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 03/13] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 04/13] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 05/13] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
2015-06-22 23:02   ` Oleg Nesterov
2015-06-23  7:28   ` Nicholas Mc Guire
2015-06-25 19:08     ` Peter Zijlstra
2015-06-25 19:17       ` Tejun Heo
2015-06-29  9:32         ` Peter Zijlstra
2015-06-29 15:12           ` Tejun Heo
2015-06-29 15:14             ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 06/13] percpu-rwsem: Provide percpu_down_read_trylock() Peter Zijlstra
2015-06-22 23:08   ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 07/13] sched: Reorder task_struct Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 08/13] percpu-rwsem: DEFINE_STATIC_PERCPU_RWSEM Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 09/13] hotplug: Replace hotplug lock with percpu-rwsem Peter Zijlstra
2015-06-22 22:57   ` Oleg Nesterov
2015-06-23  7:16     ` Peter Zijlstra
2015-06-23 17:01       ` Oleg Nesterov
2015-06-23 17:53         ` Peter Zijlstra
2015-06-24 13:50           ` Oleg Nesterov
2015-06-24 14:13             ` Peter Zijlstra
2015-06-24 15:12               ` Oleg Nesterov
2015-06-24 16:15                 ` Peter Zijlstra
2015-06-28 23:56             ` [PATCH 0/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2015-06-28 23:56               ` [PATCH 1/3] rcusync: introduce rcu_sync_struct->exclusive mode Oleg Nesterov
2015-06-28 23:56               ` [PATCH 2/3] percpu-rwsem: don't use percpu_rw_semaphore->rw_sem to exclude writers Oleg Nesterov
2015-06-28 23:56               ` [PATCH 3/3] percpu-rwsem: introduce percpu_rw_semaphore->recursive mode Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 10/13] fs/locks: Replace lg_global with a percpu-rwsem Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 11/13] fs/locks: Replace lg_local with a per-cpu spinlock Peter Zijlstra
2015-06-23  0:19   ` Oleg Nesterov
2015-06-22 12:16 ` [RFC][PATCH 12/13] stop_machine: Remove lglock Peter Zijlstra
2015-06-22 22:21   ` Oleg Nesterov
2015-06-23 10:09     ` Peter Zijlstra
2015-06-23 10:55       ` Peter Zijlstra
2015-06-23 11:20         ` Peter Zijlstra
2015-06-23 13:08           ` Peter Zijlstra
2015-06-23 16:36             ` Oleg Nesterov
2015-06-23 17:30             ` Paul E. McKenney
2015-06-23 18:04               ` Peter Zijlstra
2015-06-23 18:26                 ` Paul E. McKenney
2015-06-23 19:05                   ` Paul E. McKenney
2015-06-24  2:23                     ` Paul E. McKenney
2015-06-24  8:32                       ` Peter Zijlstra
2015-06-24  9:31                         ` Peter Zijlstra
2015-06-24 13:48                           ` Paul E. McKenney
2015-06-24 15:01                         ` Paul E. McKenney
2015-06-24 15:34                           ` Peter Zijlstra
2015-06-24  7:35                   ` Peter Zijlstra
2015-06-24  8:42                     ` Ingo Molnar
2015-06-24 13:39                       ` Paul E. McKenney
2015-06-24 13:43                         ` Ingo Molnar
2015-06-24 14:03                           ` Paul E. McKenney
2015-06-24 14:50                     ` Paul E. McKenney
2015-06-24 15:01                       ` Peter Zijlstra
2015-06-24 15:27                         ` Paul E. McKenney
2015-06-24 15:40                           ` Peter Zijlstra
2015-06-24 16:09                             ` Paul E. McKenney
2015-06-24 16:42                               ` Peter Zijlstra
2015-06-24 17:10                                 ` Paul E. McKenney
2015-06-24 17:20                                   ` Paul E. McKenney
2015-06-24 17:29                                     ` Peter Zijlstra
2015-06-24 17:28                                   ` Peter Zijlstra
2015-06-24 17:32                                     ` Peter Zijlstra
2015-06-24 18:14                                     ` Peter Zijlstra
2015-06-24 17:58                                   ` Peter Zijlstra
2015-06-25  3:23                                     ` Paul E. McKenney
2015-06-25 11:07                                       ` Peter Zijlstra
2015-06-25 13:47                                         ` Paul E. McKenney
2015-06-25 14:20                                           ` Peter Zijlstra
2015-06-25 14:51                                             ` Paul E. McKenney
2015-06-26 12:32                                               ` Peter Zijlstra
2015-06-26 16:14                                                 ` Paul E. McKenney
2015-06-29  7:56                                                   ` Peter Zijlstra
2015-06-30 21:32                                                     ` Paul E. McKenney
2015-07-01 11:56                                                       ` Peter Zijlstra
2015-07-01 15:56                                                         ` Paul E. McKenney
2015-07-01 16:16                                                           ` Peter Zijlstra
2015-07-01 18:45                                                             ` Paul E. McKenney
2015-06-23 14:39         ` Paul E. McKenney
2015-06-23 16:20       ` Oleg Nesterov
2015-06-23 17:24         ` Oleg Nesterov
2015-06-25 19:18           ` Peter Zijlstra
2015-06-22 12:16 ` [RFC][PATCH 13/13] locking: " Peter Zijlstra
2015-06-22 12:36 ` [RFC][PATCH 00/13] percpu rwsem -v2 Peter Zijlstra
2015-06-22 18:11 ` Daniel Wagner
2015-06-22 19:05   ` Peter Zijlstra
2015-06-23  9:35     ` Daniel Wagner
2015-06-23 10:00       ` Ingo Molnar
2015-06-23 14:34       ` Peter Zijlstra
2015-06-23 14:56         ` Daniel Wagner
2015-06-23 17:50           ` Peter Zijlstra
2015-06-23 19:36             ` Peter Zijlstra
2015-06-24  8:46               ` Ingo Molnar
2015-06-24  9:01                 ` Peter Zijlstra
2015-06-24  9:18                 ` Daniel Wagner
2015-07-01  5:57                   ` Daniel Wagner
2015-07-01 21:54                     ` Linus Torvalds
2015-07-02  9:41                       ` Peter Zijlstra
2015-07-20  5:53                         ` Daniel Wagner
2015-07-20 18:44                           ` Linus Torvalds
2015-06-22 20:06 ` Linus Torvalds
2015-06-23 16:10 ` Davidlohr Bueso
2015-06-23 16:21   ` Peter Zijlstra
