* [RFC][PATCH 0/5] Optimize percpu-rwsem
@ 2015-05-26 11:43 Peter Zijlstra
  2015-05-26 11:43 ` [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure Peter Zijlstra
                   ` (6 more replies)
  0 siblings, 7 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 11:43 UTC (permalink / raw)
  To: oleg, paulmck; +Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, torvalds

Hi all,

This is a derived work of the cpu hotplug lock rework I did in 2013 which never
really went anywhere because Linus didn't like it.

This applies those same optimizations to the percpu-rwsem. Seeing how we did
all the work it seemed a waste to not use it at all.





* [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
@ 2015-05-26 11:43 ` Peter Zijlstra
  2015-05-30 16:58   ` Paul E. McKenney
  2015-05-26 11:43 ` [RFC][PATCH 2/5] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 11:43 UTC (permalink / raw)
  To: oleg, paulmck; +Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, torvalds

[-- Attachment #1: oleg_nesterov-2_rcu-create_rcu_sync_infrastructure.patch --]
[-- Type: text/plain, Size: 5787 bytes --]

From: Oleg Nesterov <oleg@redhat.com>

It is functionally equivalent to

        struct rcu_sync_struct {
                atomic_t counter;
        };

        static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
        {
                return atomic_read(&rss->counter) == 0;
        }

        static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
        {
                atomic_inc(&rss->counter);
                synchronize_sched();
        }

        static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
        {
                synchronize_sched();
                atomic_dec(&rss->counter);
        }

except: it records the state and synchronize_sched() is only called by
rcu_sync_enter() and only if necessary.
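
For illustration only, here is a minimal usage sketch (reader()/writer()
and the fast/slow path helpers are hypothetical, not part of this patch):

        static DEFINE_RCU_SCHED_SYNC(my_sync);

        void reader(void)
        {
                preempt_disable();              /* rcu-sched read-side section */
                if (likely(rcu_sync_is_idle(&my_sync)))
                        do_fast_path();         /* no writer active or pending */
                else
                        do_slow_path();         /* writer around; take real locks */
                preempt_enable();
        }

        void writer(void)
        {
                rcu_sync_enter(&my_sync);       /* force readers onto the slow path */
                /* ... exclusive work ... */
                rcu_sync_exit(&my_sync);        /* fast path returns after a GP */
        }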

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |   64 ++++++++++++++++++++++++++++
 kernel/rcu/Makefile     |    2 
 kernel/rcu/sync.c       |  108 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+), 1 deletion(-)

--- /dev/null
+++ b/include/linux/rcusync.h
@@ -0,0 +1,64 @@
+#ifndef _LINUX_RCUSYNC_H_
+#define _LINUX_RCUSYNC_H_
+
+#include <linux/wait.h>
+#include <linux/rcupdate.h>
+
+struct rcu_sync_struct {
+	int			gp_state;
+	int			gp_count;
+	wait_queue_head_t	gp_wait;
+
+	int			cb_state;
+	struct rcu_head		cb_head;
+
+	void (*sync)(void);
+	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+};
+
+#define ___RCU_SYNC_INIT(name)						\
+	.gp_state = 0,							\
+	.gp_count = 0,							\
+	.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),		\
+	.cb_state = 0
+
+#define __RCU_SCHED_SYNC_INIT(name) {					\
+	___RCU_SYNC_INIT(name),						\
+	.sync = synchronize_sched,					\
+	.call = call_rcu_sched,						\
+}
+
+#define __RCU_BH_SYNC_INIT(name) {					\
+	___RCU_SYNC_INIT(name),						\
+	.sync = synchronize_rcu_bh,					\
+	.call = call_rcu_bh,						\
+}
+
+#define __RCU_SYNC_INIT(name) {						\
+	___RCU_SYNC_INIT(name),						\
+	.sync = synchronize_rcu,					\
+	.call = call_rcu,						\
+}
+
+#define DEFINE_RCU_SCHED_SYNC(name)					\
+	struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
+
+#define DEFINE_RCU_BH_SYNC(name)					\
+	struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
+
+#define DEFINE_RCU_SYNC(name)						\
+	struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
+
+static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+	return !rss->gp_state; /* GP_IDLE */
+}
+
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
+extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
+extern void rcu_sync_enter(struct rcu_sync_struct *);
+extern void rcu_sync_exit(struct rcu_sync_struct *);
+
+#endif /* _LINUX_RCUSYNC_H_ */
+
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,4 +1,4 @@
-obj-y += update.o
+obj-y += update.o sync.o
 obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_TREE_RCU) += tree.o
--- /dev/null
+++ b/kernel/rcu/sync.c
@@ -0,0 +1,108 @@
+
+#include <linux/rcusync.h>
+#include <linux/sched.h>
+
+enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
+enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+
+#define	rss_lock	gp_wait.lock
+
+void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
+{
+	memset(rss, 0, sizeof(*rss));
+	init_waitqueue_head(&rss->gp_wait);
+
+	switch (type) {
+	case RCU_SYNC:
+		rss->sync = synchronize_rcu;
+		rss->call = call_rcu;
+		break;
+
+	case RCU_SCHED_SYNC:
+		rss->sync = synchronize_sched;
+		rss->call = call_rcu_sched;
+		break;
+
+	case RCU_BH_SYNC:
+		rss->sync = synchronize_rcu_bh;
+		rss->call = call_rcu_bh;
+		break;
+	}
+}
+
+void rcu_sync_enter(struct rcu_sync_struct *rss)
+{
+	bool need_wait, need_sync;
+
+	spin_lock_irq(&rss->rss_lock);
+	need_wait = rss->gp_count++;
+	need_sync = rss->gp_state == GP_IDLE;
+	if (need_sync)
+		rss->gp_state = GP_PENDING;
+	spin_unlock_irq(&rss->rss_lock);
+
+	BUG_ON(need_wait && need_sync);
+
+	if (need_sync) {
+		rss->sync();
+		rss->gp_state = GP_PASSED;
+		wake_up_all(&rss->gp_wait);
+	} else if (need_wait) {
+		wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
+	} else {
+		/*
+		 * Possible when there's a pending CB from a rcu_sync_exit().
+		 * Nobody has yet been allowed the 'fast' path and thus we can
+		 * avoid doing any sync(). The callback will get 'dropped'.
+		 */
+		BUG_ON(rss->gp_state != GP_PASSED);
+	}
+}
+
+static void rcu_sync_func(struct rcu_head *rcu)
+{
+	struct rcu_sync_struct *rss =
+		container_of(rcu, struct rcu_sync_struct, cb_head);
+	unsigned long flags;
+
+
+	BUG_ON(rss->gp_state != GP_PASSED);
+	BUG_ON(rss->cb_state == CB_IDLE);
+
+	spin_lock_irqsave(&rss->rss_lock, flags);
+	if (rss->gp_count) {
+		/*
+		 * A new rcu_sync_enter() has happened; drop the callback.
+		 */
+		rss->cb_state = CB_IDLE;
+	} else if (rss->cb_state == CB_REPLAY) {
+		/*
+		 * A new rcu_sync_exit() has happened; requeue the callback
+		 * to catch a later GP.
+		 */
+		rss->cb_state = CB_PENDING;
+		rss->call(&rss->cb_head, rcu_sync_func);
+	} else {
+		/*
+		 * We're at least a GP after rcu_sync_exit(); everybody will now
+		 * have observed the write side critical section. Let 'em rip!
+		 */
+		rss->cb_state = CB_IDLE;
+		rss->gp_state = GP_IDLE;
+	}
+	spin_unlock_irqrestore(&rss->rss_lock, flags);
+}
+
+void rcu_sync_exit(struct rcu_sync_struct *rss)
+{
+	spin_lock_irq(&rss->rss_lock);
+	if (!--rss->gp_count) {
+		if (rss->cb_state == CB_IDLE) {
+			rss->cb_state = CB_PENDING;
+			rss->call(&rss->cb_head, rcu_sync_func);
+		} else if (rss->cb_state == CB_PENDING) {
+			rss->cb_state = CB_REPLAY;
+		}
+	}
+	spin_unlock_irq(&rss->rss_lock);
+}




* [RFC][PATCH 2/5] rcusync: Introduce struct rcu_sync_ops
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
  2015-05-26 11:43 ` [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure Peter Zijlstra
@ 2015-05-26 11:43 ` Peter Zijlstra
  2015-05-26 11:43 ` [RFC][PATCH 3/5] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 11:43 UTC (permalink / raw)
  To: oleg, paulmck; +Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, torvalds

[-- Attachment #1: oleg_nesterov-4_rcusync-introduce_struct_rcu_sync_ops.patch --]
[-- Type: text/plain, Size: 5014 bytes --]

From: Oleg Nesterov <oleg@redhat.com>

Add the new struct rcu_sync_ops which holds the sync/call methods, and
replace the function pointers in rcu_sync_struct with an enum
rcu_sync_type field that indexes a static array of these ops.

This simplifies the "init" helpers and makes it easier to add the new
methods we need, especially ones that have to be ifdef'ed.
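
As a rough sketch, the resulting dispatch looks like this (the helper
below is illustrative only; gp_ops[] is the static table added by this
patch):

        static void rcu_sync_call_sync(struct rcu_sync_struct *rss)
        {
                gp_ops[rss->gp_type].sync();    /* e.g. synchronize_sched() */
        }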

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |   60 ++++++++++++++++++------------------------------
 kernel/rcu/sync.c       |   43 +++++++++++++++++-----------------
 2 files changed, 45 insertions(+), 58 deletions(-)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -4,6 +4,8 @@
 #include <linux/wait.h>
 #include <linux/rcupdate.h>
 
+enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
+
 struct rcu_sync_struct {
 	int			gp_state;
 	int			gp_count;
@@ -12,53 +14,37 @@ struct rcu_sync_struct {
 	int			cb_state;
 	struct rcu_head		cb_head;
 
-	void (*sync)(void);
-	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+	enum rcu_sync_type	gp_type;
 };
 
-#define ___RCU_SYNC_INIT(name)						\
-	.gp_state = 0,							\
-	.gp_count = 0,							\
-	.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),		\
-	.cb_state = 0
-
-#define __RCU_SCHED_SYNC_INIT(name) {					\
-	___RCU_SYNC_INIT(name),						\
-	.sync = synchronize_sched,					\
-	.call = call_rcu_sched,						\
-}
-
-#define __RCU_BH_SYNC_INIT(name) {					\
-	___RCU_SYNC_INIT(name),						\
-	.sync = synchronize_rcu_bh,					\
-	.call = call_rcu_bh,						\
-}
-
-#define __RCU_SYNC_INIT(name) {						\
-	___RCU_SYNC_INIT(name),						\
-	.sync = synchronize_rcu,					\
-	.call = call_rcu,						\
-}
-
-#define DEFINE_RCU_SCHED_SYNC(name)					\
-	struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
-
-#define DEFINE_RCU_BH_SYNC(name)					\
-	struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
-
-#define DEFINE_RCU_SYNC(name)						\
-	struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
-
 static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
 {
 	return !rss->gp_state; /* GP_IDLE */
 }
 
-enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
-
 extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
 extern void rcu_sync_enter(struct rcu_sync_struct *);
 extern void rcu_sync_exit(struct rcu_sync_struct *);
 
+#define __RCU_SYNC_INITIALIZER(name, type) {				\
+		.gp_state = 0,						\
+		.gp_count = 0,						\
+		.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),	\
+		.cb_state = 0,						\
+		.gp_type = type,					\
+	}
+
+#define	__DEFINE_RCU_SYNC(name, type)	\
+	struct rcu_sync_struct name = __RCU_SYNC_INITIALIZER(name, type)
+
+#define DEFINE_RCU_SYNC(name)		\
+	__DEFINE_RCU_SYNC(name, RCU_SYNC)
+
+#define DEFINE_RCU_SCHED_SYNC(name)	\
+	__DEFINE_RCU_SYNC(name, RCU_SCHED_SYNC)
+
+#define DEFINE_RCU_BH_SYNC(name)	\
+	__DEFINE_RCU_SYNC(name, RCU_BH_SYNC)
+
 #endif /* _LINUX_RCUSYNC_H_ */
 
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,7 +1,24 @@
-
 #include <linux/rcusync.h>
 #include <linux/sched.h>
 
+static const struct {
+	void (*sync)(void);
+	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+} gp_ops[] = {
+	[RCU_SYNC] = {
+		.sync = synchronize_rcu,
+		.call = call_rcu,
+	},
+	[RCU_SCHED_SYNC] = {
+		.sync = synchronize_sched,
+		.call = call_rcu_sched,
+	},
+	[RCU_BH_SYNC] = {
+		.sync = synchronize_rcu_bh,
+		.call = call_rcu_bh,
+	},
+};
+
 enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
 enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
 
@@ -11,23 +28,7 @@ void rcu_sync_init(struct rcu_sync_struc
 {
 	memset(rss, 0, sizeof(*rss));
 	init_waitqueue_head(&rss->gp_wait);
-
-	switch (type) {
-	case RCU_SYNC:
-		rss->sync = synchronize_rcu;
-		rss->call = call_rcu;
-		break;
-
-	case RCU_SCHED_SYNC:
-		rss->sync = synchronize_sched;
-		rss->call = call_rcu_sched;
-		break;
-
-	case RCU_BH_SYNC:
-		rss->sync = synchronize_rcu_bh;
-		rss->call = call_rcu_bh;
-		break;
-	}
+	rss->gp_type = type;
 }
 
 void rcu_sync_enter(struct rcu_sync_struct *rss)
@@ -44,7 +45,7 @@ void rcu_sync_enter(struct rcu_sync_stru
 	BUG_ON(need_wait && need_sync);
 
 	if (need_sync) {
-		rss->sync();
+		gp_ops[rss->gp_type].sync();
 		rss->gp_state = GP_PASSED;
 		wake_up_all(&rss->gp_wait);
 	} else if (need_wait) {
@@ -81,7 +82,7 @@ static void rcu_sync_func(struct rcu_hea
 		 * to catch a later GP.
 		 */
 		rss->cb_state = CB_PENDING;
-		rss->call(&rss->cb_head, rcu_sync_func);
+		gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
 	} else {
 		/*
 		 * We're at least a GP after rcu_sync_exit(); everybody will now
@@ -99,7 +100,7 @@ void rcu_sync_exit(struct rcu_sync_struc
 	if (!--rss->gp_count) {
 		if (rss->cb_state == CB_IDLE) {
 			rss->cb_state = CB_PENDING;
-			rss->call(&rss->cb_head, rcu_sync_func);
+			gp_ops[rss->gp_type].call(&rss->cb_head, rcu_sync_func);
 		} else if (rss->cb_state == CB_PENDING) {
 			rss->cb_state = CB_REPLAY;
 		}




* [RFC][PATCH 3/5] rcusync: Add the CONFIG_PROVE_RCU checks
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
  2015-05-26 11:43 ` [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure Peter Zijlstra
  2015-05-26 11:43 ` [RFC][PATCH 2/5] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
@ 2015-05-26 11:43 ` Peter Zijlstra
  2015-05-26 11:44 ` [RFC][PATCH 4/5] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 11:43 UTC (permalink / raw)
  To: oleg, paulmck; +Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, torvalds

[-- Attachment #1: oleg_nesterov-5_rcusync-add_the_config_prove_rcu_checks.patch --]
[-- Type: text/plain, Size: 2350 bytes --]

From: Oleg Nesterov <oleg@redhat.com>

It would be nice to validate that the caller of rcu_sync_is_idle()
holds the corresponding type of RCU read-side lock. Add the new
rcu_sync_ops->held() method and change rcu_sync_is_idle() to
WARN() if it returns false.

This obviously penalizes the readers (fast-path), but only if
CONFIG_PROVE_RCU is enabled.
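
For illustration, a hypothetical reader of an RCU_SCHED_SYNC instance
(my_sync and the surrounding code are made up, not part of this patch):

        preempt_disable();                      /* rcu_read_lock_sched_held() */
        idle = rcu_sync_is_idle(&my_sync);      /* WARN()s without the above   */
        preempt_enable();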

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |    6 ++++++
 kernel/rcu/sync.c       |   21 +++++++++++++++++++++
 2 files changed, 27 insertions(+)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -17,9 +17,15 @@ struct rcu_sync_struct {
 	enum rcu_sync_type	gp_type;
 };
 
+extern bool __rcu_sync_is_idle(struct rcu_sync_struct *);
+
 static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
 {
+#ifdef CONFIG_PROVE_RCU
+	return __rcu_sync_is_idle(rss);
+#else
 	return !rss->gp_state; /* GP_IDLE */
+#endif
 }
 
 extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -1,21 +1,33 @@
 #include <linux/rcusync.h>
 #include <linux/sched.h>
 
+#ifdef CONFIG_PROVE_RCU
+#define __INIT_HELD(func)	.held = func,
+#else
+#define __INIT_HELD(func)
+#endif
+
 static const struct {
 	void (*sync)(void);
 	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+#ifdef CONFIG_PROVE_RCU
+	int  (*held)(void);
+#endif
 } gp_ops[] = {
 	[RCU_SYNC] = {
 		.sync = synchronize_rcu,
 		.call = call_rcu,
+		__INIT_HELD(rcu_read_lock_held)
 	},
 	[RCU_SCHED_SYNC] = {
 		.sync = synchronize_sched,
 		.call = call_rcu_sched,
+		__INIT_HELD(rcu_read_lock_sched_held)
 	},
 	[RCU_BH_SYNC] = {
 		.sync = synchronize_rcu_bh,
 		.call = call_rcu_bh,
+		__INIT_HELD(rcu_read_lock_bh_held)
 	},
 };
 
@@ -24,6 +36,15 @@ enum { CB_IDLE = 0, CB_PENDING, CB_REPLA
 
 #define	rss_lock	gp_wait.lock
 
+#ifdef CONFIG_PROVE_RCU
+bool __rcu_sync_is_idle(struct rcu_sync_struct *rss)
+{
+	WARN_ON(!gp_ops[rss->gp_type].held());
+	return rss->gp_state == GP_IDLE;
+}
+EXPORT_SYMBOL_GPL(__rcu_sync_is_idle);
+#endif
+
 void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
 {
 	memset(rss, 0, sizeof(*rss));




* [RFC][PATCH 4/5] rcusync: Introduce rcu_sync_dtor()
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-05-26 11:43 ` [RFC][PATCH 3/5] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
@ 2015-05-26 11:44 ` Peter Zijlstra
  2015-05-26 11:44 ` [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 11:44 UTC (permalink / raw)
  To: oleg, paulmck; +Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, torvalds

[-- Attachment #1: oleg_nesterov-6_rcusync-introduce_rcu_sync_dtor.patch --]
[-- Type: text/plain, Size: 2190 bytes --]

From: Oleg Nesterov <oleg@redhat.com>

Add the new rcu_sync_ops->wait() method and the new helper,
rcu_sync_dtor().

It is needed if you are going to, say, kfree(rcu_sync_object).
It simply calls ops->wait() to "flush" the potentially pending
rcu callback.
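
For example (a sketch; struct foo and foo_free() are hypothetical):

        struct foo {
                struct rcu_sync_struct  rss;
                /* ... */
        };

        void foo_free(struct foo *foo)
        {
                rcu_sync_dtor(&foo->rss);       /* flush a possibly pending cb_head */
                kfree(foo);                     /* now safe; no callback can still run */
        }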

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcusync.h |    1 +
 kernel/rcu/sync.c       |   22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

--- a/include/linux/rcusync.h
+++ b/include/linux/rcusync.h
@@ -31,6 +31,7 @@ static inline bool rcu_sync_is_idle(stru
 extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
 extern void rcu_sync_enter(struct rcu_sync_struct *);
 extern void rcu_sync_exit(struct rcu_sync_struct *);
+extern void rcu_sync_dtor(struct rcu_sync_struct *);
 
 #define __RCU_SYNC_INITIALIZER(name, type) {				\
 		.gp_state = 0,						\
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,6 +10,7 @@
 static const struct {
 	void (*sync)(void);
 	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
+	void (*wait)(void);
 #ifdef CONFIG_PROVE_RCU
 	int  (*held)(void);
 #endif
@@ -17,16 +18,19 @@ static const struct {
 	[RCU_SYNC] = {
 		.sync = synchronize_rcu,
 		.call = call_rcu,
+		.wait = rcu_barrier,
 		__INIT_HELD(rcu_read_lock_held)
 	},
 	[RCU_SCHED_SYNC] = {
 		.sync = synchronize_sched,
 		.call = call_rcu_sched,
+		.wait = rcu_barrier_sched,
 		__INIT_HELD(rcu_read_lock_sched_held)
 	},
 	[RCU_BH_SYNC] = {
 		.sync = synchronize_rcu_bh,
 		.call = call_rcu_bh,
+		.wait = rcu_barrier_bh,
 		__INIT_HELD(rcu_read_lock_bh_held)
 	},
 };
@@ -128,3 +132,21 @@ void rcu_sync_exit(struct rcu_sync_struc
 	}
 	spin_unlock_irq(&rss->rss_lock);
 }
+
+void rcu_sync_dtor(struct rcu_sync_struct *rss)
+{
+	int cb_state;
+
+	BUG_ON(rss->gp_count);
+
+	spin_lock_irq(&rss->rss_lock);
+	if (rss->cb_state == CB_REPLAY)
+		rss->cb_state = CB_PENDING;
+	cb_state = rss->cb_state;
+	spin_unlock_irq(&rss->rss_lock);
+
+	if (cb_state != CB_IDLE) {
+		gp_ops[rss->gp_type].wait();
+		BUG_ON(rss->cb_state != CB_IDLE);
+	}
+}




* [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-05-26 11:44 ` [RFC][PATCH 4/5] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
@ 2015-05-26 11:44 ` Peter Zijlstra
  2015-05-29 19:45   ` Oleg Nesterov
  2015-05-30 17:18   ` Paul E. McKenney
  2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
  2015-05-26 19:54 ` Davidlohr Bueso
  6 siblings, 2 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 11:44 UTC (permalink / raw)
  To: oleg, paulmck; +Cc: tj, mingo, linux-kernel, der.herr, peterz, dave, torvalds

[-- Attachment #1: peterz-zijlstra-opt-percpu-rwsem.patch --]
[-- Type: text/plain, Size: 13128 bytes --]

Currently the percpu-rwsem has two issues:

 - it switches to (global) atomic ops while a writer is waiting -- which
   could be quite a while -- and this slows down releasing the readers.

 - it employs synchronize_sched_expedited() _twice_ which is evil and
   should die -- it shoots IPIs around the machine.

This patch cures the first problem by ordering the reader-state vs
reader-count (see the comments in __percpu_down_read() and
percpu_down_write()). This changes a global atomic op into a full
memory barrier, which doesn't have the global cacheline contention.

It cures the second problem by employing Oleg's rcu-sync primitives,
which reduce this to no sync_sched() calls at all in the 'normal' case
of no write contention -- global locks had better be rare -- and to at
most one sync_sched() call in case of contention.
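
In short, the slow-path pairing looks like this (a simplified sketch of
the barriers documented in the code below):

  reader (percpu_down_read)           writer (percpu_down_write)
  ---------------------------         ---------------------------
  __this_cpu_inc(*sem->refcount);     sem->state = readers_block;
  smp_mb();  /* A */                  smp_mb();  /* D */
  read sem->state;                    wait for per_cpu_sum(*sem->refcount) == 0;

Either the reader sees readers_block and falls back to the rw_sem, or the
writer sees the reader's increment and waits for it; they cannot both miss
each other.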

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/percpu-rwsem.h  |   62 +++++++++-
 kernel/locking/percpu-rwsem.c |  238 +++++++++++++++++++++---------------------
 2 files changed, 176 insertions(+), 124 deletions(-)

--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -5,18 +5,64 @@
 #include <linux/rwsem.h>
 #include <linux/percpu.h>
 #include <linux/wait.h>
+#include <linux/rcusync.h>
 #include <linux/lockdep.h>
 
 struct percpu_rw_semaphore {
-	unsigned int __percpu	*fast_read_ctr;
-	atomic_t		write_ctr;
+	unsigned int __percpu	*refcount;
+	int			state;
+	struct rcu_sync_struct	rss;
+	wait_queue_head_t	writer;
 	struct rw_semaphore	rw_sem;
-	atomic_t		slow_read_ctr;
-	wait_queue_head_t	write_waitq;
 };
 
-extern void percpu_down_read(struct percpu_rw_semaphore *);
-extern void percpu_up_read(struct percpu_rw_semaphore *);
+extern void __percpu_down_read(struct percpu_rw_semaphore *);
+extern void __percpu_up_read(struct percpu_rw_semaphore *);
+
+static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
+{
+	might_sleep();
+
+	rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+
+	preempt_disable();
+	/*
+	 * We are in an RCU-sched read-side critical section, so the writer
+	 * cannot both change sem->state from readers_fast and start
+	 * checking counters while we are here. So if we see !sem->state,
+	 * we know that the writer won't be checking until we are past the
+	 * preempt_enable() and that once the synchronize_sched() is done, the
+	 * writer will see anything we did within this RCU-sched read-side
+	 * critical section.
+	 */
+	__this_cpu_inc(*sem->refcount);
+	if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+		__percpu_down_read(sem); /* Unconditional memory barrier. */
+	preempt_enable();
+	/*
+	 * The barrier() from preempt_enable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+}
+
+static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	/*
+	 * The barrier() in preempt_disable() prevents the compiler from
+	 * bleeding the critical section out.
+	 */
+	preempt_disable();
+	/*
+	 * Same as in percpu_down_read().
+	 */
+	if (likely(rcu_sync_is_idle(&sem->rss)))
+		__this_cpu_dec(*sem->refcount);
+	else
+		__percpu_up_read(sem); /* Unconditional memory barrier. */
+	preempt_enable();
+
+	rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
+}
 
 extern void percpu_down_write(struct percpu_rw_semaphore *);
 extern void percpu_up_write(struct percpu_rw_semaphore *);
@@ -25,10 +71,10 @@ extern int __percpu_init_rwsem(struct pe
 				const char *, struct lock_class_key *);
 extern void percpu_free_rwsem(struct percpu_rw_semaphore *);
 
-#define percpu_init_rwsem(brw)	\
+#define percpu_init_rwsem(sem)					\
 ({								\
 	static struct lock_class_key rwsem_key;			\
-	__percpu_init_rwsem(brw, #brw, &rwsem_key);		\
+	__percpu_init_rwsem(sem, #sem, &rwsem_key);		\
 })
 
 #endif
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,158 +8,164 @@
 #include <linux/sched.h>
 #include <linux/errno.h>
 
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+enum { readers_slow, readers_block };
+
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
 			const char *name, struct lock_class_key *rwsem_key)
 {
-	brw->fast_read_ctr = alloc_percpu(int);
-	if (unlikely(!brw->fast_read_ctr))
+	sem->refcount = alloc_percpu(unsigned int);
+	if (unlikely(!sem->refcount))
 		return -ENOMEM;
 
-	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
-	__init_rwsem(&brw->rw_sem, name, rwsem_key);
-	atomic_set(&brw->write_ctr, 0);
-	atomic_set(&brw->slow_read_ctr, 0);
-	init_waitqueue_head(&brw->write_waitq);
+	sem->state = readers_slow;
+	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+	init_waitqueue_head(&sem->writer);
+	__init_rwsem(&sem->rw_sem, name, rwsem_key);
+
 	return 0;
 }
 
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
 {
-	free_percpu(brw->fast_read_ctr);
-	brw->fast_read_ctr = NULL; /* catch use after free bugs */
+	rcu_sync_dtor(&sem->rss);
+	free_percpu(sem->refcount);
+	sem->refcount = NULL; /* catch use after free bugs */
 }
 
-/*
- * This is the fast-path for down_read/up_read, it only needs to ensure
- * there is no pending writer (atomic_read(write_ctr) == 0) and inc/dec the
- * fast per-cpu counter. The writer uses synchronize_sched_expedited() to
- * serialize with the preempt-disabled section below.
- *
- * The nontrivial part is that we should guarantee acquire/release semantics
- * in case when
- *
- *	R_W: down_write() comes after up_read(), the writer should see all
- *	     changes done by the reader
- * or
- *	W_R: down_read() comes after up_write(), the reader should see all
- *	     changes done by the writer
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- *
- * But if it succeeds we do not have any barriers, atomic_read(write_ctr) or
- * __this_cpu_add() below can be reordered with any LOAD/STORE done by the
- * reader inside the critical section. See the comments in down_write and
- * up_write below.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
+void __percpu_down_read(struct percpu_rw_semaphore *sem)
 {
-	bool success = false;
+	/*
+	 * Due to having preemption disabled the decrement happens on
+	 * the same CPU as the increment, avoiding the
+	 * increment-on-one-CPU-and-decrement-on-another problem.
+	 *
+	 * And yes, if the reader misses the writer's assignment of
+	 * readers_block to sem->state, then the writer is
+	 * guaranteed to see the reader's increment.  Conversely, any
+	 * readers that increment their sem->refcount after the
+	 * writer looks are guaranteed to see the readers_block value,
+	 * which in turn means that they are guaranteed to immediately
+	 * decrement their sem->refcount, so that it doesn't matter
+	 * that the writer missed them.
+	 */
+
+	smp_mb(); /* A matches D */
+
+	if (likely(sem->state != readers_block))
+		return;
+
+	/*
+	 * Per the above comment; we still have preemption disabled and
+	 * will thus decrement on the same CPU as we incremented.
+	 */
+	__percpu_up_read(sem);
+
+	/*
+	 * We either call schedule() in the wait, or we'll fall through
+	 * and reschedule on the preempt_enable() in percpu_down_read().
+	 */
+	preempt_enable_no_resched();
+
+	/*
+	 * Avoid lockdep for the down/up_read(); we already have them.
+	 */
+	__down_read(&sem->rw_sem);
+	__this_cpu_inc(*sem->refcount);
+	__up_read(&sem->rw_sem);
 
 	preempt_disable();
-	if (likely(!atomic_read(&brw->write_ctr))) {
-		__this_cpu_add(*brw->fast_read_ctr, val);
-		success = true;
-	}
-	preempt_enable();
+}
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+	smp_mb(); /* B matches C */
+	/*
+	 * In other words, if they see our decrement (presumably to aggregate
+	 * zero, as that is the only time it matters) they will also see our
+	 * critical section.
+	 */
+	this_cpu_dec(*sem->refcount);
 
-	return success;
+	/* Prod writer to recheck readers_active */
+	wake_up(&sem->writer);
 }
 
+
+#define per_cpu_sum(var)						\
+({									\
+	typeof(var) __sum = 0;						\
+	int cpu;							\
+	for_each_possible_cpu(cpu)					\
+		__sum += per_cpu(var, cpu);				\
+	__sum;								\
+})
+
 /*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
+ * Return true if the modular sum of the sem->refcount per-CPU variable is
+ * zero.  If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
  */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
 {
-	might_sleep();
-	if (likely(update_fast_ctr(brw, +1))) {
-		rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
-		return;
-	}
+	if (per_cpu_sum(*sem->refcount) != 0)
+		return false;
+
+	/*
+	 * If we observed the decrement; ensure we see the entire critical
+	 * section.
+	 */
 
-	down_read(&brw->rw_sem);
-	atomic_inc(&brw->slow_read_ctr);
-	/* avoid up_read()->rwsem_release() */
-	__up_read(&brw->rw_sem);
+	smp_mb(); /* C matches B */
+
+	return true;
 }
 
-void percpu_up_read(struct percpu_rw_semaphore *brw)
+void percpu_down_write(struct percpu_rw_semaphore *sem)
 {
-	rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
+	down_write(&sem->rw_sem);
 
-	if (likely(update_fast_ctr(brw, -1)))
-		return;
+	/* Notify readers to take the slow path. */
+	rcu_sync_enter(&sem->rss);
 
-	/* false-positive is possible but harmless */
-	if (atomic_dec_and_test(&brw->slow_read_ctr))
-		wake_up_all(&brw->write_waitq);
-}
+	/*
+	 * Notify new readers to block; up until now, and thus throughout the
+	 * longish rcu_sync_enter() above, new readers could still come in.
+	 */
+	sem->state = readers_block;
 
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
-	unsigned int sum = 0;
-	int cpu;
+	smp_mb(); /* D matches A */
 
-	for_each_possible_cpu(cpu) {
-		sum += per_cpu(*brw->fast_read_ctr, cpu);
-		per_cpu(*brw->fast_read_ctr, cpu) = 0;
-	}
+	/*
+	 * If they don't see our store of readers_block to sem->state,
+	 * then we are guaranteed to see their sem->refcount increment, and
+	 * therefore will wait for them.
+	 */
 
-	return sum;
+	/* Wait for all now active readers to complete. */
+	wait_event(sem->writer, readers_active_check(sem));
 }
 
-/*
- * A writer increments ->write_ctr to force the readers to switch to the
- * slow mode, note the atomic_read() check in update_fast_ctr().
- *
- * After that the readers can only inc/dec the slow ->slow_read_ctr counter,
- * ->fast_read_ctr is stable. Once the writer moves its sum into the slow
- * counter it represents the number of active readers.
- *
- * Finally the writer takes ->rw_sem for writing and blocks the new readers,
- * then waits until the slow counter becomes zero.
- */
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
 {
-	/* tell update_fast_ctr() there is a pending writer */
-	atomic_inc(&brw->write_ctr);
 	/*
-	 * 1. Ensures that write_ctr != 0 is visible to any down_read/up_read
-	 *    so that update_fast_ctr() can't succeed.
-	 *
-	 * 2. Ensures we see the result of every previous this_cpu_add() in
-	 *    update_fast_ctr().
+	 * Signal the writer is done, no fast path yet.
 	 *
-	 * 3. Ensures that if any reader has exited its critical section via
-	 *    fast-path, it executes a full memory barrier before we return.
-	 *    See R_W case in the comment above update_fast_ctr().
+	 * One reason that we cannot just immediately flip to readers_fast is
+	 * that new readers might fail to see the results of this writer's
+	 * critical section.
 	 */
-	synchronize_sched_expedited();
-
-	/* exclude other writers, and block the new readers completely */
-	down_write(&brw->rw_sem);
+	sem->state = readers_slow;
 
-	/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
-	atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
-
-	/* wait for all readers to complete their percpu_up_read() */
-	wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
-}
+	/*
+	 * Release the write lock, this will allow readers back in the game.
+	 */
+	up_write(&sem->rw_sem);
 
-void percpu_up_write(struct percpu_rw_semaphore *brw)
-{
-	/* release the lock, but the readers can't use the fast-path */
-	up_write(&brw->rw_sem);
 	/*
-	 * Insert the barrier before the next fast-path in down_read,
-	 * see W_R case in the comment above update_fast_ctr().
+	 * Once this completes (at least one RCU grace period hence) the reader
+	 * fast path will be available again. Safe to use outside the exclusive
+	 * write lock because it is counting.
 	 */
-	synchronize_sched_expedited();
-	/* the last writer unblocks update_fast_ctr() */
-	atomic_dec(&brw->write_ctr);
+	rcu_sync_exit(&sem->rss);
 }




* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
                   ` (4 preceding siblings ...)
  2015-05-26 11:44 ` [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2015-05-26 18:12 ` Linus Torvalds
  2015-05-26 18:34   ` Peter Zijlstra
                     ` (3 more replies)
  2015-05-26 19:54 ` Davidlohr Bueso
  6 siblings, 4 replies; 36+ messages in thread
From: Linus Torvalds @ 2015-05-26 18:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Oleg Nesterov, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On Tue, May 26, 2015 at 4:43 AM, Peter Zijlstra <peterz@infradead.org> wrote:
>
> This is a derived work of the cpu hotplug lock rework I did in 2013 which never
> really went anywhere because Linus didn't like it.
>
> This applies those same optimizations to the percpu-rwsem. Seeing how we did
> all the work it seemed a waste to not use it at all.

So I *still* don't like it.

We literally have one single percpu-rwsem IN THE WHOLE KERNEL TREE.

One.

Not "a couple". Not "hundreds". ONE.

And that single use that you are optimizing for isn't even explained.
It's not really even clear that that thing is needed: fork() already
takes the mmap_sem for writing, and I'm not at all convinced that the
uprobes code couldn't just use mmap_sem for every mm it traverses,
rather than use this global percpu lock that we use absolutely nowhere
else.

So I'm not convinced that the right thing to do is to try to optimize
that lock at ALL. I'd rather get rid of it entirely.

Is there some new use that I don't know about? Have people really
looked at that uprobes code deeply? OF COURSE global locks will have
problems, I'm not at all convinced that "let's make that global lock
really complicated and clever" is the proper solution.

For example, the write lock is only ever taken when registering an
uprobe. Not exactly likely to be timing-critical. Is there some reason
why we couldn't get rid of the magical per-cpu rwsem entirely, and
replace it with something truly simple like a hashed rwsem, where the
readers (fork), will just read-lock one hashed rwsem, and the writer
will just write-lock every hashed rwsem.

The hash might be "pick the currently executing CPU at the time of
fork(), modulo 16". That one doesn't even need to disable preemption,
since it doesn't actually matter that the CPU stays constant, it's
just a trivial heuristic to avoid cacheline pingpongs under very
heavy fork() load.

So we'd have

    #define NR_UPROBE_RWSEM 16

    // One cacheline per rwsem.
    struct cache_line_aligned_rw_semaphore uprobe_rwsem[NR_UPROBE_RWSEM];

and fork() would basically do

    int idx = raw_smp_processor_id() & (NR_UPROBE_RWSEM-1);
    struct rw_sem *usem = &uprobe_rwsem[idx].rwsem;
     ..
    down_read(usem);
    ... do the dup_mmap() here ..
    up_read(usem);

and the uprobe register thing could just do

    for (i = 0; i < NR_UPROBE_RWSEM; i++)
        down_write(&uprobe_rwsem[i].rwsem);
    ...
    for (i = 0; ... unlock them all ..)

which doesn't look any slower than what we do now (I suspect the
fork() part would be faster), and is much simpler, and would get rid
of the percpu-rwsem entirely.

Hmm?

                  Linus


* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
@ 2015-05-26 18:34   ` Peter Zijlstra
  2015-05-26 18:35   ` Tejun Heo
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-26 18:34 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Oleg Nesterov, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On Tue, May 26, 2015 at 11:12:04AM -0700, Linus Torvalds wrote:
> Is there some new use that I don't know about?

TJ wants to use it in cgroups.

 lkml.kernel.org/r/1431549318-16756-3-git-send-email-tj@kernel.org 


* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
  2015-05-26 18:34   ` Peter Zijlstra
@ 2015-05-26 18:35   ` Tejun Heo
  2015-05-26 18:42   ` Davidlohr Bueso
  2015-05-26 18:57   ` Oleg Nesterov
  3 siblings, 0 replies; 36+ messages in thread
From: Tejun Heo @ 2015-05-26 18:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Oleg Nesterov, Paul McKenney, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

Hello, Linus.

On Tue, May 26, 2015 at 11:12:04AM -0700, Linus Torvalds wrote:
...
> Is there some new use that I don't know about? Have people really
> looked at that uprobes code deeply? OF COURSE global locks will have
> problems, I'm not at all convinced that "let's make that global lock
> really complicated and clever" is the proper solution.

I've posted a patchset to make threadgroup rwsem cgroup-specific (it
is used only by cgroups) and replace the per-threadgroup rwsems with a
global percpu_rwsem.  This is primarily to allow migrations of multiple
tasks to either succeed or fail atomically.  We already try to do
multiple task migrations in some corner cases and the unified
hierarchy will make wider use of it.  Currently, the implementation is
broken as there's no easy way of reverting changes if the operation
fails in the middle.

Given that task migrations aren't particularly high frequency
operations, a global percpu_rwsem is likely to be workable while being
less tedious and slightly less costly for the usual fork/exit paths.

That said, from cgroup's pov, write-locking per-threadgroup rwsems of
all target tasks works too although we'd need an outer mutex to
serialize down-write attempts to avoid locking order issues.

Thanks.

-- 
tejun


* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
  2015-05-26 18:34   ` Peter Zijlstra
  2015-05-26 18:35   ` Tejun Heo
@ 2015-05-26 18:42   ` Davidlohr Bueso
  2015-05-26 21:57     ` Linus Torvalds
  2015-05-27  6:53     ` Peter Zijlstra
  2015-05-26 18:57   ` Oleg Nesterov
  3 siblings, 2 replies; 36+ messages in thread
From: Davidlohr Bueso @ 2015-05-26 18:42 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Oleg Nesterov, Paul McKenney, Tejun Heo,
	Ingo Molnar, Linux Kernel Mailing List, der.herr

On Tue, 2015-05-26 at 11:12 -0700, Linus Torvalds wrote:
> On Tue, May 26, 2015 at 4:43 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > This is a derived work of the cpu hotplug lock rework I did in 2013 which never
> > really went anywhere because Linus didn't like it.
> >
> > This applies those same optimizations to the percpu-rwsem. Seeing how we did
> > all the work it seemed a waste to not use it at all.
> 
> So I *still* don't like it.
> 
> We literally have one single percpu-rwsem IN THE WHOLE KERNEL TREE.
> 
> One.
> 
> Not "a couple". Not "hundreds". ONE.

Instead of dropping percpu-rwsem, I was thinking we could instead look
for opportunities to convert new users, for instance shinkers, where the
write lock is also taken just for register and unregister purposes,
similar to uprobes.

Thanks,
Davidlohr



* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
                     ` (2 preceding siblings ...)
  2015-05-26 18:42   ` Davidlohr Bueso
@ 2015-05-26 18:57   ` Oleg Nesterov
  2015-05-26 19:13     ` Oleg Nesterov
  2015-05-26 19:29     ` Oleg Nesterov
  3 siblings, 2 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-26 18:57 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On 05/26, Linus Torvalds wrote:
>
> On Tue, May 26, 2015 at 4:43 AM, Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > This is a derived work of the cpu hotplug lock rework I did in 2013 which never
> > really went anywhere because Linus didn't like it.
> >
> > This applies those same optimizations to the percpu-rwsem. Seeing how we did
> > all the work it seemed a waste to not use it at all.
>
> So I *still* don't like it.
>
> We literally have one single percpu-rwsem IN THE WHOLE KERNEL TREE.
>
> One.

Well. IIRC Tejun is going to turn signal_struct->group_rwsem into
percpu-rwsem.

And it can have more users. Say, __sb_start_write/etc  does something
similar, and last time I checked this code it looked buggy to me.

> And that single use that you are optimizing for isn't even explained.
> It's not really even clear that that thing is needed: fork() already
> takes the mmap_sem for writing, and I'm not at all convinced that the
> uprobes code couldn't just use mmap_sem for every mm it traverses,

I don't think we can do this... At least I do not see how.

register_for_each_vma() would need to lock all ->mmap_sem's in system
to avoid the race with fork().

Oleg.



* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:57   ` Oleg Nesterov
@ 2015-05-26 19:13     ` Oleg Nesterov
  2015-05-26 19:29     ` Oleg Nesterov
  1 sibling, 0 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-26 19:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On 05/26, Oleg Nesterov wrote:
>
> On 05/26, Linus Torvalds wrote:
> >
> >
> > We literally have one single percpu-rwsem IN THE WHOLE KERNEL TREE.
> >
> > One.
>
> Well. IIRC Tejun is going to turn signal_struct->group_rwsem into
> percpu-rwsem.
>
> And it can have more users. Say, __sb_start_write/etc  does something
> similar, and last time I checked this code it looked buggy to me.

I have found my old email, see below. Perhaps this code was changed
since 2013 when I sent this email, I didn't verify... but in any
case this logic doesn't look simple, imo it would be nice to rely
on the generic helpers from kernel/locking.

Oleg.

------------------------------------------------------------------------------
When I look at __sb_start_write/etc I am not sure this locking
is correct. OK, __sb_start_write() does:

	percpu_counter_inc();

	mb();

	if (sb->s_writers.frozen)
		abort_and_retry;

freeze_super() does STORE + mb + LOAD in reverse order so either
__sb_start_write() must see SB_FREEZE_WRITE or freeze_super() must
see the change in ->s_writers.counter. This is correct.

Still I am not sure sb_wait_write() can trust percpu_counter_sum(),
because it can also see _other_ changes.

To simplify the discussion, suppose that percpu_counter doesn't have
lock/count/batch/whatever and  inc/dec/sum only uses "__percpu *counters".
Lets denote sb->s_writers.counter[level] as CTR[cpu].

Suppose that some thread did __sb_start_write() on CPU_1 and sleeps
"forever". CTR[0] == 0, CTR[1] == 1, freeze_super() should block.

Now:

	1. freeze_super() sets SB_FREEZE_WRITE, does mb(), and
	   starts sb_wait_write()->percpu_counter_sum().

	2. __percpu_counter_sum() does for_each_online_cpu(),
	   reads CTR[0] == 0. ret = 0.

	3. Another thread comes, calls __sb_start_write() on CPU_0,
	   increments CTR[0].

	   Then it notices sb->s_writers.frozen >= level and starts
	   __sb_end_write() before retry.

	   Then it migrates to CPU_1. And decrements CTR[1] before
	   __percpu_counter_sum() reads it.

	   So CTR[0] == 1, CTR[1] == 0. Everything is fine except
	   sb_wait_write() has already read CTR[0].

	 4. __percpu_counter_sum() continues, reads CTR[1] == 0
	    and returns ret == 0.

sb_wait_write() returns while it should not?



* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:57   ` Oleg Nesterov
  2015-05-26 19:13     ` Oleg Nesterov
@ 2015-05-26 19:29     ` Oleg Nesterov
  1 sibling, 0 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-26 19:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

Sorry for noise, forgot to mention...

On 05/26, Oleg Nesterov wrote:
>
> And it can have more users.

I can be wrong and I am obviously biased... but it seems to me that
even rcu_sync alone can have more users in the future. This interface
looks natural and simple.

Oleg.



* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
                   ` (5 preceding siblings ...)
  2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
@ 2015-05-26 19:54 ` Davidlohr Bueso
  6 siblings, 0 replies; 36+ messages in thread
From: Davidlohr Bueso @ 2015-05-26 19:54 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: oleg, paulmck, tj, mingo, linux-kernel, der.herr, torvalds

While at it, I'll go ahead and add this triviality.

From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 26 May 2015 12:51:15 -0700
Subject: [PATCH] doc,locking: Move percpu-rwsem doc into proper subdir

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 Documentation/locking/percpu-rw-semaphore.txt | 27 +++++++++++++++++++++++++++
 Documentation/percpu-rw-semaphore.txt         | 27 ---------------------------
 2 files changed, 27 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/locking/percpu-rw-semaphore.txt
 delete mode 100644 Documentation/percpu-rw-semaphore.txt

diff --git a/Documentation/locking/percpu-rw-semaphore.txt b/Documentation/locking/percpu-rw-semaphore.txt
new file mode 100644
index 0000000..7d3c824
--- /dev/null
+++ b/Documentation/locking/percpu-rw-semaphore.txt
@@ -0,0 +1,27 @@
+Percpu rw semaphores
+--------------------
+
+Percpu rw semaphores is a new read-write semaphore design that is
+optimized for locking for reading.
+
+The problem with traditional read-write semaphores is that when multiple
+cores take the lock for reading, the cache line containing the semaphore
+is bouncing between L1 caches of the cores, causing performance
+degradation.
+
+Locking for reading is very fast, it uses RCU and it avoids any atomic
+instruction in the lock and unlock path. On the other hand, locking for
+writing is very expensive, it calls synchronize_rcu() that can take
+hundreds of milliseconds.
+
+The lock is declared with "struct percpu_rw_semaphore" type.
+The lock is initialized percpu_init_rwsem, it returns 0 on success and
+-ENOMEM on allocation failure.
+The lock must be freed with percpu_free_rwsem to avoid memory leak.
+
+The lock is locked for read with percpu_down_read, percpu_up_read and
+for write with percpu_down_write, percpu_up_write.
+
+The idea of using RCU for optimized rw-lock was introduced by
+Eric Dumazet <eric.dumazet@gmail.com>.
+The code was written by Mikulas Patocka <mpatocka@redhat.com>
diff --git a/Documentation/percpu-rw-semaphore.txt b/Documentation/percpu-rw-semaphore.txt
deleted file mode 100644
index 7d3c824..0000000
--- a/Documentation/percpu-rw-semaphore.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-Percpu rw semaphores
---------------------
-
-Percpu rw semaphores is a new read-write semaphore design that is
-optimized for locking for reading.
-
-The problem with traditional read-write semaphores is that when multiple
-cores take the lock for reading, the cache line containing the semaphore
-is bouncing between L1 caches of the cores, causing performance
-degradation.
-
-Locking for reading is very fast, it uses RCU and it avoids any atomic
-instruction in the lock and unlock path. On the other hand, locking for
-writing is very expensive, it calls synchronize_rcu() that can take
-hundreds of milliseconds.
-
-The lock is declared with "struct percpu_rw_semaphore" type.
-The lock is initialized percpu_init_rwsem, it returns 0 on success and
--ENOMEM on allocation failure.
-The lock must be freed with percpu_free_rwsem to avoid memory leak.
-
-The lock is locked for read with percpu_down_read, percpu_up_read and
-for write with percpu_down_write, percpu_up_write.
-
-The idea of using RCU for optimized rw-lock was introduced by
-Eric Dumazet <eric.dumazet@gmail.com>.
-The code was written by Mikulas Patocka <mpatocka@redhat.com>
-- 
2.1.4





* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:42   ` Davidlohr Bueso
@ 2015-05-26 21:57     ` Linus Torvalds
  2015-05-27  9:28       ` Nicholas Mc Guire
  2015-06-05  1:45       ` Al Viro
  2015-05-27  6:53     ` Peter Zijlstra
  1 sibling, 2 replies; 36+ messages in thread
From: Linus Torvalds @ 2015-05-26 21:57 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Peter Zijlstra, Oleg Nesterov, Paul McKenney, Tejun Heo,
	Ingo Molnar, Linux Kernel Mailing List, der.herr

On Tue, May 26, 2015 at 11:42 AM, Davidlohr Bueso <dave@stgolabs.net> wrote:
>
> Instead of dropping percpu-rwsem, I was thinking we could instead look
> for opportunities to convert new users, for instance shinkers, where the
> write lock is also taken just for register and unregister purposes,
> similar to uprobes.

So if there really are useful use cases for this, I don't object to
the patch. It seems to just improve on a currently very low-usage
locking primitive.

And it's not like I conceptually mind the notion of a percpu rwsem, I
just hate seeing specialty locking that isn't really worth it.

Because as it is, with the current single use, I don't think it's even
worth improving on.

I _would_ ask that people who are looking at this also look at our
"lglock" thing. It's pretty much *exactly* the same thing, except for
spinlocks, and that one too has exactly two users (the documentation
states that the only user is stop_machine, but in fact file locking
does too).

Because that is another example of a complete failure of a locking
primitive that was just too specialized to be worth it.

                       Linus


* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 18:42   ` Davidlohr Bueso
  2015-05-26 21:57     ` Linus Torvalds
@ 2015-05-27  6:53     ` Peter Zijlstra
  1 sibling, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-05-27  6:53 UTC (permalink / raw)
  To: Davidlohr Bueso
  Cc: Linus Torvalds, Oleg Nesterov, Paul McKenney, Tejun Heo,
	Ingo Molnar, Linux Kernel Mailing List, der.herr

On Tue, May 26, 2015 at 11:42:11AM -0700, Davidlohr Bueso wrote:

> Instead of dropping percpu-rwsem, I was thinking we could instead look
> for opportunities to convert new users, for instance shrinkers, where the
> write lock is also taken just for register and unregister purposes,
> similar to uprobes.

Another 'obvious' one would be the cpu hotplug lock. That's basically
the same locking primitive with just a wee little difference.



* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 21:57     ` Linus Torvalds
@ 2015-05-27  9:28       ` Nicholas Mc Guire
  2015-06-05  1:45       ` Al Viro
  1 sibling, 0 replies; 36+ messages in thread
From: Nicholas Mc Guire @ 2015-05-27  9:28 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Davidlohr Bueso, Peter Zijlstra, Oleg Nesterov, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List

On Tue, 26 May 2015, Linus Torvalds wrote:

> On Tue, May 26, 2015 at 11:42 AM, Davidlohr Bueso <dave@stgolabs.net> wrote:
> >
> > Instead of dropping percpu-rwsem, I was thinking we could instead look
> > for opportunities to convert new users, for instance shrinkers, where the
> > write lock is also taken just for register and unregister purposes,
> > similar to uprobes.
> 
> So if there really are useful use cases for this, I don't object to
> the patch. It seems to just improve on a currently very low-usage
> locking primitive.
> 
> And it's not like I conceptually mind the notion of a percpu rwsem, I
> just hate seeing specialty locking that isn't really worth it.
> 
> Because as it is, with the current single use, I don't think it's even
> worth improving on.
> 
> I _would_ ask that people who are looking at this also look at our
> "lglock" thing. It's pretty much *exactly* the same thing, except for
> spinlocks, and that one too has exactly two users (the documentation
> states that the only user is stop_machine, but in fact file locking
> does too).
>
not sure where this would be missing:
Documentation/locking/lglock.txt
"Users: currently only the VFS and stop_machine related code"

I at least did not find any other users as of 3.18, and in 4.0-rc5 this
still seems valid.

thx!
hofrat
 


* Re: [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-26 11:44 ` [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
@ 2015-05-29 19:45   ` Oleg Nesterov
  2015-05-29 20:09     ` Oleg Nesterov
  2015-05-30 17:18   ` Paul E. McKenney
  1 sibling, 1 reply; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-29 19:45 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, torvalds

Sorry for delay, finally I found the time to read this series...
The code matches our previous discussion and I believe it is correct.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>


Just one nit below,

On 05/26, Peter Zijlstra wrote:
>
>  struct percpu_rw_semaphore {
> -	unsigned int __percpu	*fast_read_ctr;
> -	atomic_t		write_ctr;
> +	unsigned int __percpu	*refcount;
> +	int			state;

....

> +enum { readers_slow, readers_block };

Now that we rely on rss_sync() and thus we do not have "readers_fast",
I think that "bool reader_should_block" will look better.

> +void percpu_down_write(struct percpu_rw_semaphore *sem)
>  {
...

so it does

	rcu_sync_enter(&sem->rss);

	state = BLOCK;

	mb();

	wait_event(sem->writer, readers_active_check(sem));

and this looks correct.

The only nontrivial thing we need to ensure is that
per_cpu_sum(*sem->refcount) == 0 can't be a false positive. A false
negative is fine.

And this means that if we see the result of this_cpu_dec() we must
not miss the result of the previous this_cpu_inc(), on the same or
_another_ CPU.

And this is true because if the reader does dec() on another CPU
it does up_read() and this is only possible if down_read() didn't
see state == BLOCK.

But if it didn't see state == BLOCK then the writer must see the
result of the previous down_read()->inc().

IOW, we just rely on STORE-MB-LOAD, except that the writer does the
LOAD multiple times in per_cpu_sum():

DOWN_WRITE:			DOWN_READ on CPU X:

state = BLOCK;			refcount[X]++;

mb();				mb();

sum = 0;			if (state != BLOCK)
sum += refcount[0];			return;	 /* success */
sum += refcount[1];		
...				refcount[X]--;
sum += refcount[NR_CPUS];


If the reader wins and takes the lock, then its addition to
refcount[X] must be accounted by the writer.

The writer can obviously miss dec() from the reader, but we rely
on wake_up/wait_event in this case.

Oleg.



* Re: [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-29 19:45   ` Oleg Nesterov
@ 2015-05-29 20:09     ` Oleg Nesterov
  2015-05-29 20:41       ` Linus Torvalds
  0 siblings, 1 reply; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-29 20:09 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: paulmck, tj, mingo, linux-kernel, der.herr, dave, torvalds

On 05/29, Oleg Nesterov wrote:
>
> Just one nit below,

Ah, wait, I forgot about the "trivial" percpu_up_write()...

Doesn't it need mb() before "state = readers_slow" to ensure
"release" semantics?

Oleg.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-29 20:09     ` Oleg Nesterov
@ 2015-05-29 20:41       ` Linus Torvalds
  2015-05-30 20:49         ` Oleg Nesterov
  0 siblings, 1 reply; 36+ messages in thread
From: Linus Torvalds @ 2015-05-29 20:41 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On Fri, May 29, 2015 at 1:09 PM, Oleg Nesterov <oleg@redhat.com> wrote:
>
> Doesn't it need mb() before "state = readers_slow" to ensure
> "release" semantics?

Please don't do any mb() at all. Just do smp_store_release() if
release semantics is what you want.

Basically, strive to avoid "smp_mb()" at all costs. It's insanely
expensive compared to pretty much all other serialization
alternatives.

(And try to make sure to pair it with smp_load_acquire() on the other
side for things to make sense)
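
A generic sketch of that pairing, with made-up 'flag' and 'data' (this is
not the percpu-rwsem code, just the idiom):

	static int data;
	static int flag;

	static void publisher(void)
	{
		data = 42;			/* plain store */
		smp_store_release(&flag, 1);	/* release: 'data' is ordered before 'flag' */
	}

	static void consumer(void)
	{
		if (smp_load_acquire(&flag))	/* acquire: pairs with the release above */
			BUG_ON(data != 42);	/* guaranteed to see the publisher's store */
	}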

                  Linus

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure
  2015-05-26 11:43 ` [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure Peter Zijlstra
@ 2015-05-30 16:58   ` Paul E. McKenney
  2015-05-30 19:16     ` Oleg Nesterov
  0 siblings, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2015-05-30 16:58 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: oleg, tj, mingo, linux-kernel, der.herr, dave, torvalds

On Tue, May 26, 2015 at 01:43:57PM +0200, Peter Zijlstra wrote:
> From: Oleg Nesterov <oleg@redhat.com>
> 
> It is functionally equivalent to
> 
>         struct rcu_sync_struct {
>                 atomic_t counter;
>         };
> 
>         static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
>         {
>                 return atomic_read(&rss->counter) == 0;
>         }
> 
>         static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
>         {
>                 atomic_inc(&rss->counter);
>                 synchronize_sched();
>         }

For vanilla RCU, this is called get_state_synchronize_rcu().

>         static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
>         {
>                 synchronize_sched();
>                 atomic_dec(&rss->counter);
>         }
> 
> except: it records the state and synchronize_sched() is only called by
> rcu_sync_enter() and only if necessary.

Again for vanilla RCU, this is called cond_synchronize_rcu().

These functions are quite small, so the other flavors can be created
easily if needed.
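
In case it helps, the usage pattern for those two helpers is roughly the
following; do_other_work() is just a placeholder for whatever runs between
the snapshot and the check:

	static void do_other_work(void);	/* placeholder */

	static void example_poll_for_gp(void)
	{
		unsigned long cookie;

		cookie = get_state_synchronize_rcu();	/* snapshot the GP state */

		do_other_work();	/* ideally long enough for a GP to elapse */

		/*
		 * No-op if a full grace period has already elapsed since
		 * the snapshot; otherwise falls back to synchronize_rcu().
		 */
		cond_synchronize_rcu(cookie);
	}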

							Thanx, Paul

> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/rcusync.h |   64 ++++++++++++++++++++++++++++
>  kernel/rcu/Makefile     |    2 
>  kernel/rcu/sync.c       |  108 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 173 insertions(+), 1 deletion(-)
> 
> --- /dev/null
> +++ b/include/linux/rcusync.h
> @@ -0,0 +1,64 @@
> +#ifndef _LINUX_RCUSYNC_H_
> +#define _LINUX_RCUSYNC_H_
> +
> +#include <linux/wait.h>
> +#include <linux/rcupdate.h>
> +
> +struct rcu_sync_struct {
> +	int			gp_state;
> +	int			gp_count;
> +	wait_queue_head_t	gp_wait;
> +
> +	int			cb_state;
> +	struct rcu_head		cb_head;
> +
> +	void (*sync)(void);
> +	void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
> +};
> +
> +#define ___RCU_SYNC_INIT(name)						\
> +	.gp_state = 0,							\
> +	.gp_count = 0,							\
> +	.gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),		\
> +	.cb_state = 0
> +
> +#define __RCU_SCHED_SYNC_INIT(name) {					\
> +	___RCU_SYNC_INIT(name),						\
> +	.sync = synchronize_sched,					\
> +	.call = call_rcu_sched,						\
> +}
> +
> +#define __RCU_BH_SYNC_INIT(name) {					\
> +	___RCU_SYNC_INIT(name),						\
> +	.sync = synchronize_rcu_bh,					\
> +	.call = call_rcu_bh,						\
> +}
> +
> +#define __RCU_SYNC_INIT(name) {						\
> +	___RCU_SYNC_INIT(name),						\
> +	.sync = synchronize_rcu,					\
> +	.call = call_rcu,						\
> +}
> +
> +#define DEFINE_RCU_SCHED_SYNC(name)					\
> +	struct rcu_sync_struct name = __RCU_SCHED_SYNC_INIT(name)
> +
> +#define DEFINE_RCU_BH_SYNC(name)					\
> +	struct rcu_sync_struct name = __RCU_BH_SYNC_INIT(name)
> +
> +#define DEFINE_RCU_SYNC(name)						\
> +	struct rcu_sync_struct name = __RCU_SYNC_INIT(name)
> +
> +static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
> +{
> +	return !rss->gp_state; /* GP_IDLE */
> +}
> +
> +enum rcu_sync_type { RCU_SYNC, RCU_SCHED_SYNC, RCU_BH_SYNC };
> +
> +extern void rcu_sync_init(struct rcu_sync_struct *, enum rcu_sync_type);
> +extern void rcu_sync_enter(struct rcu_sync_struct *);
> +extern void rcu_sync_exit(struct rcu_sync_struct *);
> +
> +#endif /* _LINUX_RCUSYNC_H_ */
> +
> --- a/kernel/rcu/Makefile
> +++ b/kernel/rcu/Makefile
> @@ -1,4 +1,4 @@
> -obj-y += update.o
> +obj-y += update.o sync.o
>  obj-$(CONFIG_SRCU) += srcu.o
>  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>  obj-$(CONFIG_TREE_RCU) += tree.o
> --- /dev/null
> +++ b/kernel/rcu/sync.c
> @@ -0,0 +1,108 @@
> +
> +#include <linux/rcusync.h>
> +#include <linux/sched.h>
> +
> +enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
> +enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
> +
> +#define	rss_lock	gp_wait.lock
> +
> +void rcu_sync_init(struct rcu_sync_struct *rss, enum rcu_sync_type type)
> +{
> +	memset(rss, 0, sizeof(*rss));
> +	init_waitqueue_head(&rss->gp_wait);
> +
> +	switch (type) {
> +	case RCU_SYNC:
> +		rss->sync = synchronize_rcu;
> +		rss->call = call_rcu;
> +		break;
> +
> +	case RCU_SCHED_SYNC:
> +		rss->sync = synchronize_sched;
> +		rss->call = call_rcu_sched;
> +		break;
> +
> +	case RCU_BH_SYNC:
> +		rss->sync = synchronize_rcu_bh;
> +		rss->call = call_rcu_bh;
> +		break;
> +	}
> +}
> +
> +void rcu_sync_enter(struct rcu_sync_struct *rss)
> +{
> +	bool need_wait, need_sync;
> +
> +	spin_lock_irq(&rss->rss_lock);
> +	need_wait = rss->gp_count++;
> +	need_sync = rss->gp_state == GP_IDLE;
> +	if (need_sync)
> +		rss->gp_state = GP_PENDING;
> +	spin_unlock_irq(&rss->rss_lock);
> +
> +	BUG_ON(need_wait && need_sync);
> +
> +	if (need_sync) {
> +		rss->sync();
> +		rss->gp_state = GP_PASSED;
> +		wake_up_all(&rss->gp_wait);
> +	} else if (need_wait) {
> +		wait_event(rss->gp_wait, rss->gp_state == GP_PASSED);
> +	} else {
> +		/*
> +		 * Possible when there's a pending CB from a rcu_sync_exit().
> +		 * Nobody has yet been allowed the 'fast' path and thus we can
> +		 * avoid doing any sync(). The callback will get 'dropped'.
> +		 */
> +		BUG_ON(rss->gp_state != GP_PASSED);
> +	}
> +}
> +
> +static void rcu_sync_func(struct rcu_head *rcu)
> +{
> +	struct rcu_sync_struct *rss =
> +		container_of(rcu, struct rcu_sync_struct, cb_head);
> +	unsigned long flags;
> +
> +
> +	BUG_ON(rss->gp_state != GP_PASSED);
> +	BUG_ON(rss->cb_state == CB_IDLE);
> +
> +	spin_lock_irqsave(&rss->rss_lock, flags);
> +	if (rss->gp_count) {
> +		/*
> +		 * A new rcu_sync_enter() has happened; drop the callback.
> +		 */
> +		rss->cb_state = CB_IDLE;
> +	} else if (rss->cb_state == CB_REPLAY) {
> +		/*
> +		 * A new rcu_sync_exit() has happened; requeue the callback
> +		 * to catch a later GP.
> +		 */
> +		rss->cb_state = CB_PENDING;
> +		rss->call(&rss->cb_head, rcu_sync_func);
> +	} else {
> +		/*
> +		 * We're at least a GP after rcu_sync_exit(); everybody will now
> +		 * have observed the write side critical section. Let 'em rip!
> +		 */
> +		rss->cb_state = CB_IDLE;
> +		rss->gp_state = GP_IDLE;
> +	}
> +	spin_unlock_irqrestore(&rss->rss_lock, flags);
> +}
> +
> +void rcu_sync_exit(struct rcu_sync_struct *rss)
> +{
> +	spin_lock_irq(&rss->rss_lock);
> +	if (!--rss->gp_count) {
> +		if (rss->cb_state == CB_IDLE) {
> +			rss->cb_state = CB_PENDING;
> +			rss->call(&rss->cb_head, rcu_sync_func);
> +		} else if (rss->cb_state == CB_PENDING) {
> +			rss->cb_state = CB_REPLAY;
> +		}
> +	}
> +	spin_unlock_irq(&rss->rss_lock);
> +}
> 
> 
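
For reference, a minimal usage sketch of the API quoted above; the my_*
names and call sites are hypothetical, loosely modelled on how the
percpu-rwsem patches use it:

	#include <linux/rcusync.h>
	#include <linux/percpu.h>

	static DEFINE_RCU_SCHED_SYNC(my_sync);		/* sync_sched/call_rcu_sched flavour */
	static unsigned int __percpu *my_refcount;	/* hypothetical; alloc_percpu() not shown */

	static void my_read_lock_slowpath(void)
	{
		/* hypothetical: fall back to a real (rw)sem here */
	}

	/* hypothetical reader: stays on the fast path while no writer is around */
	static void my_read_lock(void)
	{
		preempt_disable();			/* RCU-sched read-side critical section */
		if (likely(rcu_sync_is_idle(&my_sync)))
			__this_cpu_inc(*my_refcount);	/* no atomics, no barriers */
		else
			my_read_lock_slowpath();
		preempt_enable();
	}

	/* hypothetical writer side */
	static void my_write_lock(void)
	{
		rcu_sync_enter(&my_sync);	/* first writer pays one sync_sched(); later ones just wait */
		/* ... drain the per-cpu counts, take the real lock ... */
	}

	static void my_write_unlock(void)
	{
		/* ... drop the real lock ... */
		rcu_sync_exit(&my_sync);	/* re-arms the fast path after a GP, via call_rcu_sched() */
	}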


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-26 11:44 ` [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
  2015-05-29 19:45   ` Oleg Nesterov
@ 2015-05-30 17:18   ` Paul E. McKenney
  2015-05-30 20:04     ` ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact) Oleg Nesterov
  1 sibling, 1 reply; 36+ messages in thread
From: Paul E. McKenney @ 2015-05-30 17:18 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: oleg, tj, mingo, linux-kernel, der.herr, dave, torvalds, josh

On Tue, May 26, 2015 at 01:44:01PM +0200, Peter Zijlstra wrote:
> Currently the percpu-rwsem has two issues:
> 
>  - it switches to (global) atomic ops while a writer is waiting;
>    which could be quite a while and slows down releasing the readers.
> 
>  - it employs synchronize_sched_expedited() _twice_ which is evil and
>    should die -- it shoots IPIs around the machine.
> 
> This patch cures the first problem by ordering the reader-state vs
> reader-count (see the comments in __percpu_down_read() and
> percpu_down_write()). This changes a global atomic op into a full
> memory barrier, which doesn't have the global cacheline contention.
> 
> It cures the second problem by employing the rcu-sync primitives by
> Oleg which reduces to no sync_sched() calls in the 'normal' case of
> no write contention -- global locks had better be rare, and has a
> maximum of one sync_sched() call in case of contention.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/percpu-rwsem.h  |   62 +++++++++-
>  kernel/locking/percpu-rwsem.c |  238 +++++++++++++++++++++---------------------
>  2 files changed, 176 insertions(+), 124 deletions(-)

[ . . . ]

> --- a/kernel/locking/percpu-rwsem.c
> +++ b/kernel/locking/percpu-rwsem.c
> @@ -8,158 +8,164 @@
>  #include <linux/sched.h>
>  #include <linux/errno.h>
> 
> -int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
> +enum { readers_slow, readers_block };
> +
> +int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
>  			const char *name, struct lock_class_key *rwsem_key)
>  {
> -	brw->fast_read_ctr = alloc_percpu(int);
> -	if (unlikely(!brw->fast_read_ctr))
> +	sem->refcount = alloc_percpu(unsigned int);
> +	if (unlikely(!sem->refcount))
>  		return -ENOMEM;
> 
> -	/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
> -	__init_rwsem(&brw->rw_sem, name, rwsem_key);
> -	atomic_set(&brw->write_ctr, 0);
> -	atomic_set(&brw->slow_read_ctr, 0);
> -	init_waitqueue_head(&brw->write_waitq);
> +	sem->state = readers_slow;
> +	rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);

But it looks like you need the RCU-sched variant.  Please see below for
an untested patch providing this support.  One benefit of this patch
is that it does not add any bloat to Tiny RCU.

							Thanx, Paul

------------------------------------------------------------------------

    rcu: Add RCU-sched flavors of get-state and cond-sync
    
    The get_state_synchronize_rcu() and cond_synchronize_rcu() functions
    allow polling for grace-period completion, with an actual wait for a
    grace period occurring only when cond_synchronize_rcu() is called too
    soon after the corresponding get_state_synchronize_rcu().  However,
    these functions work only for vanilla RCU.  This commit adds the
    get_state_synchronize_sched() and cond_synchronize_sched(), which provide
    the same capability for RCU-sched.
    
    Reported-by: Peter Zijlstra (Intel) <peterz@infradead.org>
    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 3df6c1ec4e25..ff968b7af3a4 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -37,6 +37,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate)
 	might_sleep();
 }
 
+static inline unsigned long get_state_synchronize_sched(void)
+{
+	return 0;
+}
+
+static inline void cond_synchronize_sched(unsigned long oldstate)
+{
+	might_sleep();
+}
+
 static inline void rcu_barrier_bh(void)
 {
 	wait_rcu_gp(call_rcu_bh);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 3fa4a43ab415..80e68d344205 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -76,6 +76,8 @@ void rcu_barrier_bh(void);
 void rcu_barrier_sched(void);
 unsigned long get_state_synchronize_rcu(void);
 void cond_synchronize_rcu(unsigned long oldstate);
+unsigned long get_state_synchronize_sched(void);
+void cond_synchronize_sched(unsigned long oldstate);
 
 extern unsigned long rcutorture_testseq;
 extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 1cead7806ca6..f256dee0f6b1 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
 	.deferred_free	= rcu_sched_torture_deferred_free,
 	.sync		= synchronize_sched,
 	.exp_sync	= synchronize_sched_expedited,
+	.get_state	= get_state_synchronize_sched,
+	.cond_sync	= cond_synchronize_sched,
 	.call		= call_rcu_sched,
 	.cb_barrier	= rcu_barrier_sched,
 	.fqs		= rcu_sched_force_quiescent_state,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2fce662fa058..e33e1a8a8d08 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3259,6 +3259,58 @@ void cond_synchronize_rcu(unsigned long oldstate)
 }
 EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
 
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
+{
+	/*
+	 * Any prior manipulation of RCU-protected data must happen
+	 * before the load from ->gpnum.
+	 */
+	smp_mb();  /* ^^^ */
+
+	/*
+	 * Make sure this load happens before the purportedly
+	 * time-consuming work between get_state_synchronize_sched()
+	 * and cond_synchronize_sched().
+	 */
+	return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return.  Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account.  But
+ * counter wrap is harmless.  If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+	unsigned long newstate;
+
+	/*
+	 * Ensure that this load happens before any RCU-destructive
+	 * actions the caller might carry out after we return.
+	 */
+	newstate = smp_load_acquire(&rcu_sched_state.completed);
+	if (ULONG_CMP_GE(oldstate, newstate))
+		synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
+
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
 	/*


^ permalink raw reply related	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure
  2015-05-30 16:58   ` Paul E. McKenney
@ 2015-05-30 19:16     ` Oleg Nesterov
  2015-05-30 19:25       ` Oleg Nesterov
  2015-05-31 16:07       ` Paul E. McKenney
  0 siblings, 2 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-30 19:16 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, tj, mingo, linux-kernel, der.herr, dave, torvalds

On 05/30, Paul E. McKenney wrote:
>
> On Tue, May 26, 2015 at 01:43:57PM +0200, Peter Zijlstra wrote:
> > From: Oleg Nesterov <oleg@redhat.com>
> >
> > It is functionally equivalent to
> >
> >         struct rcu_sync_struct {
> >                 atomic_t counter;
> >         };
> >
> >         static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
> >         {
> >                 return atomic_read(&rss->counter) == 0;
> >         }
> >
> >         static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
> >         {
> >                 atomic_inc(&rss->counter);
> >                 synchronize_sched();
> >         }
>
> For vanilla RCU, this is called get_state_synchronize_rcu().
>
> >         static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
> >         {
> >                 synchronize_sched();
> >                 atomic_dec(&rss->counter);
> >         }
> >
> > except: it records the state and synchronize_sched() is only called by
> > rcu_sync_enter() and only if necessary.
>
> Again for vanilla RCU, this is called cond_synchronize_rcu().

Hmm. I do not understand... I think rcu_sync doesn't need
get_state/cond_synchronize.

The first caller of rcu_sync_enter() always needs sync(). The next one
could use cond_synchronize_rcu(), but for what? The 2nd one will wait
for the end of gp started by the first caller, and this is more optimal?

Note that rcu_sync_enter/rcu_sync_func never call sync() unless strictly
necessary.

Or I misunderstood you?

Oleg.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure
  2015-05-30 19:16     ` Oleg Nesterov
@ 2015-05-30 19:25       ` Oleg Nesterov
  2015-05-31 16:07       ` Paul E. McKenney
  1 sibling, 0 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-30 19:25 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, tj, mingo, linux-kernel, der.herr, dave, torvalds

On 05/30, Oleg Nesterov wrote:
>
> On 05/30, Paul E. McKenney wrote:
> >
> > On Tue, May 26, 2015 at 01:43:57PM +0200, Peter Zijlstra wrote:
> > > From: Oleg Nesterov <oleg@redhat.com>
> > >
> > > It is functionally equivalent to
> > >
> > >         struct rcu_sync_struct {
> > >                 atomic_t counter;
> > >         };
> > >
> > >         static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
> > >         {
> > >                 return atomic_read(&rss->counter) == 0;
> > >         }
> > >
> > >         static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
> > >         {
> > >                 atomic_inc(&rss->counter);
> > >                 synchronize_sched();
> > >         }
> >
> > For vanilla RCU, this is called get_state_synchronize_rcu().
> >
> > >         static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
> > >         {
> > >                 synchronize_sched();
> > >                 atomic_dec(&rss->counter);
> > >         }
> > >
> > > except: it records the state and synchronize_sched() is only called by
> > > rcu_sync_enter() and only if necessary.
> >
> > Again for vanilla RCU, this is called cond_synchronize_rcu().
>
> Hmm. I do not understand... I think rcu_sync doesn't need
> get_state/cond_synchronize.
>
> The first caller of rcu_sync_enter() always needs sync(). The next one
> could use cond_synchronize_rcu(), but for what? The 2nd one will wait
> for the end of gp started by the first caller, and this is more optimal?
>
> Note that rcu_sync_enter/rcu_sync_func never call sync() unless strictly
> necessary.

If you meant that rcu_sync_exit() could use cond_synchronize_rcu(), this
doesn't look right either... We always need another synchronize_rcu() after
the last writer does rcu_sync_exit(). Except rcu_sync_exit() uses call_rcu()
and thus it never blocks.

> Or I misunderstood you?

Yes...

Oleg.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact)
  2015-05-30 17:18   ` Paul E. McKenney
@ 2015-05-30 20:04     ` Oleg Nesterov
  2015-06-16 11:08       ` Peter Zijlstra
  2015-06-19 17:57       ` [tip:perf/urgent] perf: Fix ring_buffer_attach() RCU sync, again tip-bot for Oleg Nesterov
  0 siblings, 2 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-30 20:04 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, tj, mingo, linux-kernel, der.herr, dave, torvalds, josh

On 05/30, Paul E. McKenney wrote:
>
> But it looks like you need the RCU-sched variant.  Please see below for
> an untested patch providing this support.  One benefit of this patch
> is that it does not add any bloat to Tiny RCU.

I don't think so, see the other email. But perhaps I am totally confused;
please correct me.

Well, actually the first writer (need_sync == T) could use it, but I don't
think it makes sense: it calls sync() right after it observes GP_IDLE and
drops the lock, so the window is too small.

> ------------------------------------------------------------------------
>
>     rcu: Add RCU-sched flavors of get-state and cond-sync

However, to me this patch makes sense anyway; I just don't think rcu_sync
or percpu_rw_semaphore can use the new helpers.


And I tried to find other users of get_state/cond_sync. I found
ring_buffer_attach(), and it looks obviously buggy?

Again, perhaps I am totally confused, but don't we need to ensure
that we have "synchronize" _between_ list_del() and list_add() ?

IOW, suppose that ring_buffer_attach() is preempted right after
get_state_synchronize_rcu() and the gp completes before spin_lock().

In this case cond_synchronize_rcu() does nothing and we reuse
->rb_entry without waiting for a gp in between?

Don't we need the patch below? (it also moves the ->rcu_pending check
under "if (rb)", to make it more readable imo).

Peter?

Oleg.

--- x/kernel/events/core.c
+++ x/kernel/events/core.c
@@ -4310,20 +4310,20 @@ static void ring_buffer_attach(struct pe
 		WARN_ON_ONCE(event->rcu_pending);
 
 		old_rb = event->rb;
-		event->rcu_batches = get_state_synchronize_rcu();
-		event->rcu_pending = 1;
-
 		spin_lock_irqsave(&old_rb->event_lock, flags);
 		list_del_rcu(&event->rb_entry);
 		spin_unlock_irqrestore(&old_rb->event_lock, flags);
-	}
 
-	if (event->rcu_pending && rb) {
-		cond_synchronize_rcu(event->rcu_batches);
-		event->rcu_pending = 0;
+		event->rcu_batches = get_state_synchronize_rcu();
+		event->rcu_pending = 1;
 	}
 
 	if (rb) {
+		if (event->rcu_pending) {
+			cond_synchronize_rcu(event->rcu_batches);
+			event->rcu_pending = 0;
+		}
+
 		spin_lock_irqsave(&rb->event_lock, flags);
 		list_add_rcu(&event->rb_entry, &rb->event_list);
 		spin_unlock_irqrestore(&rb->event_lock, flags);


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-29 20:41       ` Linus Torvalds
@ 2015-05-30 20:49         ` Oleg Nesterov
  2015-06-16 11:48           ` Peter Zijlstra
  0 siblings, 1 reply; 36+ messages in thread
From: Oleg Nesterov @ 2015-05-30 20:49 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On 05/29, Linus Torvalds wrote:
>
> On Fri, May 29, 2015 at 1:09 PM, Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > Doesn't it need mb() before "state = readers_slow" to ensure
> > "release" semantics?
>
> Please don't do any mb() at all. Just do smp_store_release() if
> release semantics is what you want.

Agreed!

> (And try to make sure to pair it with smp_load_acquire() on the other
> side for things to make sense)

Hmm. I think you are right...

__percpu_down_read() lacks another mb() after the "state != BLOCK"
check for the same reason, and we can use smp_load_acquire(state)
instead.

Oleg.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure
  2015-05-30 19:16     ` Oleg Nesterov
  2015-05-30 19:25       ` Oleg Nesterov
@ 2015-05-31 16:07       ` Paul E. McKenney
  1 sibling, 0 replies; 36+ messages in thread
From: Paul E. McKenney @ 2015-05-31 16:07 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Peter Zijlstra, tj, mingo, linux-kernel, der.herr, dave, torvalds

On Sat, May 30, 2015 at 09:16:52PM +0200, Oleg Nesterov wrote:
> On 05/30, Paul E. McKenney wrote:
> >
> > On Tue, May 26, 2015 at 01:43:57PM +0200, Peter Zijlstra wrote:
> > > From: Oleg Nesterov <oleg@redhat.com>
> > >
> > > It is functionally equivalent to
> > >
> > >         struct rcu_sync_struct {
> > >                 atomic_t counter;
> > >         };
> > >
> > >         static inline bool rcu_sync_is_idle(struct rcu_sync_struct *rss)
> > >         {
> > >                 return atomic_read(&rss->counter) == 0;
> > >         }
> > >
> > >         static inline void rcu_sync_enter(struct rcu_sync_struct *rss)
> > >         {
> > >                 atomic_inc(&rss->counter);
> > >                 synchronize_sched();
> > >         }
> >
> > For vanilla RCU, this is called get_state_synchronize_rcu().
> >
> > >         static inline void rcu_sync_exit(struct rcu_sync_struct *rss)
> > >         {
> > >                 synchronize_sched();
> > >                 atomic_dec(&rss->counter);
> > >         }
> > >
> > > except: it records the state and synchronize_sched() is only called by
> > > rcu_sync_enter() and only if necessary.
> >
> > Again for vanilla RCU, this is called cond_synchronize_rcu().
> 
> Hmm. I do not understand... I think rcu_sync doesn't need
> get_state/cond_synchronize.
> 
> The first caller of rcu_sync_enter() always needs sync(). The next one
> could use cond_synchronize_rcu(), but for what? The 2nd one will wait
> for the end of gp started by the first caller, and this is more optimal?
> 
> Note that rcu_sync_enter/rcu_sync_func never call sync() unless strictly
> necessary.
> 
> Or I misunderstood you?

Probably me forgetting exactly what your APIs above are doing.

I clearly need to take a closer look when fully awake.  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-05-26 21:57     ` Linus Torvalds
  2015-05-27  9:28       ` Nicholas Mc Guire
@ 2015-06-05  1:45       ` Al Viro
  2015-06-05 21:08         ` Oleg Nesterov
  1 sibling, 1 reply; 36+ messages in thread
From: Al Viro @ 2015-06-05  1:45 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Davidlohr Bueso, Peter Zijlstra, Oleg Nesterov, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr

On Tue, May 26, 2015 at 02:57:53PM -0700, Linus Torvalds wrote:

> Because that is another example of a complete failure of a locking
> primitive that was just too specialized to be worth it.

<notices stale include in fs/file_table.c and removes it>

FWIW, I hadn't really looked into stop_machine uses, but fs/locks.c one
is really not all that great - there we have a large trashcan of a list
(every file_lock on the system) and the only use of that list is /proc/locks
output generation.  Sure, additions take this CPU's spinlock.  And removals
take pretty much a random one - losing the timeslice and regaining it on
a different CPU is quite likely with the uses there.

Why do we need a global lock there, anyway?  Why not hold only one for
the chain currently being traversed?  Sure, we'll need to get and drop
them in ->next() that way; so what?

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-06-05  1:45       ` Al Viro
@ 2015-06-05 21:08         ` Oleg Nesterov
  2015-06-05 22:11           ` Al Viro
  0 siblings, 1 reply; 36+ messages in thread
From: Oleg Nesterov @ 2015-06-05 21:08 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Davidlohr Bueso, Peter Zijlstra, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr

On 06/05, Al Viro wrote:
>
> FWIW, I hadn't really looked into stop_machine uses, but fs/locks.c one
> is really not all that great - there we have a large trashcan of a list
> (every file_lock on the system) and the only use of that list is /proc/locks
> output generation.  Sure, additions take this CPU's spinlock.  And removals
> take pretty much a random one - losing the timeslice and regaining it on
> a different CPU is quite likely with the uses there.
>
> Why do we need a global lock there, anyway?  Why not hold only one for
> the chain currently being traversed?  Sure, we'll need to get and drop
> them in ->next() that way; so what?

And note that fs/seq_file.c:seq_hlist_next_percpu() has no other users.

And given that locks_delete_global_locks() takes the random lock anyway,
perhaps the hashed lists/locking makes no sense, I dunno.

Oleg.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-06-05 21:08         ` Oleg Nesterov
@ 2015-06-05 22:11           ` Al Viro
  2015-06-05 23:36             ` Oleg Nesterov
  0 siblings, 1 reply; 36+ messages in thread
From: Al Viro @ 2015-06-05 22:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Linus Torvalds, Davidlohr Bueso, Peter Zijlstra, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr

On Fri, Jun 05, 2015 at 11:08:57PM +0200, Oleg Nesterov wrote:
> On 06/05, Al Viro wrote:
> >
> > FWIW, I hadn't really looked into stop_machine uses, but fs/locks.c one
> > is really not all that great - there we have a large trashcan of a list
> > (every file_lock on the system) and the only use of that list is /proc/locks
> > output generation.  Sure, additions take this CPU's spinlock.  And removals
> > take pretty much a random one - losing the timeslice and regaining it on
> > a different CPU is quite likely with the uses there.
> >
> > Why do we need a global lock there, anyway?  Why not hold only one for
> > the chain currently being traversed?  Sure, we'll need to get and drop
> > them in ->next() that way; so what?
> 
> And note that fs/seq_file.c:seq_hlist_next_percpu() has no other users.
> 
> And given that locks_delete_global_locks() takes the random lock anyway,
> perhaps the hashed lists/locking makes no sense, I dunno.

It's not about making life easier for /proc/locks; it's about not screwing
those who add/remove file_lock...  And no, that "random lock" isn't held
when modifying the (per-cpu) lists - it protects the list hanging off each
element of the global list, and /proc/locks scans those lists, so rather
than taking/dropping it in each ->show(), it's taken once in ->start()...

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 0/5] Optimize percpu-rwsem
  2015-06-05 22:11           ` Al Viro
@ 2015-06-05 23:36             ` Oleg Nesterov
  0 siblings, 0 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-06-05 23:36 UTC (permalink / raw)
  To: Al Viro
  Cc: Linus Torvalds, Davidlohr Bueso, Peter Zijlstra, Paul McKenney,
	Tejun Heo, Ingo Molnar, Linux Kernel Mailing List, der.herr

On 06/05, Al Viro wrote:
>
> On Fri, Jun 05, 2015 at 11:08:57PM +0200, Oleg Nesterov wrote:
> > On 06/05, Al Viro wrote:
> > >
> > > FWIW, I hadn't really looked into stop_machine uses, but fs/locks.c one
> > > is really not all that great - there we have a large trashcan of a list
> > > (every file_lock on the system) and the only use of that list is /proc/locks
> > > output generation.  Sure, additions take this CPU's spinlock.  And removals
> > > take pretty much a random one - losing the timeslice and regaining it on
> > > a different CPU is quite likely with the uses there.
> > >
> > > Why do we need a global lock there, anyway?  Why not hold only one for
> > > the chain currently being traversed?  Sure, we'll need to get and drop
> > > them in ->next() that way; so what?
> >
> > And note that fs/seq_file.c:seq_hlist_next_percpu() has no other users.
> >
> > And given that locks_delete_global_locks() takes the random lock anyway,
> > perhaps the hashed lists/locking makes no sense, I dunno.
>
> It's not about making life easier for /proc/locks; it's about not screwing
> those who add/remove file_lock...

I meant, seq_hlist_next_percpu() could be "static" in fs/locks.c.

> And no, that "random lock" isn't held
> when modifying the (per-cpu) lists - it protects the list hanging off each
> element of the global list, and /proc/locks scans those lists, so rather
> than taking/dropping it in each ->show(), it's taken once in ->start()...

Sure, I understand. I meant that (perhaps) something like

	struct {
		spinlock_t lock;
		struct hlist_head head;
	} file_lock_list[];


	locks_insert_global_locks(fl)
	{
		int idx = fl->idx = hash(fl);
		spin_lock(&file_lock_list[idx].lock);
		hlist_add_head(...);
		spin_unlock(...);
	}

seq_hlist_next_percpu() could scan file_lock_list[] and unlock/lock ->lock
when it changes the index.
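
The /proc/locks side could then walk one bucket at a time, taking only that
bucket's lock. A purely illustrative sketch on top of the structure above
(bucket count and the show helper are made up; the real code would go
through seq_file ->start/->next/->show):

	#define FL_HASH_BUCKETS	128				/* made up */

	static void show_one_lock(struct seq_file *m, struct file_lock *fl);	/* hypothetical */

	static void print_all_locks(struct seq_file *m)
	{
		struct file_lock *fl;
		int idx;

		for (idx = 0; idx < FL_HASH_BUCKETS; idx++) {
			spin_lock(&file_lock_list[idx].lock);	/* only this chain's lock */
			hlist_for_each_entry(fl, &file_lock_list[idx].head, fl_link)
				show_one_lock(m, fl);
			spin_unlock(&file_lock_list[idx].lock);
		}
	}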

But please forget it, this is really minor. I just think that file_lock_list
is not actually "per-cpu", exactly because every locks_delete_global_locks()
needs lg_local_lock_cpu(fl->fl_link_cpu), as you pointed out.

Oleg.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact)
  2015-05-30 20:04     ` ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact) Oleg Nesterov
@ 2015-06-16 11:08       ` Peter Zijlstra
  2015-06-16 11:16         ` Peter Zijlstra
  2015-06-19 17:57       ` [tip:perf/urgent] perf: Fix ring_buffer_attach() RCU sync, again tip-bot for Oleg Nesterov
  1 sibling, 1 reply; 36+ messages in thread
From: Peter Zijlstra @ 2015-06-16 11:08 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Paul E. McKenney, tj, mingo, linux-kernel, der.herr, dave,
	torvalds, josh

On Sat, May 30, 2015 at 10:04:25PM +0200, Oleg Nesterov wrote:
> And. I tried to find other users of get_state/cond_sync. Found
> ring_buffer_attach() and it looks obviously buggy?

Urgh, indeed.

> IOW. Suppose that ring_buffer_attach() preempts right_after
> get_state_synchronize_rcu() and gp completes before spin_lock().
> 
> In this case cond_synchronize_rcu() does nothing and we reuse
> ->rb_entry without waiting for gp in between?

Yes.

> Don't we need the patch below? (it also moves the ->rcu_pending check
> under "if (rb)", to make it more readable imo).

> --- x/kernel/events/core.c
> +++ x/kernel/events/core.c
> @@ -4310,20 +4310,20 @@ static void ring_buffer_attach(struct pe
>  		WARN_ON_ONCE(event->rcu_pending);
>  
>  		spin_lock_irqsave(&old_rb->event_lock, flags);
>  		list_del_rcu(&event->rb_entry);
>  		spin_unlock_irqrestore(&old_rb->event_lock, flags);
>  
> +		event->rcu_batches = get_state_synchronize_rcu();
> +		event->rcu_pending = 1;
>  	}
>  
>  	if (rb) {
> +		if (event->rcu_pending) {
> +			cond_synchronize_rcu(event->rcu_batches);
> +			event->rcu_pending = 0;
> +		}
> +
>  		spin_lock_irqsave(&rb->event_lock, flags);
>  		list_add_rcu(&event->rb_entry, &rb->event_list);
>  		spin_unlock_irqrestore(&rb->event_lock, flags);

Agreed.


^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact)
  2015-06-16 11:08       ` Peter Zijlstra
@ 2015-06-16 11:16         ` Peter Zijlstra
  2015-06-16 19:03           ` Oleg Nesterov
  0 siblings, 1 reply; 36+ messages in thread
From: Peter Zijlstra @ 2015-06-16 11:16 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Paul E. McKenney, tj, mingo, linux-kernel, der.herr, dave,
	torvalds, josh


I made that the below.

Are you OK with the Changelog edits and the added SoB ?

---
Subject: perf: Fix ring_buffer_attach() RCU sync, again.
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 30 May 2015 22:04:25 +0200

While looking for other users of get_state/cond_sync. I Found
ring_buffer_attach() and it looks obviously buggy?

Don't we need to ensure that we have "synchronize" _between_
list_del() and list_add() ?

IOW. Suppose that ring_buffer_attach() preempts right_after
get_state_synchronize_rcu() and gp completes before spin_lock().

In this case cond_synchronize_rcu() does nothing and we reuse
->rb_entry without waiting for gp in between?

It also moves the ->rcu_pending check under "if (rb)", to make it
more readable imo.

Cc: tj@kernel.org
Cc: mingo@redhat.com
Cc: der.herr@hofr.at
Cc: dave@stgolabs.net
Cc: torvalds@linux-foundation.org
Cc: josh@joshtriplett.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Fixes: b69cf53640da ("perf: Fix a race between ring_buffer_detach() and ring_buffer_attach()")
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150530200425.GA15748@redhat.com
---
 kernel/events/core.c |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4307,20 +4307,20 @@ static void ring_buffer_attach(struct pe
 		WARN_ON_ONCE(event->rcu_pending);
 
 		old_rb = event->rb;
-		event->rcu_batches = get_state_synchronize_rcu();
-		event->rcu_pending = 1;
-
 		spin_lock_irqsave(&old_rb->event_lock, flags);
 		list_del_rcu(&event->rb_entry);
 		spin_unlock_irqrestore(&old_rb->event_lock, flags);
-	}
 
-	if (event->rcu_pending && rb) {
-		cond_synchronize_rcu(event->rcu_batches);
-		event->rcu_pending = 0;
+		event->rcu_batches = get_state_synchronize_rcu();
+		event->rcu_pending = 1;
 	}
 
 	if (rb) {
+		if (event->rcu_pending) {
+			cond_synchronize_rcu(event->rcu_batches);
+			event->rcu_pending = 0;
+		}
+
 		spin_lock_irqsave(&rb->event_lock, flags);
 		list_add_rcu(&event->rb_entry, &rb->event_list);
 		spin_unlock_irqrestore(&rb->event_lock, flags);

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact
  2015-05-30 20:49         ` Oleg Nesterov
@ 2015-06-16 11:48           ` Peter Zijlstra
  0 siblings, 0 replies; 36+ messages in thread
From: Peter Zijlstra @ 2015-06-16 11:48 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Linus Torvalds, Paul McKenney, Tejun Heo, Ingo Molnar,
	Linux Kernel Mailing List, der.herr, Davidlohr Bueso

On Sat, May 30, 2015 at 10:49:00PM +0200, Oleg Nesterov wrote:
> > On Fri, May 29, 2015 at 1:09 PM, Oleg Nesterov <oleg@redhat.com> wrote:
> > >
> > > Doesn't it need mb() before "state = readers_slow" to ensure
> > > "release" semantics?

> __percpu_down_read() lacks another mb() after the "state != BLOCK"
> check for the same reason, and we can use smp_load_acquire(state)
> instead.

I made the below modification to the patch.

---
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -51,7 +51,11 @@ void __percpu_down_read(struct percpu_rw
 
 	smp_mb(); /* A matches D */
 
-	if (likely(sem->state != readers_block))
+	/*
+	 * If !readers_block the critical section starts here, matched by the
+	 * release in percpu_up_write().
+	 */
+	if (likely(smp_load_acquire(&sem->state) != readers_block))
 		return;
 
 	/*
@@ -154,8 +158,11 @@ void percpu_up_write(struct percpu_rw_se
 	 * One reason that we cannot just immediately flip to readers_fast is
 	 * that new readers might fail to see the results of this writer's
 	 * critical section.
+	 *
+	 * Therefore we force it through the slow path which guarantees an
+	 * acquire and thereby guarantees the critical section's consistency.
 	 */
-	sem->state = readers_slow;
+	smp_store_release(&sem->state, readers_slow);
 
 	/*
 	 * Release the write lock, this will allow readers back in the game.

^ permalink raw reply	[flat|nested] 36+ messages in thread

* Re: ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact)
  2015-06-16 11:16         ` Peter Zijlstra
@ 2015-06-16 19:03           ` Oleg Nesterov
  0 siblings, 0 replies; 36+ messages in thread
From: Oleg Nesterov @ 2015-06-16 19:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, tj, mingo, linux-kernel, der.herr, dave,
	torvalds, josh

On 06/16, Peter Zijlstra wrote:
>
>
> I made that the below.
>
> Are you OK with the Changelog edits and the added SoB ?

Of course,

Thanks Peter!

Oleg.

> ---
> Subject: perf: Fix ring_buffer_attach() RCU sync, again.
> From: Oleg Nesterov <oleg@redhat.com>
> Date: Sat, 30 May 2015 22:04:25 +0200
>
> While looking for other users of get_state/cond_sync. I Found
> ring_buffer_attach() and it looks obviously buggy?
>
> Don't we need to ensure that we have "synchronize" _between_
> list_del() and list_add() ?
>
> IOW. Suppose that ring_buffer_attach() preempts right_after
> get_state_synchronize_rcu() and gp completes before spin_lock().
>
> In this case cond_synchronize_rcu() does nothing and we reuse
> ->rb_entry without waiting for gp in between?
>
> It also moves the ->rcu_pending check under "if (rb)", to make it
> more readable imo.
>
> Cc: tj@kernel.org
> Cc: mingo@redhat.com
> Cc: der.herr@hofr.at
> Cc: dave@stgolabs.net
> Cc: torvalds@linux-foundation.org
> Cc: josh@joshtriplett.org
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
> Fixes: b69cf53640da ("perf: Fix a race between ring_buffer_detach() and ring_buffer_attach()")
> Signed-off-by: Oleg Nesterov <oleg@redhat.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> Link: http://lkml.kernel.org/r/20150530200425.GA15748@redhat.com
> ---
>  kernel/events/core.c |   14 +++++++-------
>  1 file changed, 7 insertions(+), 7 deletions(-)
>
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -4307,20 +4307,20 @@ static void ring_buffer_attach(struct pe
>  		WARN_ON_ONCE(event->rcu_pending);
>
>  		old_rb = event->rb;
> -		event->rcu_batches = get_state_synchronize_rcu();
> -		event->rcu_pending = 1;
> -
>  		spin_lock_irqsave(&old_rb->event_lock, flags);
>  		list_del_rcu(&event->rb_entry);
>  		spin_unlock_irqrestore(&old_rb->event_lock, flags);
> -	}
>
> -	if (event->rcu_pending && rb) {
> -		cond_synchronize_rcu(event->rcu_batches);
> -		event->rcu_pending = 0;
> +		event->rcu_batches = get_state_synchronize_rcu();
> +		event->rcu_pending = 1;
>  	}
>
>  	if (rb) {
> +		if (event->rcu_pending) {
> +			cond_synchronize_rcu(event->rcu_batches);
> +			event->rcu_pending = 0;
> +		}
> +
>  		spin_lock_irqsave(&rb->event_lock, flags);
>  		list_add_rcu(&event->rb_entry, &rb->event_list);
>  		spin_unlock_irqrestore(&rb->event_lock, flags);


^ permalink raw reply	[flat|nested] 36+ messages in thread

* [tip:perf/urgent] perf: Fix ring_buffer_attach() RCU sync, again
  2015-05-30 20:04     ` ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact) Oleg Nesterov
  2015-06-16 11:08       ` Peter Zijlstra
@ 2015-06-19 17:57       ` tip-bot for Oleg Nesterov
  1 sibling, 0 replies; 36+ messages in thread
From: tip-bot for Oleg Nesterov @ 2015-06-19 17:57 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: paulmck, bp, brgerst, mingo, luto, hpa, alexander.shishkin, oleg,
	akpm, peterz, linux-kernel, torvalds, dvlasenk, tglx

Commit-ID:  2f993cf093643b98477c421fa2b9a98dcc940323
Gitweb:     http://git.kernel.org/tip/2f993cf093643b98477c421fa2b9a98dcc940323
Author:     Oleg Nesterov <oleg@redhat.com>
AuthorDate: Sat, 30 May 2015 22:04:25 +0200
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Fri, 19 Jun 2015 09:38:45 +0200

perf: Fix ring_buffer_attach() RCU sync, again

While looking for other users of get_state/cond_sync. I Found
ring_buffer_attach() and it looks obviously buggy?

Don't we need to ensure that we have "synchronize" _between_
list_del() and list_add() ?

IOW. Suppose that ring_buffer_attach() preempts right_after
get_state_synchronize_rcu() and gp completes before spin_lock().

In this case cond_synchronize_rcu() does nothing and we reuse
->rb_entry without waiting for gp in between?

It also moves the ->rcu_pending check under "if (rb)", to make it
more readable imo.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: der.herr@hofr.at
Cc: josh@joshtriplett.org
Cc: tj@kernel.org
Fixes: b69cf53640da ("perf: Fix a race between ring_buffer_detach() and ring_buffer_attach()")
Link: http://lkml.kernel.org/r/20150530200425.GA15748@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index eddf1ed..0ceb386 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4331,20 +4331,20 @@ static void ring_buffer_attach(struct perf_event *event,
 		WARN_ON_ONCE(event->rcu_pending);
 
 		old_rb = event->rb;
-		event->rcu_batches = get_state_synchronize_rcu();
-		event->rcu_pending = 1;
-
 		spin_lock_irqsave(&old_rb->event_lock, flags);
 		list_del_rcu(&event->rb_entry);
 		spin_unlock_irqrestore(&old_rb->event_lock, flags);
-	}
 
-	if (event->rcu_pending && rb) {
-		cond_synchronize_rcu(event->rcu_batches);
-		event->rcu_pending = 0;
+		event->rcu_batches = get_state_synchronize_rcu();
+		event->rcu_pending = 1;
 	}
 
 	if (rb) {
+		if (event->rcu_pending) {
+			cond_synchronize_rcu(event->rcu_batches);
+			event->rcu_pending = 0;
+		}
+
 		spin_lock_irqsave(&rb->event_lock, flags);
 		list_add_rcu(&event->rb_entry, &rb->event_list);
 		spin_unlock_irqrestore(&rb->event_lock, flags);
--

^ permalink raw reply related	[flat|nested] 36+ messages in thread

end of thread, other threads:[~2015-06-19 17:59 UTC | newest]

Thread overview: 36+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-05-26 11:43 [RFC][PATCH 0/5] Optimize percpu-rwsem Peter Zijlstra
2015-05-26 11:43 ` [RFC][PATCH 1/5] rcu: Create rcu_sync infrastructure Peter Zijlstra
2015-05-30 16:58   ` Paul E. McKenney
2015-05-30 19:16     ` Oleg Nesterov
2015-05-30 19:25       ` Oleg Nesterov
2015-05-31 16:07       ` Paul E. McKenney
2015-05-26 11:43 ` [RFC][PATCH 2/5] rcusync: Introduce struct rcu_sync_ops Peter Zijlstra
2015-05-26 11:43 ` [RFC][PATCH 3/5] rcusync: Add the CONFIG_PROVE_RCU checks Peter Zijlstra
2015-05-26 11:44 ` [RFC][PATCH 4/5] rcusync: Introduce rcu_sync_dtor() Peter Zijlstra
2015-05-26 11:44 ` [RFC][PATCH 5/5] percpu-rwsem: Optimize readers and reduce global impact Peter Zijlstra
2015-05-29 19:45   ` Oleg Nesterov
2015-05-29 20:09     ` Oleg Nesterov
2015-05-29 20:41       ` Linus Torvalds
2015-05-30 20:49         ` Oleg Nesterov
2015-06-16 11:48           ` Peter Zijlstra
2015-05-30 17:18   ` Paul E. McKenney
2015-05-30 20:04     ` ring_buffer_attach && cond_synchronize_rcu (Was: percpu-rwsem: Optimize readers and reduce global impact) Oleg Nesterov
2015-06-16 11:08       ` Peter Zijlstra
2015-06-16 11:16         ` Peter Zijlstra
2015-06-16 19:03           ` Oleg Nesterov
2015-06-19 17:57       ` [tip:perf/urgent] perf: Fix ring_buffer_attach() RCU sync, again tip-bot for Oleg Nesterov
2015-05-26 18:12 ` [RFC][PATCH 0/5] Optimize percpu-rwsem Linus Torvalds
2015-05-26 18:34   ` Peter Zijlstra
2015-05-26 18:35   ` Tejun Heo
2015-05-26 18:42   ` Davidlohr Bueso
2015-05-26 21:57     ` Linus Torvalds
2015-05-27  9:28       ` Nicholas Mc Guire
2015-06-05  1:45       ` Al Viro
2015-06-05 21:08         ` Oleg Nesterov
2015-06-05 22:11           ` Al Viro
2015-06-05 23:36             ` Oleg Nesterov
2015-05-27  6:53     ` Peter Zijlstra
2015-05-26 18:57   ` Oleg Nesterov
2015-05-26 19:13     ` Oleg Nesterov
2015-05-26 19:29     ` Oleg Nesterov
2015-05-26 19:54 ` Davidlohr Bueso
