linux-kernel.vger.kernel.org archive mirror
* [RFC][PATCH] srcu: Implement call_srcu()
@ 2012-01-31 13:32 Peter Zijlstra
  2012-01-31 13:47 ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2012-01-31 13:32 UTC (permalink / raw)
  To: Paul E. McKenney, Oleg Nesterov; +Cc: Avi Kivity, linux-kernel

Subject: srcu: Implement call_srcu()
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon Jan 30 23:20:49 CET 2012

Implement call_srcu() by using a state machine driven by
call_rcu_sched() and timer callbacks.

The state machine is a direct derivation of the existing
synchronize_srcu() code and replaces synchronize_sched() calls with a
call_rcu_sched() callback and the schedule_timeout() calls with simple
timer callbacks.

It then re-implements synchronize_srcu() using a completion where we
send the complete through call_srcu().

It completely wrecks synchronize_srcu_expedited() which is only used
by KVM. 3 of the 5 use cases look like they really want to use
call_srcu() instead, the remaining 2 I don't know but hope they can,
which would let us remove it.
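
A usage sketch, for illustration only (hypothetical caller, not part of
this patch): call_srcu() is meant to be used like call_rcu(), with an
rcu_head embedded in the protected object and the object freed from the
callback once all SRCU readers that could still see it have finished.

  struct foo {
	struct rcu_head rcu;
	/* ... payload ... */
  };

  static void foo_free_cb(struct rcu_head *head)
  {
	kfree(container_of(head, struct foo, rcu));
  }

  static void foo_retire(struct srcu_struct *sp, struct foo *f)
  {
	call_srcu(sp, &f->rcu, foo_free_cb);
  }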

Compile tested only!! :-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/srcu.h |   23 +++
 kernel/srcu.c        |  304 +++++++++++++++++++++++++++++----------------------
 2 files changed, 196 insertions(+), 131 deletions(-)

--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -27,17 +27,35 @@
 #ifndef _LINUX_SRCU_H
 #define _LINUX_SRCU_H
 
-#include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/rcupdate.h>
+#include <linux/timer.h>
 
 struct srcu_struct_array {
 	int c[2];
 };
 
+enum srcu_state {
+	srcu_idle,
+	srcu_sync_1,
+	srcu_sync_2,
+	srcu_sync_2b,
+	srcu_wait,
+	srcu_wait_b,
+	srcu_sync_3,
+	srcu_sync_3b,
+};
+
 struct srcu_struct {
 	int completed;
 	struct srcu_struct_array __percpu *per_cpu_ref;
-	struct mutex mutex;
+	raw_spinlock_t lock;
+	enum srcu_state state;
+	union {
+		struct rcu_head head;
+		struct timer_list timer;
+	};
+	struct rcu_head *pending[2];
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
@@ -73,6 +91,7 @@ void __srcu_read_unlock(struct srcu_stru
 void synchronize_srcu(struct srcu_struct *sp);
 void synchronize_srcu_expedited(struct srcu_struct *sp);
 long srcu_batches_completed(struct srcu_struct *sp);
+void call_srcu(struct srcu_struct *sp, struct rcu_head *head, void (*func)(struct rcu_head *));
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,6 +16,7 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) 2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Author: Paul McKenney <paulmck@us.ibm.com>
  *
@@ -33,11 +34,14 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/srcu.h>
+#include <linux/completion.h>
 
 static int init_srcu_struct_fields(struct srcu_struct *sp)
 {
 	sp->completed = 0;
-	mutex_init(&sp->mutex);
+	raw_spin_lock_init(&sp->lock);
+	sp->state = srcu_idle;
+	sp->pending[0] = sp->pending[1] = NULL;
 	sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
 	return sp->per_cpu_ref ? 0 : -ENOMEM;
 }
@@ -155,119 +159,190 @@ void __srcu_read_unlock(struct srcu_stru
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited().  We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections.  If there are still some readers after 10 microseconds,
- * we repeatedly block for 1-millisecond time periods.  This approach
- * has done well in testing, so there is no need for a config parameter.
+
+/**
+ * synchronize_srcu_expedited - like synchronize_srcu, but less patient
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Note that it is illegal to call synchronize_srcu_expedited()
+ * from the corresponding SRCU read-side critical section; doing so
+ * will result in deadlock.  However, it is perfectly legal to call
+ * synchronize_srcu_expedited() on one srcu_struct from some other
+ * srcu_struct's read-side critical section.
  */
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+	/* XXX kill me */
+	synchronize_srcu(sp);
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
 
-/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
  */
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+long srcu_batches_completed(struct srcu_struct *sp)
 {
-	int idx;
+	return sp->completed;
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+static void do_srcu_state(struct srcu_struct *sp);
 
-	idx = sp->completed;
-	mutex_lock(&sp->mutex);
+static void do_srcu_state_timer(unsigned long __data)
+{
+	struct srcu_struct *sp = (void *)__data;
+	do_srcu_state(sp);
+}
 
-	/*
-	 * Check to see if someone else did the work for us while we were
-	 * waiting to acquire the lock.  We need -two- advances of
-	 * the counter, not just one.  If there was but one, we might have
-	 * shown up -after- our helper's first synchronize_sched(), thus
-	 * having failed to prevent CPU-reordering races with concurrent
-	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
-	 * either (1) wait for two or (2) supply the second ourselves.
-	 */
+static void do_srcu_state_rcu(struct rcu_head *head)
+{
+	struct srcu_struct *sp = container_of(head, struct srcu_struct, head);
+	do_srcu_state(sp);
+}
 
-	if ((sp->completed - idx) >= 2) {
-		mutex_unlock(&sp->mutex);
-		return;
+static void do_srcu_state(struct srcu_struct *sp)
+{
+	struct rcu_head *head, *next;
+	unsigned long flags;
+	int idx;
+
+	raw_spin_lock_irqsave(&sp->lock, flags);
+	switch (sp->state) {
+	case srcu_idle:
+		BUG();
+
+	case srcu_sync_1:
+		/*
+		 * The preceding synchronize_sched() ensures that any CPU that
+		 * sees the new value of sp->completed will also see any
+		 * preceding changes to data structures made by this CPU.  This
+		 * prevents some other CPU from reordering the accesses in its
+		 * SRCU read-side critical section to precede the corresponding
+		 * srcu_read_lock() -- ensuring that such references will in
+		 * fact be protected.
+		 *
+		 * So it is now safe to do the flip.
+		 */
+		idx = sp->completed & 0x1;
+		sp->completed++;
+
+		sp->state = srcu_sync_2 + idx;
+		call_rcu_sched(&sp->head, do_srcu_state_rcu);
+		break;
+
+	case srcu_sync_2:
+	case srcu_sync_2b:
+		idx = sp->state - srcu_sync_2;
+
+		init_timer(&sp->timer);
+		sp->timer.data = (unsigned long)sp;
+		sp->timer.function = do_srcu_state_timer;
+		sp->state = srcu_wait + idx;
+
+		/*
+		 * At this point, because of the preceding synchronize_sched(),
+		 * all srcu_read_lock() calls using the old counters have
+		 * completed. Their corresponding critical sections might well
+		 * be still executing, but the srcu_read_lock() primitives
+		 * themselves will have finished executing.
+		 */
+test_pending:
+		if (!srcu_readers_active_idx(sp, idx)) {
+			sp->state = srcu_sync_3 + idx;
+			call_rcu_sched(&sp->head, do_srcu_state_rcu);
+			break;
+		}
+
+		mod_timer(&sp->timer, jiffies + 1);
+		break;
+
+	case srcu_wait:
+	case srcu_wait_b:
+		idx = sp->state - srcu_wait;
+		goto test_pending;
+
+	case srcu_sync_3:
+	case srcu_sync_3b:
+		idx = sp->state - srcu_sync_3;
+		/*
+		 * The preceding synchronize_sched() forces all
+		 * srcu_read_unlock() primitives that were executing
+		 * concurrently with the preceding for_each_possible_cpu() loop
+		 * to have completed by this point. More importantly, it also
+		 * forces the corresponding SRCU read-side critical sections to
+		 * have also completed, and the corresponding references to
+		 * SRCU-protected data items to be dropped.
+		 */
+		head = sp->pending[idx];
+		sp->pending[idx] = NULL;
+		raw_spin_unlock(&sp->lock);
+		while (head) {
+			next = head->next;
+			head->func(head);
+			head = next;
+		}
+		raw_spin_lock(&sp->lock);
+
+		/*
+		 * If there's a new batch waiting...
+		 */
+		if (sp->pending[idx ^ 1]) {
+			sp->state = srcu_sync_1;
+			call_rcu_sched(&sp->head, do_srcu_state_rcu);
+			break;
+		}
+
+		/*
+		 * We done!!
+		 */
+		sp->state = srcu_idle;
+		break;
 	}
+	raw_spin_unlock_irqrestore(&sp->lock, flags);
+}
 
-	sync_func();  /* Force memory barrier on all CPUs. */
+void call_srcu(struct srcu_struct *sp,
+	       struct rcu_head *head, void (*func)(struct rcu_head *))
+{
+	unsigned long flags;
+	int idx;
 
-	/*
-	 * The preceding synchronize_sched() ensures that any CPU that
-	 * sees the new value of sp->completed will also see any preceding
-	 * changes to data structures made by this CPU.  This prevents
-	 * some other CPU from reordering the accesses in its SRCU
-	 * read-side critical section to precede the corresponding
-	 * srcu_read_lock() -- ensuring that such references will in
-	 * fact be protected.
-	 *
-	 * So it is now safe to do the flip.
-	 */
+	head->func = func;
 
-	idx = sp->completed & 0x1;
-	sp->completed++;
+	raw_spin_lock_irqsave(&sp->lock, flags);
+	idx = sp->completed & 1;
+	barrier(); /* look at sp->completed once */
+	head->next = sp->pending[idx];
+	sp->pending[idx] = head;
+
+	if (sp->state == srcu_idle) {
+		sp->state = srcu_sync_1;
+		call_rcu_sched(&sp->head, do_srcu_state_rcu);
+	}
+	raw_spin_unlock_irqrestore(&sp->lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
 
-	sync_func();  /* Force memory barrier on all CPUs. */
+struct srcu_waiter {
+	struct completion wait;
+	struct rcu_head head;
+};
 
-	/*
-	 * At this point, because of the preceding synchronize_sched(),
-	 * all srcu_read_lock() calls using the old counters have completed.
-	 * Their corresponding critical sections might well be still
-	 * executing, but the srcu_read_lock() primitives themselves
-	 * will have finished executing.  We initially give readers
-	 * an arbitrarily chosen 10 microseconds to get out of their
-	 * SRCU read-side critical sections, then loop waiting 1/HZ
-	 * seconds per iteration.  The 10-microsecond value has done
-	 * very well in testing.
-	 */
-
-	if (srcu_readers_active_idx(sp, idx))
-		udelay(SYNCHRONIZE_SRCU_READER_DELAY);
-	while (srcu_readers_active_idx(sp, idx))
-		schedule_timeout_interruptible(1);
-
-	sync_func();  /* Force memory barrier on all CPUs. */
-
-	/*
-	 * The preceding synchronize_sched() forces all srcu_read_unlock()
-	 * primitives that were executing concurrently with the preceding
-	 * for_each_possible_cpu() loop to have completed by this point.
-	 * More importantly, it also forces the corresponding SRCU read-side
-	 * critical sections to have also completed, and the corresponding
-	 * references to SRCU-protected data items to be dropped.
-	 *
-	 * Note:
-	 *
-	 *	Despite what you might think at first glance, the
-	 *	preceding synchronize_sched() -must- be within the
-	 *	critical section ended by the following mutex_unlock().
-	 *	Otherwise, a task taking the early exit can race
-	 *	with a srcu_read_unlock(), which might have executed
-	 *	just before the preceding srcu_readers_active() check,
-	 *	and whose CPU might have reordered the srcu_read_unlock()
-	 *	with the preceding critical section.  In this case, there
-	 *	is nothing preventing the synchronize_sched() task that is
-	 *	taking the early exit from freeing a data structure that
-	 *	is still being referenced (out of order) by the task
-	 *	doing the srcu_read_unlock().
-	 *
-	 *	Alternatively, the comparison with "2" on the early exit
-	 *	could be changed to "3", but this increases synchronize_srcu()
-	 *	latency for bulk loads.  So the current code is preferred.
-	 */
+static void synchronize_srcu_complete(struct rcu_head *head)
+{
+	struct srcu_waiter *waiter = container_of(head, struct srcu_waiter, head);
 
-	mutex_unlock(&sp->mutex);
+	complete(&waiter->wait);
 }
 
 /**
  * synchronize_srcu - wait for prior SRCU read-side critical-section completion
  * @sp: srcu_struct with which to synchronize.
  *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
- *
  * Note that it is illegal to call synchronize_srcu() from the corresponding
  * SRCU read-side critical section; doing so will result in deadlock.
  * However, it is perfectly legal to call synchronize_srcu() on one
@@ -275,41 +350,12 @@ static void __synchronize_srcu(struct sr
  */
 void synchronize_srcu(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, synchronize_sched);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu);
+	struct srcu_waiter waiter = {
+		.wait = COMPLETION_INITIALIZER_ONSTACK(waiter.wait),
+	};
 
-/**
- * synchronize_srcu_expedited - like synchronize_srcu, but less patient
- * @sp: srcu_struct with which to synchronize.
- *
- * Flip the completed counter, and wait for the old count to drain to zero.
- * As with classic RCU, the updater must use some separate means of
- * synchronizing concurrent updates.  Can block; must be called from
- * process context.
- *
- * Note that it is illegal to call synchronize_srcu_expedited()
- * from the corresponding SRCU read-side critical section; doing so
- * will result in deadlock.  However, it is perfectly legal to call
- * synchronize_srcu_expedited() on one srcu_struct from some other
- * srcu_struct's read-side critical section.
- */
-void synchronize_srcu_expedited(struct srcu_struct *sp)
-{
-	__synchronize_srcu(sp, synchronize_sched_expedited);
-}
-EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
-
-/**
- * srcu_batches_completed - return batches completed.
- * @sp: srcu_struct on which to report batch completion.
- *
- * Report the number of batches, correlated with, but not necessarily
- * precisely the same as, the number of grace periods that have elapsed.
- */
+	call_srcu(sp, &waiter.head, synchronize_srcu_complete);
 
-long srcu_batches_completed(struct srcu_struct *sp)
-{
-	return sp->completed;
+	wait_for_completion(&waiter.wait);
 }
-EXPORT_SYMBOL_GPL(srcu_batches_completed);
+EXPORT_SYMBOL_GPL(synchronize_srcu);



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-01-31 13:32 [RFC][PATCH] srcu: Implement call_srcu() Peter Zijlstra
@ 2012-01-31 13:47 ` Avi Kivity
  2012-01-31 13:50   ` Peter Zijlstra
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2012-01-31 13:47 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On 01/31/2012 03:32 PM, Peter Zijlstra wrote:
> Subject: srcu: Implement call_srcu()
> From: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Date: Mon Jan 30 23:20:49 CET 2012
>
> Implement call_srcu() by using a state machine driven by
> call_rcu_sched() and timer callbacks.
>
> The state machine is a direct derivation of the existing
> synchronize_srcu() code and replaces synchronize_sched() calls with a
> call_rcu_sched() callback and the schedule_timeout() calls with simple
> timer callbacks.
>
> It then re-implements synchronize_srcu() using a completion where we
> send the complete through call_srcu().
>
> It completely wrecks synchronize_srcu_expedited() which is only used
> by KVM. 3 of the 5 

2 of the 5

> use cases look like they really want to use
> call_srcu() instead, the remaining 2 I don't know but hope they can,
> which would let us remove it.

They really need to return quickly to userspace, and they really need to
perform some operation between rcu_assign_pointer() and returning, so no.

>
> Compile tested only!! :-)
>

How much did synchronize_srcu_expedited() regress?  Presumably your
compiler didn't tell you that.

Can we get it back to speed by scheduling a work function on all cpus? 
wouldn't that force a quiescent state and allow call_srcu() to fire?

In kvm's use case synchronize_srcu_expedited() is usually called when no
thread is in a critical section, so we don't have to wait for anything
except the srcu machinery.
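
Something along these lines, perhaps (sketch only, names made up;
schedule_on_each_cpu() runs a work item on every CPU and waits for all
of them, which forces a sched quiescent state everywhere):

  /* Illustrative only: kick every CPU through the scheduler. */
  static void srcu_expedite_qs(struct work_struct *work)
  {
	/* Merely running is the quiescent state we need. */
  }

  static void force_sched_quiescent_states(void)
  {
	schedule_on_each_cpu(srcu_expedite_qs);
  }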

-- 
error compiling committee.c: too many arguments to function



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-01-31 13:47 ` Avi Kivity
@ 2012-01-31 13:50   ` Peter Zijlstra
  2012-01-31 22:24     ` Paul E. McKenney
  0 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2012-01-31 13:50 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Paul E. McKenney, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On Tue, 2012-01-31 at 15:47 +0200, Avi Kivity wrote:

> They really need to return quickly to userspace, and they really need to
> perform some operation between rcu_assign_pointer() and returning, so no.

Bugger :/

> >
> > Compile tested only!! :-)
> >
> 
> How much did synchronize_srcu_expedited() regress?  Presumably your
> compiler didn't tell you that.

Nope, quite a lot I figure.

> Can we get it back to speed by scheduling a work function on all cpus? 
> wouldn't that force a quiescent state and allow call_srcu() to fire?
> 
> In kvm's use case synchronize_srcu_expedited() is usually called when no
> thread is in a critical section, so we don't have to wait for anything
> except the srcu machinery.

OK, I'll try and come up with means of making it go fast again ;-)


* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-01-31 13:50   ` Peter Zijlstra
@ 2012-01-31 22:24     ` Paul E. McKenney
  2012-02-01 10:22       ` Peter Zijlstra
  0 siblings, 1 reply; 23+ messages in thread
From: Paul E. McKenney @ 2012-01-31 22:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Avi Kivity, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On Tue, Jan 31, 2012 at 02:50:07PM +0100, Peter Zijlstra wrote:
> On Tue, 2012-01-31 at 15:47 +0200, Avi Kivity wrote:
> 
> > They really need to return quickly to userspace, and they really need to
> > perform some operation between rcu_assign_pointer() and returning, so no.
> 
> Bugger :/
> 
> > >
> > > Compile tested only!! :-)
> > >
> > 
> > How much did synchronize_srcu_expedited() regress?  Presumably your
> > compiler didn't tell you that.
> 
> Nope, quite a lot I figure.
> 
> > Can we get it back to speed by scheduling a work function on all cpus? 
> > wouldn't that force a quiescent state and allow call_srcu() to fire?
> > 
> > In kvm's use case synchronize_srcu_expedited() is usually called when no
> > thread is in a critical section, so we don't have to wait for anything
> > except the srcu machinery.
> 
> OK, I'll try and come up with means of making it go fast again ;-)

I cannot resist suggesting a kthread to do the call_srcu(), which
would allow synchronize_srcu_expedited() to proceed with its current
brute-force speed.

							Thanx, Paul



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-01-31 22:24     ` Paul E. McKenney
@ 2012-02-01 10:22       ` Peter Zijlstra
  2012-02-01 10:44         ` Avi Kivity
  2012-02-01 14:07         ` Paul E. McKenney
  0 siblings, 2 replies; 23+ messages in thread
From: Peter Zijlstra @ 2012-02-01 10:22 UTC (permalink / raw)
  To: paulmck
  Cc: Avi Kivity, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On Tue, 2012-01-31 at 14:24 -0800, Paul E. McKenney wrote:

> > > Can we get it back to speed by scheduling a work function on all cpus? 
> > > wouldn't that force a quiescent state and allow call_srcu() to fire?
> > > 
> > > In kvm's use case synchronize_srcu_expedited() is usually called when no
> > > thread is in a critical section, so we don't have to wait for anything
> > > except the srcu machinery.
> > 
> > OK, I'll try and come up with means of making it go fast again ;-)
> 
> I cannot resist suggesting a kthread to do the call_srcu(), which
> would allow synchronize_srcu_expedited() to proceed with its current
> brute-force speed.

Right, so I really don't like to add a kthread per srcu instance.
Sharing a kthread between all SRCUs will be problematic since these sync
things can take forever and so the thread will become a bottleneck.

Also, I'd really like to come up with a better means of sync for SRCU
and not hammer the entire machine (3 times).

One of the things I was thinking of is adding a sequence counter in the
per-cpu data. Using that we could do something like:

  unsigned int seq1 = 0, seq2 = 0, count = 0;
  int cpu, idx;

  idx = ACCESS_ONCE(sp->completions) & 1;

  for_each_possible_cpu(cpu)
	seq1 += per_cpu(sp->per_cpu_ref, cpu)->seq;

  for_each_possible_cpu(cpu)
	count += per_cpu(sp->per_cpu_ref, cpu)->c[idx];

  for_each_possible_cpu(cpu)
	seq2 += per_cpu(sp->per_cpu_ref, cpu)->seq;

  /*
   * there's no active references and no activity, we pass
   */
  if (seq1 == seq2 && count == 0)
	return;

  synchronize_srcu_slow();


This would add a fast-path which should catch the case Avi outlined
where we call sync_srcu() when there's no other SRCU activity.
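
The read side would then also bump the new counter; roughly (sketch
only, assuming struct srcu_struct_array grows a 'seq' member, which
this patch does not add):

  int __srcu_read_lock(struct srcu_struct *sp)
  {
	struct srcu_struct_array *sap;
	int idx;

	preempt_disable();
	idx = ACCESS_ONCE(sp->completed) & 0x1;
	barrier(); /* read sp->completed exactly once */
	sap = this_cpu_ptr(sp->per_cpu_ref);
	sap->c[idx]++;
	sap->seq++;
	preempt_enable();
	return idx;
  }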

The other thing I was hoping to be able to pull off is to add a copy of idx
into the same cacheline as c[] and abuse cache-coherency to avoid some
of the sync_sched() calls, but that's currently hurting my brain.



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 10:22       ` Peter Zijlstra
@ 2012-02-01 10:44         ` Avi Kivity
  2012-02-01 10:49           ` Avi Kivity
  2012-02-01 14:07         ` Paul E. McKenney
  1 sibling, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2012-02-01 10:44 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On 02/01/2012 12:22 PM, Peter Zijlstra wrote:
> One of the things I was thinking of is adding a sequence counter in the
> per-cpu data. Using that we could do something like:
>
>   unsigned int seq1 = 0, seq2 = 0, count = 0;
>   int cpu, idx;
>
>   idx = ACCESS_ONCE(sp->completions) & 1;
>
>   for_each_possible_cpu(cpu)
> 	seq1 += per_cpu(sp->per_cpu_ref, cpu)->seq;
>
>   for_each_possible_cpu(cpu)
> 	count += per_cpu(sp->per_cpu_ref, cpu)->c[idx];
>
>   for_each_possible_cpu(cpu)
> 	seq2 += per_cpu(sp->per_cpu_ref, cpu)->seq;
>
>   /*
>    * there's no active references and no activity, we pass
>    */
>   if (seq1 == seq2 && count == 0)
> 	return;
>
>   synchronize_srcu_slow();
>
>
> This would add a fast-path which should catch the case Avi outlined
> where we call sync_srcu() when there's no other SRCU activity.

Sorry, I was inaccurate.  In two of the cases indeed we don't expect
guest activity, and we're okay with waiting a bit if there is guest
activity - when we're altering the guest physical memory map.  But the
third case does have concurrent guest activity with
synchronize_srcu_expedited() and we still need it fast - that's when
userspace reads the dirty bitmap log of a running guest and replaces it
with a new bitmap.

There may be a way to convert it to call_srcu() though.  Without
synchronize_srcu_expedited(), kvm sees both the old and the new bitmaps,
but that's fine, since the dirty bits will go *somewhere*, and we can
pick them up later in call_srcu().  The only problem is if this is the
very last call to kvm_vm_ioctl_get_dirty_log(), and the callback
triggers after it returns - we end up with a bag of bits with not one to
return them to.  Maybe we can detect this conditions (all vcpus ought to
be stopped), and do something like:


   if (all vcpus stopped) {
        /* no activity, this should be fast */
        synchronize_srcu()
        /* collect and return bits */
   } else {
        call_srcu(collect bits)
   }

still a snag - we can't reliably detect that all vcpus are stopped, they
may be just resting in userspace, and restart while synchronize_srcu()
is running.

Marcelo?

-- 
error compiling committee.c: too many arguments to function



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 10:44         ` Avi Kivity
@ 2012-02-01 10:49           ` Avi Kivity
  2012-02-01 11:00             ` Takuya Yoshikawa
  2012-02-01 13:50             ` Marcelo Tosatti
  0 siblings, 2 replies; 23+ messages in thread
From: Avi Kivity @ 2012-02-01 10:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On 02/01/2012 12:44 PM, Avi Kivity wrote:
> On 02/01/2012 12:22 PM, Peter Zijlstra wrote:
> > One of the things I was thinking of is adding a sequence counter in the
> > per-cpu data. Using that we could do something like:
> >
> >   unsigned int seq1 = 0, seq2 = 0, count = 0;
> >   int cpu, idx;
> >
> >   idx = ACCESS_ONCE(sp->completions) & 1;
> >
> >   for_each_possible_cpu(cpu)
> > 	seq1 += per_cpu(sp->per_cpu_ref, cpu)->seq;
> >
> >   for_each_possible_cpu(cpu)
> > 	count += per_cpu(sp->per_cpu_ref, cpu)->c[idx];
> >
> >   for_each_possible_cpu(cpu)
> > 	seq2 += per_cpu(sp->per_cpu_ref, cpu)->seq;
> >
> >   /*
> >    * there's no active references and no activity, we pass
> >    */
> >   if (seq1 == seq2 && count == 0)
> > 	return;
> >
> >   synchronize_srcu_slow();
> >
> >
> > This would add a fast-path which should catch the case Avi outlined
> > where we call sync_srcu() when there's no other SRCU activity.
>
> Sorry, I was inaccurate.  In two of the cases indeed we don't expect
> guest activity, and we're okay with waiting a bit if there is guest
> activity - when we're altering the guest physical memory map.  But the
> third case does have concurrent guest activity with
> synchronize_srcu_expedited() and we still need it fast - that's when
> userspace reads the dirty bitmap log of a running guest and replaces it
> with a new bitmap.
>
> There may be a way to convert it to call_srcu() though.  Without
> synchronize_srcu_expedited(), kvm sees both the old and the new bitmaps,
> but that's fine, since the dirty bits will go *somewhere*, and we can
> pick them up later in call_srcu().  The only problem is if this is the
> very last call to kvm_vm_ioctl_get_dirty_log(), and the callback
> triggers after it returns - we end up with a bag of bits with not one to
> return them to.  Maybe we can detect this conditions (all vcpus ought to
> be stopped), and do something like:
>
>
>    if (all vcpus stopped) {
>         /* no activity, this should be fast */
>         synchronize_srcu()
>         /* collect and return bits */
>    } else {
>         call_srcu(collect bits)
>    }
>
> still a snag - we can't reliably detect that all vcpus are stopped, they
> may be just resting in userspace, and restart while synchronize_srcu()
> is running.
>
> Marcelo?
>

Or something completely different - we can remove srcu from the equation
completely in this case.  Use just one bitmap (so no
rcu_assign_pointer), and use atomic operations to copy and clear:

  word = bitmap[i]
  put_user(word)
  atomic_and(&bitmap[i], ~word)
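
Spelled out a bit (sketch only; 'bitmap', 'user_bitmap' and
'dirty_bitmap_longs' are made-up names, and a cmpxchg() loop stands in
for whatever atomic and-not primitive the architecture offers), so that
bits set by vcpus after the snapshot survive the clear:

  unsigned long word, old, new;
  int i;

  for (i = 0; i < dirty_bitmap_longs; i++) {
	word = bitmap[i];
	if (!word)
		continue;
	if (put_user(word, &user_bitmap[i]))
		return -EFAULT;
	/* clear only the bits we just reported */
	do {
		old = bitmap[i];
		new = old & ~word;
	} while (cmpxchg(&bitmap[i], old, new) != old);
  }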


-- 
error compiling committee.c: too many arguments to function



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 10:49           ` Avi Kivity
@ 2012-02-01 11:00             ` Takuya Yoshikawa
  2012-02-01 11:01               ` Avi Kivity
  2012-02-01 13:50             ` Marcelo Tosatti
  1 sibling, 1 reply; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-01 11:00 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

(2012/02/01 19:49), Avi Kivity wrote:
> On 02/01/2012 12:44 PM, Avi Kivity wrote:
>> On 02/01/2012 12:22 PM, Peter Zijlstra wrote:
>>> One of the things I was thinking of is adding a sequence counter in the
>>> per-cpu data. Using that we could do something like:
>>>
>>>    unsigned int seq1 = 0, seq2 = 0, count = 0;
>>>    int cpu, idx;
>>>
>>>    idx = ACCESS_ONCE(sp->completions)&  1;
>>>
>>>    for_each_possible_cpu(cpu)
>>> 	seq1 += per_cpu(sp->per_cpu_ref, cpu)->seq;
>>>
>>>    for_each_possible_cpu(cpu)
>>> 	count += per_cpu(sp->per_cpu_ref, cpu)->c[idx];
>>>
>>>    for_each_possible_cpu(cpu)
>>> 	seq2 += per_cpu(sp->per_cpu_ref, cpu)->seq;
>>>
>>>    /*
>>>     * there's no active references and no activity, we pass
>>>     */
>>>    if (seq1 == seq2&&  count == 0)
>>> 	return;
>>>
>>>    synchronize_srcu_slow();
>>>
>>>
>>> This would add a fast-path which should catch the case Avi outlined
>>> where we call sync_srcu() when there's no other SRCU activity.
>>
>> Sorry, I was inaccurate.  In two of the cases indeed we don't expect
>> guest activity, and we're okay with waiting a bit if there is guest
>> activity - when we're altering the guest physical memory map.  But the
>> third case does have concurrent guest activity with
>> synchronize_srcu_expedited() and we still need it fast - that's when
>> userspace reads the dirty bitmap log of a running guest and replaces it
>> with a new bitmap.
>>
>> There may be a way to convert it to call_srcu() though.  Without
>> synchronize_srcu_expedited(), kvm sees both the old and the new bitmaps,
>> but that's fine, since the dirty bits will go *somewhere*, and we can
>> pick them up later in call_srcu().  The only problem is if this is the
>> very last call to kvm_vm_ioctl_get_dirty_log(), and the callback
>> triggers after it returns - we end up with a bag of bits with not one to
>> return them to.  Maybe we can detect this conditions (all vcpus ought to
>> be stopped), and do something like:
>>
>>
>>     if (all vcpus stopped) {
>>          /* no activity, this should be fast */
>>          synchronize_srcu()
>>          /* collect and return bits */
>>     } else {
>>          call_srcu(collect bits)
>>     }
>>
>> still a snag - we can't reliably detect that all vcpus are stopped, they
>> may be just resting in userspace, and restart while synchronize_srcu()
>> is running.
>>
>> Marcelo?
>>
>
> Or something completely different - we can remove srcu from the equation
> completely in this case.  Use just one bitmap (so no

I am already testing various possibilities like this.
For VGA, using clear_bit() (+ rmap write protect) works well!

> rcu_assign_pointer), and use atomic operations to copy and clear:
>
>    word = bitmap[i]
>    put_user(word)
>    atomic_and(&bitmap[i], ~word)
>
>

This kind of thing was really slow IIRC.


How about just doing:

take a spin_lock
copy the entire (or some portions of) bitmap locally
clear the bitmap
unlock

write protect the dirty pages based on the copied dirty data

copy_to_user
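
In rough C, the above might look like (sketch only; 'snapshot' and
write_protect_dirty_pages() are made-up names, and mmu_lock stands in
for the spin lock):

  spin_lock(&kvm->mmu_lock);
  memcpy(snapshot, memslot->dirty_bitmap, n);
  memset(memslot->dirty_bitmap, 0, n);
  spin_unlock(&kvm->mmu_lock);

  /* write protect the pages marked dirty in the snapshot */
  write_protect_dirty_pages(kvm, memslot, snapshot);

  if (copy_to_user(log->dirty_bitmap, snapshot, n))
	return -EFAULT;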



I can show you some performance numbers, this weekend, if you like.

	Takuya


* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 11:00             ` Takuya Yoshikawa
@ 2012-02-01 11:01               ` Avi Kivity
  2012-02-01 11:12                 ` Takuya Yoshikawa
  2012-02-01 13:43                 ` Marcelo Tosatti
  0 siblings, 2 replies; 23+ messages in thread
From: Avi Kivity @ 2012-02-01 11:01 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

On 02/01/2012 01:00 PM, Takuya Yoshikawa wrote:
>
>> rcu_assign_pointer), and use atomic operations to copy and clear:
>>
>>    word = bitmap[i]
>>    put_user(word)
>>    atomic_and(&bitmap[i], ~word)
>>
>>
>
> This kind of this was really slow IIRC.
>
>
> How about just doing:
>
> take a spin_lock
> copy the entire (or some portions of) bitmap locally
> clear the bitmap
> unlock
>

That means that vcpus dirtying memory also have to take that lock, and
spin while the bitmap is being copied.  So kvm_vm_ioctl_get_dirty_log()
will become faster, at the expense of vcpus, which I think is a bad
tradeoff.

> write protect the dirty pages based on the copied dirty data
>
> copy_to_user
>
>
>
> I can show you some performance numbers, this weekend, if you like.

That'll be great, numbers are better than speculation.

-- 
error compiling committee.c: too many arguments to function



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 11:01               ` Avi Kivity
@ 2012-02-01 11:12                 ` Takuya Yoshikawa
  2012-02-01 13:24                   ` Avi Kivity
  2012-02-01 13:43                 ` Marcelo Tosatti
  1 sibling, 1 reply; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-01 11:12 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

(2012/02/01 20:01), Avi Kivity wrote:
> On 02/01/2012 01:00 PM, Takuya Yoshikawa wrote:

>> How about just doing:
>>
>> take a spin_lock
>> copy the entire (or some portions of) bitmap locally
>> clear the bitmap
>> unlock
>>
>
> That means that vcpus dirtying memory also have to take that lock, and
> spin while the bitmap is being copied.  So kvm_vm_ioctl_get_dirty_log()
> will become faster, at the expense of vcpus, which I think is a bad
> tradeoff.

Almost every caller is already holding the mmu_lock.

Isn't it possible to make others hold the lock only for the case of
marking to the bitmap? (I will check myself too.)


>> write protect the dirty pages based on the copied dirty data
>>
>> copy_to_user
>>
>>
>>
>> I can show you some performance numbers, this weekend, if you like.
>
> That'll be great, numbers are better than speculation.
>

Yes, I already have some good numbers to show (and some patches).

	Takuya


* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 11:12                 ` Takuya Yoshikawa
@ 2012-02-01 13:24                   ` Avi Kivity
  2012-02-02  5:46                     ` [test result] dirty logging without srcu update -- " Takuya Yoshikawa
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2012-02-01 13:24 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

On 02/01/2012 01:12 PM, Takuya Yoshikawa wrote:
> (2012/02/01 20:01), Avi Kivity wrote:
>> On 02/01/2012 01:00 PM, Takuya Yoshikawa wrote:
>
>>> How about just doing:
>>>
>>> take a spin_lock
>>> copy the entire (or some portions of) bitmap locally
>>> clear the bitmap
>>> unlock
>>>
>>
>> That means that vcpus dirtying memory also have to take that lock, and
>> spin while the bitmap is being copied.  So kvm_vm_ioctl_get_dirty_log()
>> will become faster, at the expense of vcpus, which I think is a bad
>> tradeoff.
>
> Almost every caller is already holding the mmu_lock.

True.  But let's not add more locks.

>
> Isn't it possible to make others hold the lock only for the case of
> marking to the bitmap? (I will check myself too.)

Yes, in walk_addr(), while setting accessed and dirty bits in guest
PTEs, or while emulating instructions and writing to guest memory.

>> That'll be great, numbers are better than speculation.
>>
>
>
> Yes, I already have some good numbers to show (and some patches).

Looking forward.

-- 
error compiling committee.c: too many arguments to function



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 11:01               ` Avi Kivity
  2012-02-01 11:12                 ` Takuya Yoshikawa
@ 2012-02-01 13:43                 ` Marcelo Tosatti
  2012-02-01 15:42                   ` Takuya Yoshikawa
  1 sibling, 1 reply; 23+ messages in thread
From: Marcelo Tosatti @ 2012-02-01 13:43 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Takuya Yoshikawa, Peter Zijlstra, paulmck, Oleg Nesterov,
	linux-kernel, KVM list

On Wed, Feb 01, 2012 at 01:01:38PM +0200, Avi Kivity wrote:
> On 02/01/2012 01:00 PM, Takuya Yoshikawa wrote:
> >
> >> rcu_assign_pointer), and use atomic operations to copy and clear:
> >>
> >>    word = bitmap[i]
> >>    put_user(word)
> >>    atomic_and(&bitmap[i], ~word)
> >>
> >>
> >
> > This kind of this was really slow IIRC.
> >
> >
> > How about just doing:
> >
> > take a spin_lock
> > copy the entire (or some portions of) bitmap locally
> > clear the bitmap
> > unlock
> >
> 
> That means that vcpus dirtying memory also have to take that lock, and
> spin while the bitmap is being copied.  So kvm_vm_ioctl_get_dirty_log()
> will become faster, at the expense of vcpus, which I think is a bad
> tradeoff.
> 
> > write protect the dirty pages based on the copied dirty data
> >
> > copy_to_user
> >
> >
> >
> > I can show you some performance numbers, this weekend, if you like.
> 
> That'll be great, numbers are better than speculation.

  get dirty log:    5634134 ns for     262144 dirty pages

5ms (for the entire operation).
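
(That is roughly 5,634,134 ns / 262,144 pages, about 21.5 ns per dirty
page.)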



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 10:49           ` Avi Kivity
  2012-02-01 11:00             ` Takuya Yoshikawa
@ 2012-02-01 13:50             ` Marcelo Tosatti
  1 sibling, 0 replies; 23+ messages in thread
From: Marcelo Tosatti @ 2012-02-01 13:50 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel, KVM list

On Wed, Feb 01, 2012 at 12:49:57PM +0200, Avi Kivity wrote:
> On 02/01/2012 12:44 PM, Avi Kivity wrote:
> > On 02/01/2012 12:22 PM, Peter Zijlstra wrote:
> > > One of the things I was thinking of is adding a sequence counter in the
> > > per-cpu data. Using that we could do something like:
> > >
> > >   unsigned int seq1 = 0, seq2 = 0, count = 0;
> > >   int cpu, idx;
> > >
> > >   idx = ACCESS_ONCE(sp->completions) & 1;
> > >
> > >   for_each_possible_cpu(cpu)
> > > 	seq1 += per_cpu(sp->per_cpu_ref, cpu)->seq;
> > >
> > >   for_each_possible_cpu(cpu)
> > > 	count += per_cpu(sp->per_cpu_ref, cpu)->c[idx];
> > >
> > >   for_each_possible_cpu(cpu)
> > > 	seq2 += per_cpu(sp->per_cpu_ref, cpu)->seq;
> > >
> > >   /*
> > >    * there's no active references and no activity, we pass
> > >    */
> > >   if (seq1 == seq2 && count == 0)
> > > 	return;
> > >
> > >   synchronize_srcu_slow();
> > >
> > >
> > > This would add a fast-path which should catch the case Avi outlined
> > > where we call sync_srcu() when there's no other SRCU activity.
> >
> > Sorry, I was inaccurate.  In two of the cases indeed we don't expect
> > guest activity, and we're okay with waiting a bit if there is guest
> > activity - when we're altering the guest physical memory map.  But the
> > third case does have concurrent guest activity with
> > synchronize_srcu_expedited() and we still need it fast - that's when
> > userspace reads the dirty bitmap log of a running guest and replaces it
> > with a new bitmap.
> >
> > There may be a way to convert it to call_srcu() though.  Without
> > synchronize_srcu_expedited(), kvm sees both the old and the new bitmaps,
> > but that's fine, since the dirty bits will go *somewhere*, and we can
> > pick them up later in call_srcu().  The only problem is if this is the
> > very last call to kvm_vm_ioctl_get_dirty_log(), and the callback
> > triggers after it returns - we end up with a bag of bits with not one to
> > return them to.  Maybe we can detect this conditions (all vcpus ought to
> > be stopped), and do something like:
> >
> >
> >    if (all vcpus stopped) {
> >         /* no activity, this should be fast */
> >         synchronize_srcu()
> >         /* collect and return bits */
> >    } else {
> >         call_srcu(collect bits)
> >    }
> >
> > still a snag - we can't reliably detect that all vcpus are stopped, they
> > may be just resting in userspace, and restart while synchronize_srcu()
> > is running.
> >
> > Marcelo?
> >

VGABIOS mode constantly destroys and creates the 0xa0000 slot, so
performance is required for KVM_SET_MEM too (it can probably be fixed in
qemu, but older qemu versions must be supported).

For KVM it would be better for synchronize_srcu_expedited() to remain
relatively fast (think heavy users of GET_DIRTY_LOG such as Kemari),
even if slower than it is today.

synchronize_srcu() takes 10ms or so.

> Or something completely different - we can remove srcu from the equation
> completely in this case.  Use just one bitmap (so no
> rcu_assign_pointer), and use atomic operations to copy and clear:
> 
>   word = bitmap[i]
>   put_user(word)
>   atomic_and(&bitmap[i], ~word)
> 
> 
> -- 
> error compiling committee.c: too many arguments to function


* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 10:22       ` Peter Zijlstra
  2012-02-01 10:44         ` Avi Kivity
@ 2012-02-01 14:07         ` Paul E. McKenney
  1 sibling, 0 replies; 23+ messages in thread
From: Paul E. McKenney @ 2012-02-01 14:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Avi Kivity, Oleg Nesterov, linux-kernel, Marcelo Tosatti, KVM list

On Wed, Feb 01, 2012 at 11:22:29AM +0100, Peter Zijlstra wrote:
> On Tue, 2012-01-31 at 14:24 -0800, Paul E. McKenney wrote:
> 
> > > > Can we get it back to speed by scheduling a work function on all cpus? 
> > > > wouldn't that force a quiescent state and allow call_srcu() to fire?
> > > > 
> > > > In kvm's use case synchronize_srcu_expedited() is usually called when no
> > > > thread is in a critical section, so we don't have to wait for anything
> > > > except the srcu machinery.
> > > 
> > > OK, I'll try and come up with means of making it go fast again ;-)
> > 
> > I cannot resist suggesting a kthread to do the call_srcu(), which
> > would allow synchronize_srcu_expedited() to proceed with its current
> > brute-force speed.
> 
> Right, so I really don't like to add a kthread per srcu instance.
> Sharing a kthread between all SRCUs will be problematic since these sync
> things can take forever and so the thread will become a bottlneck.

It should be possible to substitute call_rcu_sched() for synchronize_sched()
and use a per-SRCU-instance state machine to keep a given instance from
blocking progress on the other instances.

> Also, I'd really like to come up with a better means of sync for SRCU
> and not hammer the entire machine (3 times).

We could introduce read-side smp_mb()s and reduce to one machine-hammer
straightforwardly.  It should be possible to make it so that if a given
srcu_struct ever uses synchronize_srcu_expedited(), it is required to
use the slower version of the read-side primitives.
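
Very roughly, the heavier read-side variant might look like this
(sketch only, not a proposal):

  int __srcu_read_lock_mb(struct srcu_struct *sp)
  {
	int idx;

	preempt_disable();
	idx = ACCESS_ONCE(sp->completed) & 0x1;
	this_cpu_ptr(sp->per_cpu_ref)->c[idx]++;
	smp_mb(); /* order the increment before the critical section */
	preempt_enable();
	return idx;
  }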

Hmmm..  I should go back and take a look at the 8 or 9 variants that
I posted to LKML when I first worked on this.

> One of the things I was thinking of is adding a sequence counter in the
> per-cpu data. Using that we could do something like:
> 
>   unsigned int seq1 = 0, seq2 = 0, count = 0;
>   int cpu, idx;
> 
>   idx = ACCESS_ONCE(sp->completions) & 1;
> 
>   for_each_possible_cpu(cpu)
> 	seq1 += per_cpu(sp->per_cpu_ref, cpu)->seq;
> 
>   for_each_possible_cpu(cpu)
> 	count += per_cpu(sp->per_cpu_ref, cpu)->c[idx];
> 
>   for_each_possible_cpu(cpu)
> 	seq2 += per_cpu(sp->per_cpu_ref, cpu)->seq;
> 
>   /*
>    * there's no active references and no activity, we pass
>    */
>   if (seq1 == seq2 && count == 0)
> 	return;
> 
>   synchronize_srcu_slow();
> 
> 
> This would add a fast-path which should catch the case Avi outlined
> where we call sync_srcu() when there's no other SRCU activity.

Overflow could give us false positives for workloads with frequent
readers.  Yes, I know you don't like 32-bit systems, but SRCU still has
to work on them.  ;-)

We also need some smp_mb() calls for readers and in the above sequence
for this to have half a chance of working.

> The other thing I was hoping to be able to pull off is add a copy of idx
> into the same cacheline as c[] and abuse cache-coherency to avoid some
> of the sync_sched() calls, but that's currently hurting my brain.

I will check with some hardware architects.  Last I knew, this was not
officially supported, but it cannot hurt to ask again.

I suppose that another approach would be to use bitfields.  Fifteen bits
should be enough for each of the counters, but ouch!  Atomic instructions
would be required to make incrementing all the idx bits safe.

							Thanx, Paul



* Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 13:43                 ` Marcelo Tosatti
@ 2012-02-01 15:42                   ` Takuya Yoshikawa
  0 siblings, 0 replies; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-01 15:42 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Avi Kivity, Takuya Yoshikawa, Peter Zijlstra, paulmck,
	Oleg Nesterov, linux-kernel, KVM list

On Wed, 1 Feb 2012 11:43:47 -0200
Marcelo Tosatti <mtosatti@redhat.com> wrote:

> > > I can show you some performance numbers, this weekend, if you like.
> > 
> > That'll be great, numbers are better than speculation.
> 
>   get dirty log:    5634134 ns for     262144 dirty pages
> 
> 5ms (for the entire operation).
> 

I have mixed my personal patch series and Avi's comments, and ...

I am now getting really good improvements for every number of dirty pages.
(might be a dream, so checking now ... will soon report)

	Takuya


* [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-01 13:24                   ` Avi Kivity
@ 2012-02-02  5:46                     ` Takuya Yoshikawa
  2012-02-02 10:10                       ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-02  5:46 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

Avi Kivity <avi@redhat.com> wrote:

> >> That'll be great, numbers are better than speculation.
> >>
> >
> >
> > Yes, I already have some good numbers to show (and some patches).
> 
> Looking forward.

I made a patch to see if Avi's suggestion of getting rid of srcu
update for dirty logging is practical;  tested with my unit-test.

(I used a function to write protect a range of pages using rmap,
 which is itself useful for optimizing the current code.)

1. test result

on 32bit host (core i3 box)   // just for the unit-test ...
slot size: 256K pages (1GB memory)


Measured by dirty-log-perf (executed only once for each case)

Note: dirty pages are completely distributed
      (no locality: worst case for my patch?)

=========================================================
# of dirty pages:  kvm.git (ns),  with this patch (ns)
1:         102,077 ns      10,105 ns
2:          47,197 ns       9,395 ns
4:          43,563 ns       9,938 ns
8:          41,239 ns      10,618 ns
16:         42,988 ns      12,299 ns
32:         45,503 ns      14,298 ns
64:         50,915 ns      19,895 ns
128:        61,087 ns      29,260 ns
256:        81,007 ns      49,023 ns
512:       132,776 ns      86,670 ns
1024:      939,299 ns     131,496 ns
2048:      992,209 ns     250,429 ns
4096:      891,809 ns     479,280 ns
8192:    1,027,280 ns     906,971 ns
(until now pretty good)

(ah, for every 32-bit atomic clear mask ...)
16384:   1,270,972 ns   6,661,741 ns    //  1  1  1 ...  1
32768:   1,581,335 ns   9,673,985 ns    //  ...
65536:   2,161,604 ns  11,466,134 ns    //  ...
131072:  3,253,027 ns  13,412,954 ns    //  ...
262144:  5,663,002 ns  16,309,924 ns    // 31 31 31 ... 31
=========================================================

According to a 2005 usenix paper, WWS with an 8-second window was
about 50,000 pages for a high dirtying rate program.

Taking into account other possible gains from the WWS locality
of real workloads, these numbers are not so bad IMO.

Furthermore, the code was written for initial testing only and I did
not do any optimization:  I know what I should try.

So this seems worth more testing.


The new code also makes it possible to do fine-grained get dirty log.
Live migration can be done like this ??? (not sure yet):

	until the dirty rate becomes enough low
		get dirty log for the first 32K pages (partial return is OK)
		while sending
		get dirty log for the next 32K pages  (partial return is OK)
		while sending
		...
		get dirty log for the last 32K pages  (partial return is OK)

	stop the guest and get dirty log (but no need to write protect now)
	send the remaining pages

New API is needed for this as discussed before!


Thanks,
	Takuya



2. test patch

[PATCH for test only] KVM: dirty logging: avoid srcu and ...

VGA works normally but not tested enough yet.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
---
 arch/x86/include/asm/kvm_host.h |    2 +
 arch/x86/kvm/mmu.c              |   48 ++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |   55 ++++++++++++++++----------------------
 include/linux/kvm_host.h        |    7 +++++
 virt/kvm/kvm_main.c             |    7 +----
 5 files changed, 79 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4610166..e9bbef1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -684,6 +684,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_write_protect_range(struct kvm *kvm, struct kvm_memory_slot *slot,
+				gfn_t start, gfn_t end);
 int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 			       struct kvm_memory_slot *slot);
 void kvm_mmu_zap_all(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ae76cc3..62cc3a9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -688,8 +688,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 {
 	unsigned long idx;
 
-	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
-	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+	idx = gfn_to_index(gfn, slot->base_gfn, level);
 	return &slot->lpage_info[level - 2][idx];
 }
 
@@ -1011,6 +1010,51 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
+static void write_protect_range(struct kvm *kvm,
+				struct kvm_memory_slot *slot, int level,
+				gfn_t start, gfn_t end)
+{
+	unsigned long *rmapp;
+	u64 *spte;
+	gfn_t gfn;
+
+	if (level == PT_PAGE_TABLE_LEVEL) {
+		for (gfn = start; gfn < end; gfn++) {
+			rmapp = __gfn_to_rmap(gfn, level, slot);
+			spte = rmap_next(rmapp, NULL);
+			while (spte) {
+				if (is_writable_pte(*spte))
+					mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
+				spte = rmap_next(rmapp, spte);
+			}
+		}
+	} else {
+		for (gfn = start; gfn < end; gfn++) {
+			rmapp = __gfn_to_rmap(gfn, level, slot);
+			spte = rmap_next(rmapp, NULL);
+			while (spte) {
+				if (is_writable_pte(*spte)) {
+					drop_spte(kvm, spte);
+					--kvm->stat.lpages;
+					spte = NULL;
+				}
+				spte = rmap_next(rmapp, spte);
+			}
+		}
+	}
+}
+
+void kvm_mmu_write_protect_range(struct kvm *kvm, struct kvm_memory_slot *slot,
+				gfn_t start, gfn_t end)
+{
+	int i;
+
+	for (i = PT_PAGE_TABLE_LEVEL;
+	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+		write_protect_range(kvm, slot, i, start, end);
+	}
+}
+
 int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn,
 			       struct kvm_memory_slot *slot)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2bd77a3..becb571 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3049,7 +3049,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 {
 	int r;
 	struct kvm_memory_slot *memslot;
-	unsigned long n, nr_dirty_pages;
+	unsigned long n, i;
+	unsigned bits;
+	unsigned *dirty_bitmap;
+	unsigned __user *dirty_bitmap_user;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -3063,44 +3066,32 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		goto out;
 
 	n = kvm_dirty_bitmap_bytes(memslot);
-	nr_dirty_pages = memslot->nr_dirty_pages;
+	r = -EFAULT;
+	if (clear_user(log->dirty_bitmap, n))
+		goto out;
 
-	/* If nothing is dirty, don't bother messing with page tables. */
-	if (nr_dirty_pages) {
-		struct kvm_memslots *slots, *old_slots;
-		unsigned long *dirty_bitmap, *dirty_bitmap_head;
+	dirty_bitmap = (unsigned *)memslot->dirty_bitmap;
+	dirty_bitmap_user = (unsigned __user *)log->dirty_bitmap;
 
-		dirty_bitmap = memslot->dirty_bitmap;
-		dirty_bitmap_head = memslot->dirty_bitmap_head;
-		if (dirty_bitmap == dirty_bitmap_head)
-			dirty_bitmap_head += n / sizeof(long);
-		memset(dirty_bitmap_head, 0, n);
+	for (i = 0; i < n / sizeof(unsigned); i++) {
+		gfn_t start, end;
 
-		r = -ENOMEM;
-		slots = kmemdup(kvm->memslots, sizeof(*kvm->memslots), GFP_KERNEL);
-		if (!slots)
-			goto out;
-
-		memslot = id_to_memslot(slots, log->slot);
-		memslot->nr_dirty_pages = 0;
-		memslot->dirty_bitmap = dirty_bitmap_head;
-		update_memslots(slots, NULL);
+		if (!(bits = dirty_bitmap[i]))
+			continue;
 
-		old_slots = kvm->memslots;
-		rcu_assign_pointer(kvm->memslots, slots);
-		synchronize_srcu_expedited(&kvm->srcu);
-		kfree(old_slots);
+		start = memslot->base_gfn + 8 * sizeof(unsigned) * i;
+		end   = start;
+		start += __ffs(bits);
+		end   += __fls(bits) + 1;
+		atomic_clear_mask(bits, &dirty_bitmap[i]);
 
-		write_protect_slot(kvm, memslot, dirty_bitmap, nr_dirty_pages);
+		spin_lock(&kvm->mmu_lock);
+		kvm_mmu_write_protect_range(kvm, memslot, start, end);
+		spin_unlock(&kvm->mmu_lock);
 
-		r = -EFAULT;
-		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-			goto out;
-	} else {
-		r = -EFAULT;
-		if (clear_user(log->dirty_bitmap, n))
-			goto out;
+		__put_user(bits, &dirty_bitmap_user[i]);
 	}
+	kvm_flush_remote_tlbs(kvm);
 
 	r = 0;
 out:
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eada8e6..06d4e41 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -656,6 +656,13 @@ static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
 	return gfn_to_memslot(kvm, gfn)->id;
 }
 
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+	/* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
 static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
 					       gfn_t gfn)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9f32bff..e483ae4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -797,15 +797,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		int lpages;
 		int level = i + 2;
 
-		/* Avoid unused variable warning if no large pages */
-		(void)level;
-
 		if (new.lpage_info[i])
 			continue;
 
-		lpages = 1 + ((base_gfn + npages - 1)
-			     >> KVM_HPAGE_GFN_SHIFT(level));
-		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
+		lpages = gfn_to_index(base_gfn + npages - 1, base_gfn, level) + 1;
 
 		new.lpage_info[i] = vzalloc(lpages * sizeof(*new.lpage_info[i]));
 
-- 
1.7.5.4


* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02  5:46                     ` [test result] dirty logging without srcu update -- " Takuya Yoshikawa
@ 2012-02-02 10:10                       ` Avi Kivity
  2012-02-02 10:21                         ` Takuya Yoshikawa
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2012-02-02 10:10 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

On 02/02/2012 07:46 AM, Takuya Yoshikawa wrote:
> Avi Kivity <avi@redhat.com> wrote:
>
> > >> That'll be great, numbers are better than speculation.
> > >>
> > >
> > >
> > > Yes, I already have some good numbers to show (and some patches).
> > 
> > Looking forward.
>
> I made a patch to see if Avi's suggestion of getting rid of srcu
> update for dirty logging is practical;  tested with my unit-test.
>
> (I used a function to write protect a range of pages using rmap,
>  which is itself useful for optimizing the current code.)
>
> 1. test result
>
> on 32bit host (core i3 box)   // just for the unit-test ...
> slot size: 256K pages (1GB memory)
>
>
> Measured by dirty-log-perf (executed only once for each case)
>
> Note: dirty pages are completely distributed
>       (no locality: worst case for my patch?)
>
> =========================================================
> # of dirty pages:  kvm.git (ns),  with this patch (ns)
> 1:         102,077 ns      10,105 ns
> 2:          47,197 ns       9,395 ns
> 4:          43,563 ns       9,938 ns
> 8:          41,239 ns      10,618 ns
> 16:         42,988 ns      12,299 ns
> 32:         45,503 ns      14,298 ns
> 64:         50,915 ns      19,895 ns
> 128:        61,087 ns      29,260 ns
> 256:        81,007 ns      49,023 ns
> 512:       132,776 ns      86,670 ns
> 1024:      939,299 ns     131,496 ns
> 2048:      992,209 ns     250,429 ns
> 4096:      891,809 ns     479,280 ns
> 8192:    1,027,280 ns     906,971 ns
> (until now pretty good)
>
> (ah, for every 32-bit atomic clear mask ...)
> 16384:   1,270,972 ns   6,661,741 ns    //  1  1  1 ...  1
> 32768:   1,581,335 ns   9,673,985 ns    //  ...
> 65536:   2,161,604 ns  11,466,134 ns    //  ...
> 131072:  3,253,027 ns  13,412,954 ns    //  ...
> 262144:  5,663,002 ns  16,309,924 ns    // 31 31 31 ... 31
> =========================================================

On a 64-bit host, this will be twice as fast.  Or if we use cmpxchg16b,
and there are no surprises, four times as fast.  It will still be slower
than the original, but by a smaller margin.
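
(Roughly: the loop in the test patch does one atomic clear-mask, one
write-protect pass and one __put_user() per bitmap word, so the fully
dirty 262144-page case is 8192 iterations with 32-bit words, 4096 with
64-bit words, and 2048 if cmpxchg16b can consume 128 bits per iteration.)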

> According to a 2005 USENIX paper, the WWS with an 8-second window was
> about 50,000 pages for a program with a high dirtying rate.
>
> Taking into account other possible gains from the WWS locality
> of real workloads, these numbers are not so bad IMO.

I agree.

>
> Furthermore, the code was written for this initial test only and I
> have not done any optimization yet:  I know what I should try.
>
> So this seems worth more testing.
>
>
> The new code also makes it possible to do fine-grained get-dirty-log.
> Live migration could perhaps be done like this (not sure yet):
>
> 	until the dirty rate becomes low enough
> 		get dirty log for the first 32K pages (partial return is OK)
> 		while sending
> 		get dirty log for the next 32K pages  (partial return is OK)
> 		while sending
> 		...
> 		get dirty log for the last 32K pages  (partial return is OK)
>
> 	stop the guest and get dirty log (but no need to write protect now)
> 	send the remaining pages
>
> New API is needed for this as discussed before!

Yeah.  But I think we should switch to srcu-less dirty logs regardless. 
Here are your numbers, normalized by the number of dirty pages.

dirty pages    old (ns/page)    new (ns/page)
          1           102077            10105
          2            23599             4698
          4            10891             2485
          8             5155             1327
         16             2687              769
         32             1422              447
         64              796              311
        128              477              229
        256              316              191
        512              259              169
       1024              917              128
       2048              484              122
       4096              218              117
       8192              125              111
      16384               78              407
      32768               48              295
      65536               33              175
     131072               25              102
     262144               22               62
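
(Each entry is just the total latency from your table divided by the
dirty-page count, e.g. 6,661,741 ns / 16384 pages is about 407 ns/page.)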


Your worst case, when considering a reasonable number of dirty pages, is
407 ns/page, which is still lower than the time userspace will actually
spend processing each page, so it's reasonable.  The old method is often
a lot worse than your worst case, by this metric.



-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02 10:10                       ` Avi Kivity
@ 2012-02-02 10:21                         ` Takuya Yoshikawa
  2012-02-02 10:21                           ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-02 10:21 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

(2012/02/02 19:10), Avi Kivity wrote:

>>
>> =========================================================
>> # of dirty pages:  kvm.git (ns),  with this patch (ns)
>> 1:         102,077 ns      10,105 ns
>> 2:          47,197 ns       9,395 ns
>> 4:          43,563 ns       9,938 ns
>> 8:          41,239 ns      10,618 ns
>> 16:         42,988 ns      12,299 ns
>> 32:         45,503 ns      14,298 ns
>> 64:         50,915 ns      19,895 ns
>> 128:        61,087 ns      29,260 ns
>> 256:        81,007 ns      49,023 ns
>> 512:       132,776 ns      86,670 ns
>> 1024:      939,299 ns     131,496 ns
>> 2048:      992,209 ns     250,429 ns
>> 4096:      891,809 ns     479,280 ns
>> 8192:    1,027,280 ns     906,971 ns
>> (until now pretty good)
>>
>> (ah, for every 32-bit atomic clear mask ...)
>> 16384:   1,270,972 ns   6,661,741 ns    //  1  1  1 ...  1
>> 32768:   1,581,335 ns   9,673,985 ns    //  ...
>> 65536:   2,161,604 ns  11,466,134 ns    //  ...
>> 131072:  3,253,027 ns  13,412,954 ns    //  ...
>> 262144:  5,663,002 ns  16,309,924 ns    // 31 31 31 ... 31
>> =========================================================
>
> On a 64-bit host, this will be twice as fast.  Or if we use cmpxchg16b,
> and there are no surprises, four times as fast.  It will still be slower
> than the original, but by a smaller margin.

Yes.

I used "unsigned int" just because I wanted to use the current
atomic_clear_mask() as is.

We need to implement atomic_clear_mask_long() or use ...
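
For reference, a minimal sketch of what such an atomic_clear_mask_long()
could look like, using a plain cmpxchg() retry loop (hypothetical helper,
compile-tested nowhere; an arch would likely want an optimized version):

static inline void atomic_clear_mask_long(unsigned long mask,
					  unsigned long *p)
{
	unsigned long old, new;

	do {
		old = ACCESS_ONCE(*p);	/* snapshot the current word */
		new = old & ~mask;	/* clear only the requested bits */
	} while (cmpxchg(p, old, new) != old);
}

A caller like the loop in the test patch could keep using the bits it
read beforehand, exactly as it does with the 32-bit atomic_clear_mask().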



>
> Yeah.  But I think we should switch to srcu-less dirty logs regardless.
> Here are your numbers, normalized by the number of dirty pages.

Thanks,

I can prepare the official patch series then, of course with more testing.


	Takuya

>
> dirty pages    old (ns/page)    new (ns/page)
>           1           102077            10105
>           2            23599             4698
>           4            10891             2485
>           8             5155             1327
>          16             2687              769
>          32             1422              447
>          64              796              311
>         128              477              229
>         256              316              191
>         512              259              169
>        1024              917              128
>        2048              484              122
>        4096              218              117
>        8192              125              111
>       16384               78              407
>       32768               48              295
>       65536               33              175
>      131072               25              102
>      262144               22               62
>
>
> Your worst case, when considering a reasonable number of dirty pages, is
> 407 ns/page, which is still lower than the time userspace will actually
> spend processing each page, so it's reasonable.  The old method is often
> a lot worse than your worst case, by this metric.
>
>
>


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02 10:21                         ` Takuya Yoshikawa
@ 2012-02-02 10:21                           ` Avi Kivity
  2012-02-02 10:40                             ` Takuya Yoshikawa
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2012-02-02 10:21 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

On 02/02/2012 12:21 PM, Takuya Yoshikawa wrote:
> (2012/02/02 19:10), Avi Kivity wrote:
>
>>>
>>> =========================================================
>>> # of dirty pages:  kvm.git (ns),  with this patch (ns)
>>> 1:         102,077 ns      10,105 ns
>>> 2:          47,197 ns       9,395 ns
>>> 4:          43,563 ns       9,938 ns
>>> 8:          41,239 ns      10,618 ns
>>> 16:         42,988 ns      12,299 ns
>>> 32:         45,503 ns      14,298 ns
>>> 64:         50,915 ns      19,895 ns
>>> 128:        61,087 ns      29,260 ns
>>> 256:        81,007 ns      49,023 ns
>>> 512:       132,776 ns      86,670 ns
>>> 1024:      939,299 ns     131,496 ns
>>> 2048:      992,209 ns     250,429 ns
>>> 4096:      891,809 ns     479,280 ns
>>> 8192:    1,027,280 ns     906,971 ns
>>> (until now pretty good)
>>>
>>> (ah, for every 32-bit atomic clear mask ...)
>>> 16384:   1,270,972 ns   6,661,741 ns    //  1  1  1 ...  1
>>> 32768:   1,581,335 ns   9,673,985 ns    //  ...
>>> 65536:   2,161,604 ns  11,466,134 ns    //  ...
>>> 131072:  3,253,027 ns  13,412,954 ns    //  ...
>>> 262144:  5,663,002 ns  16,309,924 ns    // 31 31 31 ... 31
>>> =========================================================
>>
>> On a 64-bit host, this will be twice as fast.  Or if we use cmpxchg16b,
>> and there are no surprises, four times as fast.  It will still be slower
>> than the original, but by a smaller margin.
>
> Yes.
>
> I used "unsigned int" just because I wanted to use the current
> atomic_clear_mask() as is.
>
> We need to implement atomic_clear_mask_long() or use ...

If we use cmpxchg8b/cmpxchg16b then this won't fit with the
atomic_*_long family.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02 10:21                           ` Avi Kivity
@ 2012-02-02 10:40                             ` Takuya Yoshikawa
  2012-02-02 11:02                               ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-02 10:40 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

(2012/02/02 19:21), Avi Kivity wrote:

>>
>> I used "unsigned int" just because I wanted to use the current
>> atomic_clear_mask() as is.
>>
>> We need to implement atomic_clear_mask_long() or use ...
>
> If we use cmpxchg8b/cmpxchg16b then this won't fit with the
> atomic_*_long family.
>

OK, I will try.


I have one concern about correctness issue though:

	concurrent rmap write protection may not be safe due to
	delayed tlb flush ... cannot happen?


	Takuya

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02 10:40                             ` Takuya Yoshikawa
@ 2012-02-02 11:02                               ` Avi Kivity
  2012-02-02 14:44                                 ` Takuya Yoshikawa
  0 siblings, 1 reply; 23+ messages in thread
From: Avi Kivity @ 2012-02-02 11:02 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Peter Zijlstra, paulmck, Oleg Nesterov, linux-kernel,
	Marcelo Tosatti, KVM list

On 02/02/2012 12:40 PM, Takuya Yoshikawa wrote:
>
> I have one concern about correctness issue though:
>
>     concurrent rmap write protection may not be safe due to
>     delayed tlb flush ... cannot happen?

What do you mean by concurrent rmap write protection?


-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02 11:02                               ` Avi Kivity
@ 2012-02-02 14:44                                 ` Takuya Yoshikawa
  2012-02-02 14:57                                   ` Avi Kivity
  0 siblings, 1 reply; 23+ messages in thread
From: Takuya Yoshikawa @ 2012-02-02 14:44 UTC (permalink / raw)
  To: Avi Kivity
  Cc: Takuya Yoshikawa, Peter Zijlstra, paulmck, Oleg Nesterov,
	linux-kernel, Marcelo Tosatti, KVM list

Avi Kivity <avi@redhat.com> wrote:

> > I have one concern about correctness issue though:
> >
> >     concurrent rmap write protection may not be safe due to
> >     delayed tlb flush ... cannot happen?
> 
> What do you mean by concurrent rmap write protection?
> 

Not sure, but other code like:

- mmu_sync_children()
	for_each_sp(pages, sp, parents, i)
		protected |= rmap_write_protect(vcpu->kvm, sp->gfn);

		if (protected)
			kvm_flush_remote_tlbs(vcpu->kvm);

- kvm_mmu_get_page()
	if (rmap_write_protect(vcpu->kvm, gfn))
		kvm_flush_remote_tlbs(vcpu->kvm);

I just wondered what can happen if GET_DIRTY_LOG is being processed
concurrently with these code paths.

They may find nothing to write protect and won't do kvm_flush_remote_tlbs()
if the gfn has already been protected by GET_DIRTY_LOG.

But GET_DIRTY_LOG may still be busy write protecting other pages, and
those callers can return before the flush happens.  (My code releases
mmu_lock so as not to include __put_user() in the critical section.)

I am still not familiar enough with this code yet.
(maybe an empty concern)

	Takuya

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [test result] dirty logging without srcu update -- Re: [RFC][PATCH] srcu: Implement call_srcu()
  2012-02-02 14:44                                 ` Takuya Yoshikawa
@ 2012-02-02 14:57                                   ` Avi Kivity
  0 siblings, 0 replies; 23+ messages in thread
From: Avi Kivity @ 2012-02-02 14:57 UTC (permalink / raw)
  To: Takuya Yoshikawa
  Cc: Takuya Yoshikawa, Peter Zijlstra, paulmck, Oleg Nesterov,
	linux-kernel, Marcelo Tosatti, KVM list

On 02/02/2012 04:44 PM, Takuya Yoshikawa wrote:
> Avi Kivity <avi@redhat.com> wrote:
>
> > > I have one concern about correctness issue though:
> > >
> > >     concurrent rmap write protection may not be safe due to
> > >     delayed tlb flush ... cannot happen?
> > 
> > What do you mean by concurrent rmap write protection?
> > 
>
> Not sure, but other code like:
>
> - mmu_sync_children()
> 	for_each_sp(pages, sp, parents, i)
> 		protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
>
> 		if (protected)
> 			kvm_flush_remote_tlbs(vcpu->kvm);
>
> - kvm_mmu_get_page()
> 	if (rmap_write_protect(vcpu->kvm, gfn))
> 		kvm_flush_remote_tlbs(vcpu->kvm);
>
> I just wondered what can happen if GET_DIRTY_LOG is being processed
> concurrently with these code paths.

It's a bug.  If the flush happens outside the spinlock, then one of the
callers can return before it is assured the tlb is flushed.

  A             B

  spin_lock
  clear pte.w
  spin_unlock
                spin_lock
                pte.w already clear
                spin_unlock
                skip flush
                return
  flush
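
One way to close that window, sketched here only as an illustration
(reusing kvm_mmu_write_protect_range() from the test patch; not
necessarily the right trade-off, since it flushes once per bitmap word):

	spin_lock(&kvm->mmu_lock);
	kvm_mmu_write_protect_range(kvm, memslot, start, end);
	/*
	 * Flush before dropping mmu_lock, so that anyone who later takes
	 * mmu_lock and finds pte.w already clear can rely on the TLB
	 * having been flushed.
	 */
	kvm_flush_remote_tlbs(kvm);
	spin_unlock(&kvm->mmu_lock);

	/* copying the harvested bits to userspace can stay outside */
	__put_user(bits, &dirty_bitmap_user[i]);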


>
>
> They may find nothing to write protect and won't do kvm_flush_remote_tlbs()
> if the gfn has already been protected by GET_DIRTY_LOG.
>
> But GET_DIRTY_LOG may still be busy write protecting other pages, and
> those callers can return before the flush happens.  (My code releases
> mmu_lock so as not to include __put_user() in the critical section.)
>
> I am still not familiar enough with this code yet.

It's actually an advantage, since you don't have any assumptions on how
the code works.

> (maybe an empty concern)

Nope, good catch of this subtle bug.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2012-02-02 14:58 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-31 13:32 [RFC][PATCH] srcu: Implement call_srcu() Peter Zijlstra
2012-01-31 13:47 ` Avi Kivity
2012-01-31 13:50   ` Peter Zijlstra
2012-01-31 22:24     ` Paul E. McKenney
2012-02-01 10:22       ` Peter Zijlstra
2012-02-01 10:44         ` Avi Kivity
2012-02-01 10:49           ` Avi Kivity
2012-02-01 11:00             ` Takuya Yoshikawa
2012-02-01 11:01               ` Avi Kivity
2012-02-01 11:12                 ` Takuya Yoshikawa
2012-02-01 13:24                   ` Avi Kivity
2012-02-02  5:46                     ` [test result] dirty logging without srcu update -- " Takuya Yoshikawa
2012-02-02 10:10                       ` Avi Kivity
2012-02-02 10:21                         ` Takuya Yoshikawa
2012-02-02 10:21                           ` Avi Kivity
2012-02-02 10:40                             ` Takuya Yoshikawa
2012-02-02 11:02                               ` Avi Kivity
2012-02-02 14:44                                 ` Takuya Yoshikawa
2012-02-02 14:57                                   ` Avi Kivity
2012-02-01 13:43                 ` Marcelo Tosatti
2012-02-01 15:42                   ` Takuya Yoshikawa
2012-02-01 13:50             ` Marcelo Tosatti
2012-02-01 14:07         ` Paul E. McKenney
