All of lore.kernel.org
 help / color / mirror / Atom feed
* x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
@ 2015-03-23 23:21 Marcelo Tosatti
  2015-03-23 23:30 ` Andy Lutomirski
                   ` (3 more replies)
  0 siblings, 4 replies; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-23 23:21 UTC (permalink / raw)
  To: kvm-devel, stable; +Cc: Paolo Bonzini, Andy Lutomirski


The following point:

    2. per-CPU pvclock time info is updated if the
       underlying CPU changes.

Is not true anymore since "KVM: x86: update pvclock area conditionally,
on cpu migration".

Add task migration notification back.

Problem noticed by Andy Lutomirski.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
CC: stable@kernel.org # 3.11+

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e..25b1cc0 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
+	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d2..e5ecd20 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+	if (!pvclock_vdso_info) {
+		BUG();
+		return NULL;
+	}
+
+	return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
 #ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+			        void *v)
+{
+	struct task_migration_notifier *mn = v;
+	struct pvclock_vsyscall_time_info *pvti;
+
+	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+	/* this is NULL when pvclock vsyscall is not initialized */
+	if (unlikely(pvti == NULL))
+		return NOTIFY_DONE;
+
+	pvti->migrate_count++;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+	.notifier_call = pvclock_task_migrate,
+};
+
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
+	pvclock_vdso_info = i;
+
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}
 
+
+	register_task_migration_notifier(&pvclock_migrate);
+
 	return 0;
 }
 #endif
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322..3093376 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
+	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 
 	/*
-	 * Note: hypervisor must guarantee that:
-	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-	 * 2. that per-CPU pvclock time info is updated if the
-	 *    underlying CPU changes.
-	 * 3. that version is increased whenever underlying CPU
-	 *    changes.
-	 *
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,6 +101,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 		pvti = get_pvti(cpu);
 
+		migrate_count = pvti->migrate_count;
+
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
@@ -115,7 +114,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
 	} while (unlikely(cpu != cpu1 ||
 			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version));
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d77432..be98910 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
+/* Notifier for when a task gets migrated to a new CPU */
+struct task_migration_notifier {
+	struct task_struct *task;
+	int from_cpu;
+	int to_cpu;
+};
+extern void register_task_migration_notifier(struct notifier_block *n);
+
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e..d0c4209 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -996,6 +996,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq_clock_skip_update(rq, true);
 }
 
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1026,10 +1033,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		struct task_migration_notifier tmn;
+
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+		tmn.task = p;
+		tmn.from_cpu = task_cpu(p);
+		tmn.to_cpu = new_cpu;
+
+		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-23 23:21 x86: kvm: Revert "remove sched notifier for cross-cpu migrations" Marcelo Tosatti
@ 2015-03-23 23:30 ` Andy Lutomirski
  2015-03-24 15:34 ` Radim Krčmář
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-23 23:30 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, stable, Paolo Bonzini

On Mon, Mar 23, 2015 at 4:21 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>
> The following point:
>
>     2. per-CPU pvclock time info is updated if the
>        underlying CPU changes.
>
> Is not true anymore since "KVM: x86: update pvclock area conditionally,
> on cpu migration".
>
> Add task migration notification back.

IMO this is a pretty big hammer to use to work around what appears to
be a bug in the host, but I guess that's okay.

It's also unfortunate in another regard: it seems non-obvious to me
how to use this without reading the cpu number twice in the vdso.  On
the other hand, unless we have a global pvti, or at least a global
indication of TSC stability, I don't see how to do that even with the
host bug fixed.

Grumble.

On a more useful note, could you rename migrate_count to
migrate_from_count, since that's what it is?

--Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-23 23:21 x86: kvm: Revert "remove sched notifier for cross-cpu migrations" Marcelo Tosatti
  2015-03-23 23:30 ` Andy Lutomirski
@ 2015-03-24 15:34 ` Radim Krčmář
  2015-03-24 22:33   ` Andy Lutomirski
  2015-03-24 22:59   ` Marcelo Tosatti
  2015-03-25 13:06 ` Radim Krčmář
  2015-03-26 20:59 ` Radim Krčmář
  3 siblings, 2 replies; 32+ messages in thread
From: Radim Krčmář @ 2015-03-24 15:34 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, stable, Paolo Bonzini, Andy Lutomirski

2015-03-23 20:21-0300, Marcelo Tosatti:
> The following point:
> 
>     2. per-CPU pvclock time info is updated if the
>        underlying CPU changes.
> 
> Is not true anymore since "KVM: x86: update pvclock area conditionally,
> on cpu migration".

I think that the revert doesn't fix point 2.:  "KVM: x86: update pvclock
[...]" changed the host to skip clock update on physical CPU change, but
guest's task migration notifier isn't tied to it at all.
(Guest can have all tasks pinned, so the revert changed nothing.)

> Add task migration notification back.
> 
> Problem noticed by Andy Lutomirski.

What is the problem?

Thanks.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-24 15:34 ` Radim Krčmář
@ 2015-03-24 22:33   ` Andy Lutomirski
  2015-03-25 11:08     ` Radim Krčmář
  2015-03-24 22:59   ` Marcelo Tosatti
  1 sibling, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-24 22:33 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm-devel, stable, Paolo Bonzini

On Tue, Mar 24, 2015 at 8:34 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> 2015-03-23 20:21-0300, Marcelo Tosatti:
>> The following point:
>>
>>     2. per-CPU pvclock time info is updated if the
>>        underlying CPU changes.
>>
>> Is not true anymore since "KVM: x86: update pvclock area conditionally,
>> on cpu migration".
>
> I think that the revert doesn't fix point 2.:  "KVM: x86: update pvclock
> [...]" changed the host to skip clock update on physical CPU change, but
> guest's task migration notifier isn't tied to it at all.
> (Guest can have all tasks pinned, so the revert changed nothing.)
>
>> Add task migration notification back.
>>
>> Problem noticed by Andy Lutomirski.
>
> What is the problem?

The kvmclock spec says that the host will increment a version field to
an odd number, then update stuff, then increment it to an even number.
The host is buggy and doesn't do this, and the result is observable
when one vcpu reads another vcpu's kvmclock data.

Since there's no good way for a guest kernel to keep its vdso from
reading a different vcpu's kvmclock data, this is a real corner-case
bug.  This patch allows the vdso to retry when this happens.  I don't
think it's a great solution, but it should mostly work.

--Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-24 15:34 ` Radim Krčmář
  2015-03-24 22:33   ` Andy Lutomirski
@ 2015-03-24 22:59   ` Marcelo Tosatti
  2015-03-25 11:09     ` Radim Krčmář
  1 sibling, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-24 22:59 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm-devel, stable, Paolo Bonzini, Andy Lutomirski

On Tue, Mar 24, 2015 at 04:34:12PM +0100, Radim Krčmář wrote:
> 2015-03-23 20:21-0300, Marcelo Tosatti:
> > The following point:
> > 
> >     2. per-CPU pvclock time info is updated if the
> >        underlying CPU changes.
> > 
> > Is not true anymore since "KVM: x86: update pvclock area conditionally,
> > on cpu migration".
> 
> I think that the revert doesn't fix point 2.:  "KVM: x86: update pvclock
> [...]" changed the host to skip clock update on physical CPU change, but
> guest's task migration notifier isn't tied to it at all.

"per-CPU pvclock time info is updated if the underlying CPU changes"
is the same as
"always perform clock update on physical CPU change".

That was a requirement for the original patch, to drop migration
notifiers.

> (Guest can have all tasks pinned, so the revert changed nothing.)
> 
> > Add task migration notification back.
> > 
> > Problem noticed by Andy Lutomirski.
> 
> What is the problem?
> 
> Thanks.

The problem is this:

T1) guest thread1 on vcpu1.
T2) guest thread1 on vcpu2.
T3) guest thread1 on vcpu1.

Inside a pvclock read loop.

Since the writes by hypervisor of pvclock area are not ordered, 
you cannot rely on version being updated _before_ 
the rest of pvclock data.

(in the case above, "has the physical cpu changed" check, inside the
guests thread1, obviously fails).

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-24 22:33   ` Andy Lutomirski
@ 2015-03-25 11:08     ` Radim Krčmář
  2015-03-25 12:52       ` Radim Krčmář
  2015-03-26 18:47       ` Andy Lutomirski
  0 siblings, 2 replies; 32+ messages in thread
From: Radim Krčmář @ 2015-03-25 11:08 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Marcelo Tosatti, kvm-devel, stable, Paolo Bonzini

2015-03-24 15:33-0700, Andy Lutomirski:
> On Tue, Mar 24, 2015 at 8:34 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> > What is the problem?
> 
> The kvmclock spec says that the host will increment a version field to
> an odd number, then update stuff, then increment it to an even number.
> The host is buggy and doesn't do this, and the result is observable
> when one vcpu reads another vcpu's kvmclock data.
> 
> Since there's no good way for a guest kernel to keep its vdso from
> reading a different vcpu's kvmclock data, this is a real corner-case
> bug.  This patch allows the vdso to retry when this happens.  I don't
> think it's a great solution, but it should mostly work.

Great explanation, thank you.

Reverting the patch protects us from any migration, but I don't think we
need to care about changing VCPUs as long as we read a consistent data
from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
matter if we return a value not fit for this VCPU.)

I think we could drop the second __getcpu if our kvmclock was being
handled better;  maybe with a patch like the one below:

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc2c759f69a3..8658599e0024 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1658,12 +1658,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		&guest_hv_clock, sizeof(guest_hv_clock))))
 		return 0;
 
-	/*
-	 * The interface expects us to write an even number signaling that the
-	 * update is finished. Since the guest won't see the intermediate
-	 * state, we just increase by 2 at the end.
+	/* A guest can read other VCPU's kvmclock; specification says that
+	 * version is odd if data is being modified and even after it is
+	 * consistent.
+	 * We write three times to be sure.
+	 *  1) update version to odd number
+	 *  2) write modified data (version is still odd)
+	 *  3) update version to even number
+	 *
+	 * TODO: optimize
+	 *  - only two writes should be enough -- version is first
+	 *  - the second write could update just version
 	 */
-	vcpu->hv_clock.version = guest_hv_clock.version + 2;
+	guest_hv_clock.version += 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&guest_hv_clock,
+				sizeof(guest_hv_clock));
+
+	vcpu->hv_clock.version = guest_hv_clock.version;
 
 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
 	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1684,6 +1696,11 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
 				&vcpu->hv_clock,
 				sizeof(vcpu->hv_clock));
+
+	vcpu->hv_clock.version += 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-24 22:59   ` Marcelo Tosatti
@ 2015-03-25 11:09     ` Radim Krčmář
  0 siblings, 0 replies; 32+ messages in thread
From: Radim Krčmář @ 2015-03-25 11:09 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, stable, Paolo Bonzini, Andy Lutomirski

2015-03-24 19:59-0300, Marcelo Tosatti:
> On Tue, Mar 24, 2015 at 04:34:12PM +0100, Radim Krčmář wrote:
> > 2015-03-23 20:21-0300, Marcelo Tosatti:
> > > The following point:
> > > 
> > >     2. per-CPU pvclock time info is updated if the
> > >        underlying CPU changes.
> > > 
> > > Is not true anymore since "KVM: x86: update pvclock area conditionally,
> > > on cpu migration".
> > 
> > I think that the revert doesn't fix point 2.:  "KVM: x86: update pvclock
> > [...]" changed the host to skip clock update on physical CPU change, but
> > guest's task migration notifier isn't tied to it at all.
> 
> "per-CPU pvclock time info is updated if the underlying CPU changes"
> is the same as
> "always perform clock update on physical CPU change".
> 
> That was a requirement for the original patch, to drop migration
> notifiers.
> 
> > (Guest can have all tasks pinned, so the revert changed nothing.)
> > 
> > > Add task migration notification back.
> > > 
> > > Problem noticed by Andy Lutomirski.
> > 
> > What is the problem?
> > 
> > Thanks.
> 
> The problem is this:
> 
> T1) guest thread1 on vcpu1.
> T2) guest thread1 on vcpu2.
> T3) guest thread1 on vcpu1.
> 
> Inside a pvclock read loop.
> 
> Since the writes by hypervisor of pvclock area are not ordered, 
> you cannot rely on version being updated _before_ 
> the rest of pvclock data.
> 
> (in the case above, "has the physical cpu changed" check, inside the
> guests thread1, obviously fails).

Ah, thanks! so the "KVM: x86: update pvclock area conditionally [...]"
has nothing to do with it -- that really confused me.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 11:08     ` Radim Krčmář
@ 2015-03-25 12:52       ` Radim Krčmář
  2015-03-25 21:28         ` Marcelo Tosatti
  2015-03-26 18:47       ` Andy Lutomirski
  1 sibling, 1 reply; 32+ messages in thread
From: Radim Krčmář @ 2015-03-25 12:52 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Marcelo Tosatti, kvm-devel, stable, Paolo Bonzini

2015-03-25 12:08+0100, Radim Krčmář:
> Reverting the patch protects us from any migration, but I don't think we
> need to care about changing VCPUs as long as we read a consistent data
> from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> matter if we return a value not fit for this VCPU.)
> 
> I think we could drop the second __getcpu if our kvmclock was being
> handled better;  maybe with a patch like the one below:

The second __getcpu is not neccessary, but I forgot about rdtsc.
We need to either use rtdscp, know the host has synchronized tsc, or
monitor VCPU migrations.  Only the last one works everywhere.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-23 23:21 x86: kvm: Revert "remove sched notifier for cross-cpu migrations" Marcelo Tosatti
  2015-03-23 23:30 ` Andy Lutomirski
  2015-03-24 15:34 ` Radim Krčmář
@ 2015-03-25 13:06 ` Radim Krčmář
  2015-03-26 20:59 ` Radim Krčmář
  3 siblings, 0 replies; 32+ messages in thread
From: Radim Krčmář @ 2015-03-25 13:06 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, stable, Paolo Bonzini, Andy Lutomirski

2015-03-23 20:21-0300, Marcelo Tosatti:
> The following point:
> 
>     2. per-CPU pvclock time info is updated if the
>        underlying CPU changes.
> 
> Is not true anymore since "KVM: x86: update pvclock area conditionally,
> on cpu migration".
> 
> Add task migration notification back.
> 
> Problem noticed by Andy Lutomirski.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> CC: stable@kernel.org # 3.11+

Please improve the commit message.
"KVM: x86: update pvclock area conditionally [...]" was merged half a
year before the patch we are reverting and is completely unrelated to
the bug we are fixing now, (reverted patch just was just wrong)

Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>

> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> @@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
>  	/*
> -	 * Note: hypervisor must guarantee that:
> -	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
> -	 * 2. that per-CPU pvclock time info is updated if the
> -	 *    underlying CPU changes.
> -	 * 3. that version is increased whenever underlying CPU
> -	 *    changes.
> -	 *
> +	 * When looping to get a consistent (time-info, tsc) pair, we
> +	 * also need to deal with the possibility we can switch vcpus,
> +	 * so make sure we always re-fetch time-info for the current vcpu.

(All points from the original comment need to hold -- it would be nicer
 to keep both.)

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 12:52       ` Radim Krčmář
@ 2015-03-25 21:28         ` Marcelo Tosatti
  2015-03-25 22:33           ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-25 21:28 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Andy Lutomirski, kvm-devel, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
> 2015-03-25 12:08+0100, Radim Krčmář:
> > Reverting the patch protects us from any migration, but I don't think we
> > need to care about changing VCPUs as long as we read a consistent data
> > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> > matter if we return a value not fit for this VCPU.)
> > 
> > I think we could drop the second __getcpu if our kvmclock was being
> > handled better;  maybe with a patch like the one below:
> 
> The second __getcpu is not neccessary, but I forgot about rdtsc.
> We need to either use rtdscp, know the host has synchronized tsc, or
> monitor VCPU migrations.  Only the last one works everywhere.

The vdso code is only used if host has synchronized tsc.

But you have to handle the case where host goes from synchronized tsc to
unsynchronized tsc (see the clocksource notifier in the host side).


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 21:28         ` Marcelo Tosatti
@ 2015-03-25 22:33           ` Andy Lutomirski
  2015-03-25 22:41             ` Marcelo Tosatti
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-25 22:33 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
>
> On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
> > 2015-03-25 12:08+0100, Radim Krčmář:
> > > Reverting the patch protects us from any migration, but I don't think we
> > > need to care about changing VCPUs as long as we read a consistent data
> > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> > > matter if we return a value not fit for this VCPU.)
> > >
> > > I think we could drop the second __getcpu if our kvmclock was being
> > > handled better;  maybe with a patch like the one below:
> >
> > The second __getcpu is not neccessary, but I forgot about rdtsc.
> > We need to either use rtdscp, know the host has synchronized tsc, or
> > monitor VCPU migrations.  Only the last one works everywhere.
>
> The vdso code is only used if host has synchronized tsc.
>
> But you have to handle the case where host goes from synchronized tsc to
> unsynchronized tsc (see the clocksource notifier in the host side).
>

Can't we change the host to freeze all vcpus and clear the stable bit
on all of them if this happens?  This would simplify and speed up
vclock_gettime.

--Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 22:33           ` Andy Lutomirski
@ 2015-03-25 22:41             ` Marcelo Tosatti
  2015-03-25 22:48               ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-25 22:41 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 03:33:10PM -0700, Andy Lutomirski wrote:
> On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
> >
> > On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
> > > 2015-03-25 12:08+0100, Radim Krčmář:
> > > > Reverting the patch protects us from any migration, but I don't think we
> > > > need to care about changing VCPUs as long as we read a consistent data
> > > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> > > > matter if we return a value not fit for this VCPU.)
> > > >
> > > > I think we could drop the second __getcpu if our kvmclock was being
> > > > handled better;  maybe with a patch like the one below:
> > >
> > > The second __getcpu is not neccessary, but I forgot about rdtsc.
> > > We need to either use rtdscp, know the host has synchronized tsc, or
> > > monitor VCPU migrations.  Only the last one works everywhere.
> >
> > The vdso code is only used if host has synchronized tsc.
> >
> > But you have to handle the case where host goes from synchronized tsc to
> > unsynchronized tsc (see the clocksource notifier in the host side).
> >
> 
> Can't we change the host to freeze all vcpus and clear the stable bit
> on all of them if this happens?  This would simplify and speed up
> vclock_gettime.
> 
> --Andy

Seems interesting to do on 512-vcpus, but sure, could be done.


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 22:41             ` Marcelo Tosatti
@ 2015-03-25 22:48               ` Andy Lutomirski
  2015-03-25 23:13                 ` Marcelo Tosatti
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-25 22:48 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 3:41 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Wed, Mar 25, 2015 at 03:33:10PM -0700, Andy Lutomirski wrote:
>> On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
>> >
>> > On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
>> > > 2015-03-25 12:08+0100, Radim Krčmář:
>> > > > Reverting the patch protects us from any migration, but I don't think we
>> > > > need to care about changing VCPUs as long as we read a consistent data
>> > > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
>> > > > matter if we return a value not fit for this VCPU.)
>> > > >
>> > > > I think we could drop the second __getcpu if our kvmclock was being
>> > > > handled better;  maybe with a patch like the one below:
>> > >
>> > > The second __getcpu is not neccessary, but I forgot about rdtsc.
>> > > We need to either use rtdscp, know the host has synchronized tsc, or
>> > > monitor VCPU migrations.  Only the last one works everywhere.
>> >
>> > The vdso code is only used if host has synchronized tsc.
>> >
>> > But you have to handle the case where host goes from synchronized tsc to
>> > unsynchronized tsc (see the clocksource notifier in the host side).
>> >
>>
>> Can't we change the host to freeze all vcpus and clear the stable bit
>> on all of them if this happens?  This would simplify and speed up
>> vclock_gettime.
>>
>> --Andy
>
> Seems interesting to do on 512-vcpus, but sure, could be done.
>

If you have a 512-vcpu system that switches between stable and
unstable more than once per migration, then I expect that you have
serious problems and this is the least of your worries.

Personally, I'd *much* rather we just made vcpu 0's pvti authoritative
if we're stable.  If nothing else, I'm not even remotely convinced
that the current scheme gives monotonic timing due to skew between
when the updates happen on different vcpus.

--Andy


-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 22:48               ` Andy Lutomirski
@ 2015-03-25 23:13                 ` Marcelo Tosatti
  2015-03-25 23:22                   ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-25 23:13 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 03:48:02PM -0700, Andy Lutomirski wrote:
> On Wed, Mar 25, 2015 at 3:41 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Wed, Mar 25, 2015 at 03:33:10PM -0700, Andy Lutomirski wrote:
> >> On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
> >> >
> >> > On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
> >> > > 2015-03-25 12:08+0100, Radim Krčmář:
> >> > > > Reverting the patch protects us from any migration, but I don't think we
> >> > > > need to care about changing VCPUs as long as we read a consistent data
> >> > > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> >> > > > matter if we return a value not fit for this VCPU.)
> >> > > >
> >> > > > I think we could drop the second __getcpu if our kvmclock was being
> >> > > > handled better;  maybe with a patch like the one below:
> >> > >
> >> > > The second __getcpu is not neccessary, but I forgot about rdtsc.
> >> > > We need to either use rtdscp, know the host has synchronized tsc, or
> >> > > monitor VCPU migrations.  Only the last one works everywhere.
> >> >
> >> > The vdso code is only used if host has synchronized tsc.
> >> >
> >> > But you have to handle the case where host goes from synchronized tsc to
> >> > unsynchronized tsc (see the clocksource notifier in the host side).
> >> >
> >>
> >> Can't we change the host to freeze all vcpus and clear the stable bit
> >> on all of them if this happens?  This would simplify and speed up
> >> vclock_gettime.
> >>
> >> --Andy
> >
> > Seems interesting to do on 512-vcpus, but sure, could be done.
> >
> 
> If you have a 512-vcpu system that switches between stable and
> unstable more than once per migration, then I expect that you have
> serious problems and this is the least of your worries.
> 
> Personally, I'd *much* rather we just made vcpu 0's pvti authoritative
> if we're stable.  If nothing else, I'm not even remotely convinced
> that the current scheme gives monotonic timing due to skew between
> when the updates happen on different vcpus.

Can you write down the problem ?

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 23:13                 ` Marcelo Tosatti
@ 2015-03-25 23:22                   ` Andy Lutomirski
  2015-03-26 11:29                     ` Marcelo Tosatti
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-25 23:22 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 4:13 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Wed, Mar 25, 2015 at 03:48:02PM -0700, Andy Lutomirski wrote:
>> On Wed, Mar 25, 2015 at 3:41 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Wed, Mar 25, 2015 at 03:33:10PM -0700, Andy Lutomirski wrote:
>> >> On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
>> >> >
>> >> > On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
>> >> > > 2015-03-25 12:08+0100, Radim Krčmář:
>> >> > > > Reverting the patch protects us from any migration, but I don't think we
>> >> > > > need to care about changing VCPUs as long as we read a consistent data
>> >> > > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
>> >> > > > matter if we return a value not fit for this VCPU.)
>> >> > > >
>> >> > > > I think we could drop the second __getcpu if our kvmclock was being
>> >> > > > handled better;  maybe with a patch like the one below:
>> >> > >
>> >> > > The second __getcpu is not neccessary, but I forgot about rdtsc.
>> >> > > We need to either use rtdscp, know the host has synchronized tsc, or
>> >> > > monitor VCPU migrations.  Only the last one works everywhere.
>> >> >
>> >> > The vdso code is only used if host has synchronized tsc.
>> >> >
>> >> > But you have to handle the case where host goes from synchronized tsc to
>> >> > unsynchronized tsc (see the clocksource notifier in the host side).
>> >> >
>> >>
>> >> Can't we change the host to freeze all vcpus and clear the stable bit
>> >> on all of them if this happens?  This would simplify and speed up
>> >> vclock_gettime.
>> >>
>> >> --Andy
>> >
>> > Seems interesting to do on 512-vcpus, but sure, could be done.
>> >
>>
>> If you have a 512-vcpu system that switches between stable and
>> unstable more than once per migration, then I expect that you have
>> serious problems and this is the least of your worries.
>>
>> Personally, I'd *much* rather we just made vcpu 0's pvti authoritative
>> if we're stable.  If nothing else, I'm not even remotely convinced
>> that the current scheme gives monotonic timing due to skew between
>> when the updates happen on different vcpus.
>
> Can you write down the problem ?
>

I can try.

Suppose we start out with all vcpus agreeing on their pvti and perfect
invariant TSCs.  Now the host updates its frequency (due to NTP or
whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
They'll disagree on the time, and one of them will be ahead until vcpu
1's pvti gets updated.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 23:22                   ` Andy Lutomirski
@ 2015-03-26 11:29                     ` Marcelo Tosatti
  2015-03-26 18:51                       ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-26 11:29 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
> On Wed, Mar 25, 2015 at 4:13 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Wed, Mar 25, 2015 at 03:48:02PM -0700, Andy Lutomirski wrote:
> >> On Wed, Mar 25, 2015 at 3:41 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> > On Wed, Mar 25, 2015 at 03:33:10PM -0700, Andy Lutomirski wrote:
> >> >> On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
> >> >> >
> >> >> > On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
> >> >> > > 2015-03-25 12:08+0100, Radim Krčmář:
> >> >> > > > Reverting the patch protects us from any migration, but I don't think we
> >> >> > > > need to care about changing VCPUs as long as we read a consistent data
> >> >> > > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> >> >> > > > matter if we return a value not fit for this VCPU.)
> >> >> > > >
> >> >> > > > I think we could drop the second __getcpu if our kvmclock was being
> >> >> > > > handled better;  maybe with a patch like the one below:
> >> >> > >
> >> >> > > The second __getcpu is not neccessary, but I forgot about rdtsc.
> >> >> > > We need to either use rtdscp, know the host has synchronized tsc, or
> >> >> > > monitor VCPU migrations.  Only the last one works everywhere.
> >> >> >
> >> >> > The vdso code is only used if host has synchronized tsc.
> >> >> >
> >> >> > But you have to handle the case where host goes from synchronized tsc to
> >> >> > unsynchronized tsc (see the clocksource notifier in the host side).
> >> >> >
> >> >>
> >> >> Can't we change the host to freeze all vcpus and clear the stable bit
> >> >> on all of them if this happens?  This would simplify and speed up
> >> >> vclock_gettime.
> >> >>
> >> >> --Andy
> >> >
> >> > Seems interesting to do on 512-vcpus, but sure, could be done.
> >> >
> >>
> >> If you have a 512-vcpu system that switches between stable and
> >> unstable more than once per migration, then I expect that you have
> >> serious problems and this is the least of your worries.
> >>
> >> Personally, I'd *much* rather we just made vcpu 0's pvti authoritative
> >> if we're stable.  If nothing else, I'm not even remotely convinced
> >> that the current scheme gives monotonic timing due to skew between
> >> when the updates happen on different vcpus.
> >
> > Can you write down the problem ?
> >
> 
> I can try.
> 
> Suppose we start out with all vcpus agreeing on their pvti and perfect
> invariant TSCs.  Now the host updates its frequency (due to NTP or
> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
> They'll disagree on the time, and one of them will be ahead until vcpu
> 1's pvti gets updated.

The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
to be visible at one time, for all vcpus.


 * That is, when timespec0 != timespec1, M < N. Unfortunately that is
 * not
 * always the case (the difference between two distinct xtime instances
 * might be smaller then the difference between corresponding TSC reads,
 * when updating guest vcpus pvclock areas).
 *
 * To avoid that problem, do not allow visibility of distinct
 * system_timestamp/tsc_timestamp values simultaneously: use a master
 * copy of host monotonic time values. Update that master copy
 * in lockstep.



^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-25 11:08     ` Radim Krčmář
  2015-03-25 12:52       ` Radim Krčmář
@ 2015-03-26 18:47       ` Andy Lutomirski
  2015-03-26 20:10         ` Radim Krčmář
  1 sibling, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 18:47 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm-devel, stable, Paolo Bonzini

On Wed, Mar 25, 2015 at 4:08 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> 2015-03-24 15:33-0700, Andy Lutomirski:
>> On Tue, Mar 24, 2015 at 8:34 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>> > What is the problem?
>>
>> The kvmclock spec says that the host will increment a version field to
>> an odd number, then update stuff, then increment it to an even number.
>> The host is buggy and doesn't do this, and the result is observable
>> when one vcpu reads another vcpu's kvmclock data.
>>
>> Since there's no good way for a guest kernel to keep its vdso from
>> reading a different vcpu's kvmclock data, this is a real corner-case
>> bug.  This patch allows the vdso to retry when this happens.  I don't
>> think it's a great solution, but it should mostly work.
>
> Great explanation, thank you.
>
> Reverting the patch protects us from any migration, but I don't think we
> need to care about changing VCPUs as long as we read a consistent data
> from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
> matter if we return a value not fit for this VCPU.)
>
> I think we could drop the second __getcpu if our kvmclock was being
> handled better;  maybe with a patch like the one below:
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index cc2c759f69a3..8658599e0024 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1658,12 +1658,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>                 &guest_hv_clock, sizeof(guest_hv_clock))))
>                 return 0;
>
> -       /*
> -        * The interface expects us to write an even number signaling that the
> -        * update is finished. Since the guest won't see the intermediate
> -        * state, we just increase by 2 at the end.
> +       /* A guest can read other VCPU's kvmclock; specification says that
> +        * version is odd if data is being modified and even after it is
> +        * consistent.
> +        * We write three times to be sure.
> +        *  1) update version to odd number
> +        *  2) write modified data (version is still odd)
> +        *  3) update version to even number
> +        *
> +        * TODO: optimize
> +        *  - only two writes should be enough -- version is first
> +        *  - the second write could update just version
>          */
> -       vcpu->hv_clock.version = guest_hv_clock.version + 2;
> +       guest_hv_clock.version += 1;
> +       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> +                               &guest_hv_clock,
> +                               sizeof(guest_hv_clock));
> +
> +       vcpu->hv_clock.version = guest_hv_clock.version;
>
>         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
>         pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
> @@ -1684,6 +1696,11 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
>                                 &vcpu->hv_clock,
>                                 sizeof(vcpu->hv_clock));
> +
> +       vcpu->hv_clock.version += 1;
> +       kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> +                               &vcpu->hv_clock,
> +                               sizeof(vcpu->hv_clock));
>         return 0;
>  }
>

The trouble with this is that kvm_write_guest_cached seems to
correspond roughly to a "rep movs" variant, and those are weakly
ordered.  As a result, I don't really know whether they have
well-defined semantics wrt concurrent reads.  What we really want is
just "mov".

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 11:29                     ` Marcelo Tosatti
@ 2015-03-26 18:51                       ` Andy Lutomirski
  2015-03-26 20:31                         ` Radim Krcmar
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 18:51 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm list, Radim Krcmar, stable, Paolo Bonzini

On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
>> On Wed, Mar 25, 2015 at 4:13 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Wed, Mar 25, 2015 at 03:48:02PM -0700, Andy Lutomirski wrote:
>> >> On Wed, Mar 25, 2015 at 3:41 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> > On Wed, Mar 25, 2015 at 03:33:10PM -0700, Andy Lutomirski wrote:
>> >> >> On Mar 25, 2015 2:29 PM, "Marcelo Tosatti" <mtosatti@redhat.com> wrote:
>> >> >> >
>> >> >> > On Wed, Mar 25, 2015 at 01:52:15PM +0100, Radim Krčmář wrote:
>> >> >> > > 2015-03-25 12:08+0100, Radim Krčmář:
>> >> >> > > > Reverting the patch protects us from any migration, but I don't think we
>> >> >> > > > need to care about changing VCPUs as long as we read a consistent data
>> >> >> > > > from kvmclock.  (VCPU can change outside of this loop too, so it doesn't
>> >> >> > > > matter if we return a value not fit for this VCPU.)
>> >> >> > > >
>> >> >> > > > I think we could drop the second __getcpu if our kvmclock was being
>> >> >> > > > handled better;  maybe with a patch like the one below:
>> >> >> > >
>> >> >> > > The second __getcpu is not neccessary, but I forgot about rdtsc.
>> >> >> > > We need to either use rtdscp, know the host has synchronized tsc, or
>> >> >> > > monitor VCPU migrations.  Only the last one works everywhere.
>> >> >> >
>> >> >> > The vdso code is only used if host has synchronized tsc.
>> >> >> >
>> >> >> > But you have to handle the case where host goes from synchronized tsc to
>> >> >> > unsynchronized tsc (see the clocksource notifier in the host side).
>> >> >> >
>> >> >>
>> >> >> Can't we change the host to freeze all vcpus and clear the stable bit
>> >> >> on all of them if this happens?  This would simplify and speed up
>> >> >> vclock_gettime.
>> >> >>
>> >> >> --Andy
>> >> >
>> >> > Seems interesting to do on 512-vcpus, but sure, could be done.
>> >> >
>> >>
>> >> If you have a 512-vcpu system that switches between stable and
>> >> unstable more than once per migration, then I expect that you have
>> >> serious problems and this is the least of your worries.
>> >>
>> >> Personally, I'd *much* rather we just made vcpu 0's pvti authoritative
>> >> if we're stable.  If nothing else, I'm not even remotely convinced
>> >> that the current scheme gives monotonic timing due to skew between
>> >> when the updates happen on different vcpus.
>> >
>> > Can you write down the problem ?
>> >
>>
>> I can try.
>>
>> Suppose we start out with all vcpus agreeing on their pvti and perfect
>> invariant TSCs.  Now the host updates its frequency (due to NTP or
>> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
>> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
>> They'll disagree on the time, and one of them will be ahead until vcpu
>> 1's pvti gets updated.
>
> The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
> to be visible at one time, for all vcpus.
>
>
>  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
>  * not
>  * always the case (the difference between two distinct xtime instances
>  * might be smaller then the difference between corresponding TSC reads,
>  * when updating guest vcpus pvclock areas).
>  *
>  * To avoid that problem, do not allow visibility of distinct
>  * system_timestamp/tsc_timestamp values simultaneously: use a master
>  * copy of host monotonic time values. Update that master copy
>  * in lockstep.
>
>

[resend without HTML]

Yuck.  So we have per cpu timing data, but the protocol is only usable
for monotonic timing because we forcibly freeze all vcpus when we
update the nominally per cpu data.

The obvious guest implementations are still unnecessarily slow,
though.  It would be nice if the guest could get away without using
any getcpu operation at all.

Even if we fixed the host to increment version as advertised, I think
we can't avoid two getcpu ops.  We need one before rdtsc to figure out
which pvti to look at, and we need another to make sure that we were
actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
-- we need to check version before rdtsc, and we don't know what
version to check until we do a getcpu.). The migration hook has the
same issue -- we need to check the migration count, then confirm we're
on that cpu, then check the migration count again, and we can't do
that until we know what cpu we're on.

If, on the other hand, we could rely on having all of these things in
sync, then this complication goes away, and we go down from two getcpu
ops to zero.

--Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 18:47       ` Andy Lutomirski
@ 2015-03-26 20:10         ` Radim Krčmář
  2015-03-26 20:52           ` Paolo Bonzini
  0 siblings, 1 reply; 32+ messages in thread
From: Radim Krčmář @ 2015-03-26 20:10 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Marcelo Tosatti, kvm-devel, stable, Paolo Bonzini

2015-03-26 11:47-0700, Andy Lutomirski:
> On Wed, Mar 25, 2015 at 4:08 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > +       /* A guest can read other VCPU's kvmclock; specification says that
> > +        * version is odd if data is being modified and even after it is
> > +        * consistent.
> > +        * We write three times to be sure.
> > +        *  1) update version to odd number
> > +        *  2) write modified data (version is still odd)
> > +        *  3) update version to even number
> > +        *
> > +        * TODO: optimize
> > +        *  - only two writes should be enough -- version is first
> > +        *  - the second write could update just version
> >          */
> 
> The trouble with this is that kvm_write_guest_cached seems to
> correspond roughly to a "rep movs" variant, and those are weakly
> ordered.  As a result, I don't really know whether they have
> well-defined semantics wrt concurrent reads.  What we really want is
> just "mov".

Ah, so the first optimization TODO is not possible, but stores are
weakly ordered only within one rep movs.   We're safe if compiler
outputs three mov-like instructions.

(Btw. does current hardware reorder string stores?)

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 18:51                       ` Andy Lutomirski
@ 2015-03-26 20:31                         ` Radim Krcmar
  2015-03-26 20:58                           ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Radim Krcmar @ 2015-03-26 20:31 UTC (permalink / raw)
  To: Andy Lutomirski; +Cc: Marcelo Tosatti, kvm list, stable, Paolo Bonzini

2015-03-26 11:51-0700, Andy Lutomirski:
> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
> >> They'll disagree on the time, and one of them will be ahead until vcpu
> >> 1's pvti gets updated.
> >
> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
> > to be visible at one time, for all vcpus.
> >
> >
> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
> >  * not
> >  * always the case (the difference between two distinct xtime instances
> >  * might be smaller then the difference between corresponding TSC reads,
> >  * when updating guest vcpus pvclock areas).
> >  *
> >  * To avoid that problem, do not allow visibility of distinct
> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
> >  * copy of host monotonic time values. Update that master copy
> >  * in lockstep.
> 
> Yuck.  So we have per cpu timing data, but the protocol is only usable
> for monotonic timing because we forcibly freeze all vcpus when we
> update the nominally per cpu data.
> 
> The obvious guest implementations are still unnecessarily slow,
> though.  It would be nice if the guest could get away without using
> any getcpu operation at all.
> 
> Even if we fixed the host to increment version as advertised, I think
> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
> which pvti to look at,

Yes.

>                        and we need another to make sure that we were
> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
> -- we need to check version before rdtsc, and we don't know what
> version to check until we do a getcpu.).

Exactly, reading cpuid after rdtsc doesn't do that though, we could have
migrated back between those reads.
rtdscp would allow us to check that we read tsc of pvti's cpu.
(It doesn't get rid of that first read.)

>                                          The migration hook has the
> same issue -- we need to check the migration count, then confirm we're
> on that cpu, then check the migration count again, and we can't do
> that until we know what cpu we're on.

True;  the revert has a bug -- we need to check cpuid for the second
time before rdtsc.  (Migration hook is there just because we don't know
which cpu executed rdtsc.)

> If, on the other hand, we could rely on having all of these things in
> sync, then this complication goes away, and we go down from two getcpu
> ops to zero.

(Yeah, we should look what are the drawbacks of doing it differently.)

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 20:10         ` Radim Krčmář
@ 2015-03-26 20:52           ` Paolo Bonzini
  0 siblings, 0 replies; 32+ messages in thread
From: Paolo Bonzini @ 2015-03-26 20:52 UTC (permalink / raw)
  To: Radim Krčmář, Andy Lutomirski
  Cc: Marcelo Tosatti, kvm-devel, stable



On 26/03/2015 21:10, Radim Krčmář wrote:
> 2015-03-26 11:47-0700, Andy Lutomirski:
>> On Wed, Mar 25, 2015 at 4:08 AM, Radim Krčmář <rkrcmar@redhat.com> wrote:
>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>> +       /* A guest can read other VCPU's kvmclock; specification says that
>>> +        * version is odd if data is being modified and even after it is
>>> +        * consistent.
>>> +        * We write three times to be sure.
>>> +        *  1) update version to odd number
>>> +        *  2) write modified data (version is still odd)
>>> +        *  3) update version to even number
>>> +        *
>>> +        * TODO: optimize
>>> +        *  - only two writes should be enough -- version is first
>>> +        *  - the second write could update just version
>>>          */
>>
>> The trouble with this is that kvm_write_guest_cached seems to
>> correspond roughly to a "rep movs" variant, and those are weakly
>> ordered.  As a result, I don't really know whether they have
>> well-defined semantics wrt concurrent reads.  What we really want is
>> just "mov".
> 
> Ah, so the first optimization TODO is not possible, but stores are
> weakly ordered only within one rep movs.   We're safe if compiler
> outputs three mov-like instructions.
> 
> (Btw. does current hardware reorder string stores?)

It probably does so if they hit multiple cache lines.  Within a cache
line, probably not.

We can add kvm_map/unmap_guest_cached and then use __put_user.

Paolo

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 20:31                         ` Radim Krcmar
@ 2015-03-26 20:58                           ` Andy Lutomirski
  2015-03-26 22:22                             ` Andy Lutomirski
  2015-03-26 22:56                             ` Marcelo Tosatti
  0 siblings, 2 replies; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 20:58 UTC (permalink / raw)
  To: Radim Krcmar
  Cc: Marcelo Tosatti, kvm list, stable, Paolo Bonzini, Rik van Riel

On Thu, Mar 26, 2015 at 1:31 PM, Radim Krcmar <rkrcmar@redhat.com> wrote:
> 2015-03-26 11:51-0700, Andy Lutomirski:
>> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
>> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
>> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
>> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
>> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
>> >> They'll disagree on the time, and one of them will be ahead until vcpu
>> >> 1's pvti gets updated.
>> >
>> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
>> > to be visible at one time, for all vcpus.
>> >
>> >
>> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
>> >  * not
>> >  * always the case (the difference between two distinct xtime instances
>> >  * might be smaller then the difference between corresponding TSC reads,
>> >  * when updating guest vcpus pvclock areas).
>> >  *
>> >  * To avoid that problem, do not allow visibility of distinct
>> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
>> >  * copy of host monotonic time values. Update that master copy
>> >  * in lockstep.
>>
>> Yuck.  So we have per cpu timing data, but the protocol is only usable
>> for monotonic timing because we forcibly freeze all vcpus when we
>> update the nominally per cpu data.
>>
>> The obvious guest implementations are still unnecessarily slow,
>> though.  It would be nice if the guest could get away without using
>> any getcpu operation at all.
>>
>> Even if we fixed the host to increment version as advertised, I think
>> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
>> which pvti to look at,
>
> Yes.
>
>>                        and we need another to make sure that we were
>> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
>> -- we need to check version before rdtsc, and we don't know what
>> version to check until we do a getcpu.).
>
> Exactly, reading cpuid after rdtsc doesn't do that though, we could have
> migrated back between those reads.
> rtdscp would allow us to check that we read tsc of pvti's cpu.
> (It doesn't get rid of that first read.)
>
>>                                          The migration hook has the
>> same issue -- we need to check the migration count, then confirm we're
>> on that cpu, then check the migration count again, and we can't do
>> that until we know what cpu we're on.
>
> True;  the revert has a bug -- we need to check cpuid for the second
> time before rdtsc.  (Migration hook is there just because we don't know
> which cpu executed rdtsc.)

One way or another, I'm planning on completely rewriting the vdso
code.  An early draft is here:

https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=57ace6e6e032afc4faf7b9ec52f78a8e6642c980

but I can't finish it until the KVM side shakes out.

I think there are at least two ways that would work:

a) If KVM incremented version as advertised:

cpu = getcpu();
pvti = pvti for cpu;

ver1 = pvti->version;
check stable bit;
rdtsc_barrier, rdtsc, read scale, shift, etc.
if (getcpu() != cpu) retry;
if (pvti->version != ver1) retry;

I think this is safe because, we're guaranteed that there was an
interval (between the two version reads) in which the vcpu we think
we're on was running and the kvmclock data was valid and marked
stable, and we know that the tsc we read came from that interval.

Note: rdtscp isn't needed. If we're stable, is makes no difference
which cpu's tsc we actually read.

b) If version remains buggy but we use this migrations_from hack:

cpu = getcpu();
pvti = pvti for cpu;
m1 = pvti->migrations_from;
barrier();

ver1 = pvti->version;
check stable bit;
rdtsc_barrier, rdtsc, read scale, shift, etc.
if (getcpu() != cpu) retry;
if (pvti->version != ver1) retry;  /* probably not really needed */

barrier();
if (pvti->migrations_from != m1) retry;

This is just like (a), except that we're using a guest kernel hack to
ensure that no one migrated off the vcpu during the version-protected
critical section and that we were, in fact, on that vcpu at some point
during that critical section.  Once we've ensured that we were on
pvti's associated vcpu for the entire time we were reading it, then we
are protected by the existing versioning in the host.

>
>> If, on the other hand, we could rely on having all of these things in
>> sync, then this complication goes away, and we go down from two getcpu
>> ops to zero.
>
> (Yeah, we should look what are the drawbacks of doing it differently.)

If the versioning were fixed, I think we could almost get away with:

pvti = pvti for vcpu 0;

ver1 = pvti->version;
check stable bit;
rdtsc_barrier, rdtsc, read scale, shift, etc.
if (pvti->version != ver1) retry;

This guarantees that the tsc came from an interval in which vcpu0's
kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
stable in that interval, then we'd be fine, but there's a race window
in which the kvmclock is *not* stable and vcpu 0 wasn't running.

Why doesn't KVM just update all of the kvmclock data at once?  (For
that matter, why is the pvti in guest memory at all?  Wouldn't this
all be simpler if the kvmclock data were host-allocated so the host
could write it directly and maybe even share it between guests?)

--Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-23 23:21 x86: kvm: Revert "remove sched notifier for cross-cpu migrations" Marcelo Tosatti
                   ` (2 preceding siblings ...)
  2015-03-25 13:06 ` Radim Krčmář
@ 2015-03-26 20:59 ` Radim Krčmář
  2015-03-26 22:22   ` Marcelo Tosatti
  3 siblings, 1 reply; 32+ messages in thread
From: Radim Krčmář @ 2015-03-26 20:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: kvm-devel, stable, Paolo Bonzini, Andy Lutomirski

2015-03-23 20:21-0300, Marcelo Tosatti:
> 
> The following point:
> 
>     2. per-CPU pvclock time info is updated if the
>        underlying CPU changes.
> 
> Is not true anymore since "KVM: x86: update pvclock area conditionally,
> on cpu migration".
> 
> Add task migration notification back.
> 
> Problem noticed by Andy Lutomirski.
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> CC: stable@kernel.org # 3.11+

Revert contains a bug that got pointed out in the discussion:

> diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
>  	do {
>  		cpu = __getcpu() & VGETCPU_CPU_MASK;
>  
>  		pvti = get_pvti(cpu);

We can migrate to 'other cpu' here.

> +		migrate_count = pvti->migrate_count;
> +
>  		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);

And migrate back to 'cpu' here.

rdtsc was executed on different cpu, so pvti and tsc might not be in
sync, but migrate_count hasn't changed.

>  		cpu1 = __getcpu() & VGETCPU_CPU_MASK;

(Reading cpuid here is useless.)

>  	} while (unlikely(cpu != cpu1 ||
>  			  (pvti->pvti.version & 1) ||
> -			  pvti->pvti.version != version));
> +			  pvti->pvti.version != version ||
> +			  pvti->migrate_count != migrate_count));

We can workaround the bug with,

  	cpu = __getcpu() & VGETCPU_CPU_MASK;
  	pvti = get_pvti(cpu);
  	migrate_count = pvti->migrate_count;
  	if (cpu != (__getcpu() & VGETCPU_CPU_MASK))
  		continue;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 20:58                           ` Andy Lutomirski
@ 2015-03-26 22:22                             ` Andy Lutomirski
  2015-03-26 22:56                             ` Marcelo Tosatti
  1 sibling, 0 replies; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 22:22 UTC (permalink / raw)
  To: Radim Krcmar
  Cc: Marcelo Tosatti, kvm list, stable, Paolo Bonzini, Rik van Riel

[much snippage]

On Thu, Mar 26, 2015 at 1:58 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>
> If the versioning were fixed, I think we could almost get away with:
>
> pvti = pvti for vcpu 0;
>
> ver1 = pvti->version;
> check stable bit;
> rdtsc_barrier, rdtsc, read scale, shift, etc.
> if (pvti->version != ver1) retry;
>
> This guarantees that the tsc came from an interval in which vcpu0's
> kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
> stable in that interval, then we'd be fine, but there's a race window
> in which the kvmclock is *not* stable and vcpu 0 wasn't running.

Rik pointed out that this could actually be okay. Apparently vcpu 0 is
somewhat special, and it may actually be impossible to switch from
stable to unstable which a vcpu other than 0 is running and vcpu0
hasn't updated its kvmclock data.

--Andy

>
> Why doesn't KVM just update all of the kvmclock data at once?  (For
> that matter, why is the pvti in guest memory at all?  Wouldn't this
> all be simpler if the kvmclock data were host-allocated so the host
> could write it directly and maybe even share it between guests?)
>
> --Andy



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 20:59 ` Radim Krčmář
@ 2015-03-26 22:22   ` Marcelo Tosatti
  2015-03-26 22:24     ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-26 22:22 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm-devel, stable, Paolo Bonzini, Andy Lutomirski

On Thu, Mar 26, 2015 at 09:59:24PM +0100, Radim Krčmář wrote:
> 2015-03-23 20:21-0300, Marcelo Tosatti:
> > 
> > The following point:
> > 
> >     2. per-CPU pvclock time info is updated if the
> >        underlying CPU changes.
> > 
> > Is not true anymore since "KVM: x86: update pvclock area conditionally,
> > on cpu migration".
> > 
> > Add task migration notification back.
> > 
> > Problem noticed by Andy Lutomirski.
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > CC: stable@kernel.org # 3.11+
> 
> Revert contains a bug that got pointed out in the discussion:
> 
> > diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> >  	do {
> >  		cpu = __getcpu() & VGETCPU_CPU_MASK;
> >  
> >  		pvti = get_pvti(cpu);
> 
> We can migrate to 'other cpu' here.
> 
> > +		migrate_count = pvti->migrate_count;
> > +
> >  		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
> 
> And migrate back to 'cpu' here.

Migrating back will increase pvti->migrate_count, right ?

> rdtsc was executed on different cpu, so pvti and tsc might not be in
> sync, but migrate_count hasn't changed.
> 
> >  		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
> 
> (Reading cpuid here is useless.)
> 
> >  	} while (unlikely(cpu != cpu1 ||
> >  			  (pvti->pvti.version & 1) ||
> > -			  pvti->pvti.version != version));
> > +			  pvti->pvti.version != version ||
> > +			  pvti->migrate_count != migrate_count));
> 
> We can workaround the bug with,
> 
>   	cpu = __getcpu() & VGETCPU_CPU_MASK;
>   	pvti = get_pvti(cpu);
>   	migrate_count = pvti->migrate_count;
>   	if (cpu != (__getcpu() & VGETCPU_CPU_MASK))
>   		continue;

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 22:22   ` Marcelo Tosatti
@ 2015-03-26 22:24     ` Andy Lutomirski
  2015-03-26 22:40       ` Marcelo Tosatti
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 22:24 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Radim Krčmář, kvm-devel, stable, Paolo Bonzini

On Thu, Mar 26, 2015 at 3:22 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Mar 26, 2015 at 09:59:24PM +0100, Radim Krčmář wrote:
>> 2015-03-23 20:21-0300, Marcelo Tosatti:
>> >
>> > The following point:
>> >
>> >     2. per-CPU pvclock time info is updated if the
>> >        underlying CPU changes.
>> >
>> > Is not true anymore since "KVM: x86: update pvclock area conditionally,
>> > on cpu migration".
>> >
>> > Add task migration notification back.
>> >
>> > Problem noticed by Andy Lutomirski.
>> >
>> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>> > CC: stable@kernel.org # 3.11+
>>
>> Revert contains a bug that got pointed out in the discussion:
>>
>> > diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
>> >     do {
>> >             cpu = __getcpu() & VGETCPU_CPU_MASK;
>> >
>> >             pvti = get_pvti(cpu);
>>
>> We can migrate to 'other cpu' here.
>>
>> > +           migrate_count = pvti->migrate_count;
>> > +
>> >             version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
>>
>> And migrate back to 'cpu' here.
>
> Migrating back will increase pvti->migrate_count, right ?

I thought it only increased the count when we migrated away.

--Andy

>
>> rdtsc was executed on different cpu, so pvti and tsc might not be in
>> sync, but migrate_count hasn't changed.
>>
>> >             cpu1 = __getcpu() & VGETCPU_CPU_MASK;
>>
>> (Reading cpuid here is useless.)
>>
>> >     } while (unlikely(cpu != cpu1 ||
>> >                       (pvti->pvti.version & 1) ||
>> > -                     pvti->pvti.version != version));
>> > +                     pvti->pvti.version != version ||
>> > +                     pvti->migrate_count != migrate_count));
>>
>> We can workaround the bug with,
>>
>>       cpu = __getcpu() & VGETCPU_CPU_MASK;
>>       pvti = get_pvti(cpu);
>>       migrate_count = pvti->migrate_count;
>>       if (cpu != (__getcpu() & VGETCPU_CPU_MASK))
>>               continue;



-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 22:24     ` Andy Lutomirski
@ 2015-03-26 22:40       ` Marcelo Tosatti
  0 siblings, 0 replies; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-26 22:40 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Radim Krčmář, kvm-devel, stable, Paolo Bonzini

On Thu, Mar 26, 2015 at 03:24:10PM -0700, Andy Lutomirski wrote:
> On Thu, Mar 26, 2015 at 3:22 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Thu, Mar 26, 2015 at 09:59:24PM +0100, Radim Krčmář wrote:
> >> 2015-03-23 20:21-0300, Marcelo Tosatti:
> >> >
> >> > The following point:
> >> >
> >> >     2. per-CPU pvclock time info is updated if the
> >> >        underlying CPU changes.
> >> >
> >> > Is not true anymore since "KVM: x86: update pvclock area conditionally,
> >> > on cpu migration".
> >> >
> >> > Add task migration notification back.
> >> >
> >> > Problem noticed by Andy Lutomirski.
> >> >
> >> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >> > CC: stable@kernel.org # 3.11+
> >>
> >> Revert contains a bug that got pointed out in the discussion:
> >>
> >> > diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
> >> >     do {
> >> >             cpu = __getcpu() & VGETCPU_CPU_MASK;
> >> >
> >> >             pvti = get_pvti(cpu);
> >>
> >> We can migrate to 'other cpu' here.
> >>
> >> > +           migrate_count = pvti->migrate_count;
> >> > +
> >> >             version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
> >>
> >> And migrate back to 'cpu' here.
> >
> > Migrating back will increase pvti->migrate_count, right ?
> 
> I thought it only increased the count when we migrated away.

Right.

> --Andy
> 
> >
> >> rdtsc was executed on different cpu, so pvti and tsc might not be in
> >> sync, but migrate_count hasn't changed.
> >>
> >> >             cpu1 = __getcpu() & VGETCPU_CPU_MASK;
> >>
> >> (Reading cpuid here is useless.)
> >>
> >> >     } while (unlikely(cpu != cpu1 ||
> >> >                       (pvti->pvti.version & 1) ||
> >> > -                     pvti->pvti.version != version));
> >> > +                     pvti->pvti.version != version ||
> >> > +                     pvti->migrate_count != migrate_count));
> >>
> >> We can workaround the bug with,
> >>
> >>       cpu = __getcpu() & VGETCPU_CPU_MASK;
> >>       pvti = get_pvti(cpu);
> >>       migrate_count = pvti->migrate_count;
> >>       if (cpu != (__getcpu() & VGETCPU_CPU_MASK))
> >>               continue;

Looks good, please submit a fix.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 20:58                           ` Andy Lutomirski
  2015-03-26 22:22                             ` Andy Lutomirski
@ 2015-03-26 22:56                             ` Marcelo Tosatti
  2015-03-26 23:09                               ` Andy Lutomirski
  1 sibling, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-26 22:56 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Radim Krcmar, kvm list, stable, Paolo Bonzini, Rik van Riel

On Thu, Mar 26, 2015 at 01:58:25PM -0700, Andy Lutomirski wrote:
> On Thu, Mar 26, 2015 at 1:31 PM, Radim Krcmar <rkrcmar@redhat.com> wrote:
> > 2015-03-26 11:51-0700, Andy Lutomirski:
> >> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
> >> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
> >> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
> >> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
> >> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
> >> >> They'll disagree on the time, and one of them will be ahead until vcpu
> >> >> 1's pvti gets updated.
> >> >
> >> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
> >> > to be visible at one time, for all vcpus.
> >> >
> >> >
> >> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
> >> >  * not
> >> >  * always the case (the difference between two distinct xtime instances
> >> >  * might be smaller then the difference between corresponding TSC reads,
> >> >  * when updating guest vcpus pvclock areas).
> >> >  *
> >> >  * To avoid that problem, do not allow visibility of distinct
> >> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
> >> >  * copy of host monotonic time values. Update that master copy
> >> >  * in lockstep.
> >>
> >> Yuck.  So we have per cpu timing data, but the protocol is only usable
> >> for monotonic timing because we forcibly freeze all vcpus when we
> >> update the nominally per cpu data.
> >>
> >> The obvious guest implementations are still unnecessarily slow,
> >> though.  It would be nice if the guest could get away without using
> >> any getcpu operation at all.
> >>
> >> Even if we fixed the host to increment version as advertised, I think
> >> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
> >> which pvti to look at,
> >
> > Yes.
> >
> >>                        and we need another to make sure that we were
> >> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
> >> -- we need to check version before rdtsc, and we don't know what
> >> version to check until we do a getcpu.).
> >
> > Exactly, reading cpuid after rdtsc doesn't do that though, we could have
> > migrated back between those reads.
> > rtdscp would allow us to check that we read tsc of pvti's cpu.
> > (It doesn't get rid of that first read.)
> >
> >>                                          The migration hook has the
> >> same issue -- we need to check the migration count, then confirm we're
> >> on that cpu, then check the migration count again, and we can't do
> >> that until we know what cpu we're on.
> >
> > True;  the revert has a bug -- we need to check cpuid for the second
> > time before rdtsc.  (Migration hook is there just because we don't know
> > which cpu executed rdtsc.)
> 
> One way or another, I'm planning on completely rewriting the vdso
> code.  An early draft is here:
> 
> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=57ace6e6e032afc4faf7b9ec52f78a8e6642c980
> 
> but I can't finish it until the KVM side shakes out.
> 
> I think there are at least two ways that would work:
> 
> a) If KVM incremented version as advertised:

All for it.

> cpu = getcpu();
> pvti = pvti for cpu;
> 
> ver1 = pvti->version;
> check stable bit;
> rdtsc_barrier, rdtsc, read scale, shift, etc.
> if (getcpu() != cpu) retry;
> if (pvti->version != ver1) retry;
> 
> I think this is safe because, we're guaranteed that there was an
> interval (between the two version reads) in which the vcpu we think
> we're on was running and the kvmclock data was valid and marked
> stable, and we know that the tsc we read came from that interval.
> 
> Note: rdtscp isn't needed. If we're stable, is makes no difference
> which cpu's tsc we actually read.

Yes, can't see a problem with that.

> b) If version remains buggy but we use this migrations_from hack:

There is no reason for version to remain buggy.

> cpu = getcpu();
> pvti = pvti for cpu;
> m1 = pvti->migrations_from;
> barrier();
> 
> ver1 = pvti->version;
> check stable bit;
> rdtsc_barrier, rdtsc, read scale, shift, etc.
> if (getcpu() != cpu) retry;
> if (pvti->version != ver1) retry;  /* probably not really needed */
> 
> barrier();
> if (pvti->migrations_from != m1) retry;
> 
> This is just like (a), except that we're using a guest kernel hack to
> ensure that no one migrated off the vcpu during the version-protected
> critical section and that we were, in fact, on that vcpu at some point
> during that critical section.  Once we've ensured that we were on
> pvti's associated vcpu for the entire time we were reading it, then we
> are protected by the existing versioning in the host.
> 
> >
> >> If, on the other hand, we could rely on having all of these things in
> >> sync, then this complication goes away, and we go down from two getcpu
> >> ops to zero.
> >
> > (Yeah, we should look what are the drawbacks of doing it differently.)
> 
> If the versioning were fixed, I think we could almost get away with:
> 
> pvti = pvti for vcpu 0;
> 
> ver1 = pvti->version;
> check stable bit;
> rdtsc_barrier, rdtsc, read scale, shift, etc.
> if (pvti->version != ver1) retry;
> 
> This guarantees that the tsc came from an interval in which vcpu0's
> kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
> stable in that interval, then we'd be fine, but there's a race window
> in which the kvmclock is *not* stable and vcpu 0 wasn't running.

What is that window again ? Have no objections against using vcpu0's
pvti (cacheline should be read-only 99.9% of time).

> Why doesn't KVM just update all of the kvmclock data at once?  

Because it has not been necessary -- updating kvmclock data on vcpu 
entry was the previous method, so that was reused.

> (For
> that matter, why is the pvti in guest memory at all?  Wouldn't this
> all be simpler if the kvmclock data were host-allocated so the host
> could write it directly and maybe even share it between guests?)

And use a 4K TLB entry for that kvmclock area rather than 
sharing one of kernel's 2MB (or 1GB) TLB entry?

> 
> --Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 22:56                             ` Marcelo Tosatti
@ 2015-03-26 23:09                               ` Andy Lutomirski
  2015-03-26 23:22                                 ` Marcelo Tosatti
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 23:09 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Radim Krcmar, kvm list, stable, Paolo Bonzini, Rik van Riel

On Thu, Mar 26, 2015 at 3:56 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Mar 26, 2015 at 01:58:25PM -0700, Andy Lutomirski wrote:
>> On Thu, Mar 26, 2015 at 1:31 PM, Radim Krcmar <rkrcmar@redhat.com> wrote:
>> > 2015-03-26 11:51-0700, Andy Lutomirski:
>> >> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
>> >> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
>> >> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
>> >> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
>> >> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
>> >> >> They'll disagree on the time, and one of them will be ahead until vcpu
>> >> >> 1's pvti gets updated.
>> >> >
>> >> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
>> >> > to be visible at one time, for all vcpus.
>> >> >
>> >> >
>> >> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
>> >> >  * not
>> >> >  * always the case (the difference between two distinct xtime instances
>> >> >  * might be smaller then the difference between corresponding TSC reads,
>> >> >  * when updating guest vcpus pvclock areas).
>> >> >  *
>> >> >  * To avoid that problem, do not allow visibility of distinct
>> >> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
>> >> >  * copy of host monotonic time values. Update that master copy
>> >> >  * in lockstep.
>> >>
>> >> Yuck.  So we have per cpu timing data, but the protocol is only usable
>> >> for monotonic timing because we forcibly freeze all vcpus when we
>> >> update the nominally per cpu data.
>> >>
>> >> The obvious guest implementations are still unnecessarily slow,
>> >> though.  It would be nice if the guest could get away without using
>> >> any getcpu operation at all.
>> >>
>> >> Even if we fixed the host to increment version as advertised, I think
>> >> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
>> >> which pvti to look at,
>> >
>> > Yes.
>> >
>> >>                        and we need another to make sure that we were
>> >> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
>> >> -- we need to check version before rdtsc, and we don't know what
>> >> version to check until we do a getcpu.).
>> >
>> > Exactly, reading cpuid after rdtsc doesn't do that though, we could have
>> > migrated back between those reads.
>> > rtdscp would allow us to check that we read tsc of pvti's cpu.
>> > (It doesn't get rid of that first read.)
>> >
>> >>                                          The migration hook has the
>> >> same issue -- we need to check the migration count, then confirm we're
>> >> on that cpu, then check the migration count again, and we can't do
>> >> that until we know what cpu we're on.
>> >
>> > True;  the revert has a bug -- we need to check cpuid for the second
>> > time before rdtsc.  (Migration hook is there just because we don't know
>> > which cpu executed rdtsc.)
>>
>> One way or another, I'm planning on completely rewriting the vdso
>> code.  An early draft is here:
>>
>> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=57ace6e6e032afc4faf7b9ec52f78a8e6642c980
>>
>> but I can't finish it until the KVM side shakes out.
>>
>> I think there are at least two ways that would work:
>>
>> a) If KVM incremented version as advertised:
>
> All for it.
>
>> cpu = getcpu();
>> pvti = pvti for cpu;
>>
>> ver1 = pvti->version;
>> check stable bit;
>> rdtsc_barrier, rdtsc, read scale, shift, etc.
>> if (getcpu() != cpu) retry;
>> if (pvti->version != ver1) retry;
>>
>> I think this is safe because, we're guaranteed that there was an
>> interval (between the two version reads) in which the vcpu we think
>> we're on was running and the kvmclock data was valid and marked
>> stable, and we know that the tsc we read came from that interval.
>>
>> Note: rdtscp isn't needed. If we're stable, is makes no difference
>> which cpu's tsc we actually read.
>
> Yes, can't see a problem with that.
>
>> b) If version remains buggy but we use this migrations_from hack:
>
> There is no reason for version to remain buggy.
>
>> cpu = getcpu();
>> pvti = pvti for cpu;
>> m1 = pvti->migrations_from;
>> barrier();
>>
>> ver1 = pvti->version;
>> check stable bit;
>> rdtsc_barrier, rdtsc, read scale, shift, etc.
>> if (getcpu() != cpu) retry;
>> if (pvti->version != ver1) retry;  /* probably not really needed */
>>
>> barrier();
>> if (pvti->migrations_from != m1) retry;
>>
>> This is just like (a), except that we're using a guest kernel hack to
>> ensure that no one migrated off the vcpu during the version-protected
>> critical section and that we were, in fact, on that vcpu at some point
>> during that critical section.  Once we've ensured that we were on
>> pvti's associated vcpu for the entire time we were reading it, then we
>> are protected by the existing versioning in the host.
>>
>> >
>> >> If, on the other hand, we could rely on having all of these things in
>> >> sync, then this complication goes away, and we go down from two getcpu
>> >> ops to zero.
>> >
>> > (Yeah, we should look what are the drawbacks of doing it differently.)
>>
>> If the versioning were fixed, I think we could almost get away with:
>>
>> pvti = pvti for vcpu 0;
>>
>> ver1 = pvti->version;
>> check stable bit;
>> rdtsc_barrier, rdtsc, read scale, shift, etc.
>> if (pvti->version != ver1) retry;
>>
>> This guarantees that the tsc came from an interval in which vcpu0's
>> kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
>> stable in that interval, then we'd be fine, but there's a race window
>> in which the kvmclock is *not* stable and vcpu 0 wasn't running.
>
> What is that window again ? Have no objections against using vcpu0's
> pvti (cacheline should be read-only 99.9% of time).

This is based on my (mis?)understanding of the code.  Here goes.

Suppose we transition from stable to unstable.  The host freezes all
vcpus and set a flag for each vcpu that the kvmclock data needs
updating.  There could then be a window in which vcpu 1 runs vdso code
and vcpu 0 hasn't updated its kvmclock data yet.

I don't know whether this is actually possible.  Rik suspects it isn't.

>
>> Why doesn't KVM just update all of the kvmclock data at once?
>
> Because it has not been necessary -- updating kvmclock data on vcpu
> entry was the previous method, so that was reused.
>
>> (For
>> that matter, why is the pvti in guest memory at all?  Wouldn't this
>> all be simpler if the kvmclock data were host-allocated so the host
>> could write it directly and maybe even share it between guests?)
>
> And use a 4K TLB entry for that kvmclock area rather than
> sharing one of kernel's 2MB (or 1GB) TLB entry?

Exactly.  I'd also move it into the vvar area instead of using the
fixmap so 32-bit userspace could use it.

I'm more than happy to handle the vdso side of all of this, but I'd
like the host code to settle down first.  I'm also not quite sure
whether it's okay to cause the vdso timing code to regress on old
hosts with new gusts.

--Andy

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 23:09                               ` Andy Lutomirski
@ 2015-03-26 23:22                                 ` Marcelo Tosatti
  2015-03-26 23:28                                   ` Andy Lutomirski
  0 siblings, 1 reply; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-26 23:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Radim Krcmar, kvm list, stable, Paolo Bonzini, Rik van Riel

On Thu, Mar 26, 2015 at 04:09:53PM -0700, Andy Lutomirski wrote:
> On Thu, Mar 26, 2015 at 3:56 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Thu, Mar 26, 2015 at 01:58:25PM -0700, Andy Lutomirski wrote:
> >> On Thu, Mar 26, 2015 at 1:31 PM, Radim Krcmar <rkrcmar@redhat.com> wrote:
> >> > 2015-03-26 11:51-0700, Andy Lutomirski:
> >> >> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> >> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
> >> >> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
> >> >> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
> >> >> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
> >> >> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
> >> >> >> They'll disagree on the time, and one of them will be ahead until vcpu
> >> >> >> 1's pvti gets updated.
> >> >> >
> >> >> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
> >> >> > to be visible at one time, for all vcpus.
> >> >> >
> >> >> >
> >> >> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
> >> >> >  * not
> >> >> >  * always the case (the difference between two distinct xtime instances
> >> >> >  * might be smaller then the difference between corresponding TSC reads,
> >> >> >  * when updating guest vcpus pvclock areas).
> >> >> >  *
> >> >> >  * To avoid that problem, do not allow visibility of distinct
> >> >> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
> >> >> >  * copy of host monotonic time values. Update that master copy
> >> >> >  * in lockstep.
> >> >>
> >> >> Yuck.  So we have per cpu timing data, but the protocol is only usable
> >> >> for monotonic timing because we forcibly freeze all vcpus when we
> >> >> update the nominally per cpu data.
> >> >>
> >> >> The obvious guest implementations are still unnecessarily slow,
> >> >> though.  It would be nice if the guest could get away without using
> >> >> any getcpu operation at all.
> >> >>
> >> >> Even if we fixed the host to increment version as advertised, I think
> >> >> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
> >> >> which pvti to look at,
> >> >
> >> > Yes.
> >> >
> >> >>                        and we need another to make sure that we were
> >> >> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
> >> >> -- we need to check version before rdtsc, and we don't know what
> >> >> version to check until we do a getcpu.).
> >> >
> >> > Exactly, reading cpuid after rdtsc doesn't do that though, we could have
> >> > migrated back between those reads.
> >> > rtdscp would allow us to check that we read tsc of pvti's cpu.
> >> > (It doesn't get rid of that first read.)
> >> >
> >> >>                                          The migration hook has the
> >> >> same issue -- we need to check the migration count, then confirm we're
> >> >> on that cpu, then check the migration count again, and we can't do
> >> >> that until we know what cpu we're on.
> >> >
> >> > True;  the revert has a bug -- we need to check cpuid for the second
> >> > time before rdtsc.  (Migration hook is there just because we don't know
> >> > which cpu executed rdtsc.)
> >>
> >> One way or another, I'm planning on completely rewriting the vdso
> >> code.  An early draft is here:
> >>
> >> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=57ace6e6e032afc4faf7b9ec52f78a8e6642c980
> >>
> >> but I can't finish it until the KVM side shakes out.
> >>
> >> I think there are at least two ways that would work:
> >>
> >> a) If KVM incremented version as advertised:
> >
> > All for it.
> >
> >> cpu = getcpu();
> >> pvti = pvti for cpu;
> >>
> >> ver1 = pvti->version;
> >> check stable bit;
> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
> >> if (getcpu() != cpu) retry;
> >> if (pvti->version != ver1) retry;
> >>
> >> I think this is safe because, we're guaranteed that there was an
> >> interval (between the two version reads) in which the vcpu we think
> >> we're on was running and the kvmclock data was valid and marked
> >> stable, and we know that the tsc we read came from that interval.
> >>
> >> Note: rdtscp isn't needed. If we're stable, is makes no difference
> >> which cpu's tsc we actually read.
> >
> > Yes, can't see a problem with that.
> >
> >> b) If version remains buggy but we use this migrations_from hack:
> >
> > There is no reason for version to remain buggy.
> >
> >> cpu = getcpu();
> >> pvti = pvti for cpu;
> >> m1 = pvti->migrations_from;
> >> barrier();
> >>
> >> ver1 = pvti->version;
> >> check stable bit;
> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
> >> if (getcpu() != cpu) retry;
> >> if (pvti->version != ver1) retry;  /* probably not really needed */
> >>
> >> barrier();
> >> if (pvti->migrations_from != m1) retry;
> >>
> >> This is just like (a), except that we're using a guest kernel hack to
> >> ensure that no one migrated off the vcpu during the version-protected
> >> critical section and that we were, in fact, on that vcpu at some point
> >> during that critical section.  Once we've ensured that we were on
> >> pvti's associated vcpu for the entire time we were reading it, then we
> >> are protected by the existing versioning in the host.
> >>
> >> >
> >> >> If, on the other hand, we could rely on having all of these things in
> >> >> sync, then this complication goes away, and we go down from two getcpu
> >> >> ops to zero.
> >> >
> >> > (Yeah, we should look what are the drawbacks of doing it differently.)
> >>
> >> If the versioning were fixed, I think we could almost get away with:
> >>
> >> pvti = pvti for vcpu 0;
> >>
> >> ver1 = pvti->version;
> >> check stable bit;
> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
> >> if (pvti->version != ver1) retry;
> >>
> >> This guarantees that the tsc came from an interval in which vcpu0's
> >> kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
> >> stable in that interval, then we'd be fine, but there's a race window
> >> in which the kvmclock is *not* stable and vcpu 0 wasn't running.
> >
> > What is that window again ? Have no objections against using vcpu0's
> > pvti (cacheline should be read-only 99.9% of time).
> 
> This is based on my (mis?)understanding of the code.  Here goes.
> 
> Suppose we transition from stable to unstable.  The host freezes all
> vcpus and set a flag for each vcpu that the kvmclock data needs
> updating.  There could then be a window in which vcpu 1 runs vdso code
> and vcpu 0 hasn't updated its kvmclock data yet.
> 
> I don't know whether this is actually possible.  Rik suspects it isn't.

I don't see why its not possible. We can force vcpu0's kvmclock to be 
updated.

Do you have an estimation of the performance gain of using vcpu0 pvti?

> >> Why doesn't KVM just update all of the kvmclock data at once?
> >
> > Because it has not been necessary -- updating kvmclock data on vcpu
> > entry was the previous method, so that was reused.
> >
> >> (For
> >> that matter, why is the pvti in guest memory at all?  Wouldn't this
> >> all be simpler if the kvmclock data were host-allocated so the host
> >> could write it directly and maybe even share it between guests?)
> >
> > And use a 4K TLB entry for that kvmclock area rather than
> > sharing one of kernel's 2MB (or 1GB) TLB entry?
> 
> Exactly.  I'd also move it into the vvar area instead of using the
> fixmap so 32-bit userspace could use it.

Thats an obvious downside (an additional entry in the TLB occupied just
for the kvmclock area?).

> I'm more than happy to handle the vdso side of all of this, but I'd
> like the host code to settle down first. 
> I'm also not quite sure whether it's okay to cause the vdso timing
> code to regress on old hosts with new gusts.

Must be backwards compatible.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 23:22                                 ` Marcelo Tosatti
@ 2015-03-26 23:28                                   ` Andy Lutomirski
  2015-03-26 23:38                                     ` Marcelo Tosatti
  0 siblings, 1 reply; 32+ messages in thread
From: Andy Lutomirski @ 2015-03-26 23:28 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Radim Krcmar, kvm list, stable, Paolo Bonzini, Rik van Riel

On Thu, Mar 26, 2015 at 4:22 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Mar 26, 2015 at 04:09:53PM -0700, Andy Lutomirski wrote:
>> On Thu, Mar 26, 2015 at 3:56 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Thu, Mar 26, 2015 at 01:58:25PM -0700, Andy Lutomirski wrote:
>> >> On Thu, Mar 26, 2015 at 1:31 PM, Radim Krcmar <rkrcmar@redhat.com> wrote:
>> >> > 2015-03-26 11:51-0700, Andy Lutomirski:
>> >> >> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> >> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
>> >> >> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
>> >> >> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
>> >> >> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
>> >> >> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
>> >> >> >> They'll disagree on the time, and one of them will be ahead until vcpu
>> >> >> >> 1's pvti gets updated.
>> >> >> >
>> >> >> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
>> >> >> > to be visible at one time, for all vcpus.
>> >> >> >
>> >> >> >
>> >> >> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
>> >> >> >  * not
>> >> >> >  * always the case (the difference between two distinct xtime instances
>> >> >> >  * might be smaller then the difference between corresponding TSC reads,
>> >> >> >  * when updating guest vcpus pvclock areas).
>> >> >> >  *
>> >> >> >  * To avoid that problem, do not allow visibility of distinct
>> >> >> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
>> >> >> >  * copy of host monotonic time values. Update that master copy
>> >> >> >  * in lockstep.
>> >> >>
>> >> >> Yuck.  So we have per cpu timing data, but the protocol is only usable
>> >> >> for monotonic timing because we forcibly freeze all vcpus when we
>> >> >> update the nominally per cpu data.
>> >> >>
>> >> >> The obvious guest implementations are still unnecessarily slow,
>> >> >> though.  It would be nice if the guest could get away without using
>> >> >> any getcpu operation at all.
>> >> >>
>> >> >> Even if we fixed the host to increment version as advertised, I think
>> >> >> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
>> >> >> which pvti to look at,
>> >> >
>> >> > Yes.
>> >> >
>> >> >>                        and we need another to make sure that we were
>> >> >> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
>> >> >> -- we need to check version before rdtsc, and we don't know what
>> >> >> version to check until we do a getcpu.).
>> >> >
>> >> > Exactly, reading cpuid after rdtsc doesn't do that though, we could have
>> >> > migrated back between those reads.
>> >> > rtdscp would allow us to check that we read tsc of pvti's cpu.
>> >> > (It doesn't get rid of that first read.)
>> >> >
>> >> >>                                          The migration hook has the
>> >> >> same issue -- we need to check the migration count, then confirm we're
>> >> >> on that cpu, then check the migration count again, and we can't do
>> >> >> that until we know what cpu we're on.
>> >> >
>> >> > True;  the revert has a bug -- we need to check cpuid for the second
>> >> > time before rdtsc.  (Migration hook is there just because we don't know
>> >> > which cpu executed rdtsc.)
>> >>
>> >> One way or another, I'm planning on completely rewriting the vdso
>> >> code.  An early draft is here:
>> >>
>> >> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=57ace6e6e032afc4faf7b9ec52f78a8e6642c980
>> >>
>> >> but I can't finish it until the KVM side shakes out.
>> >>
>> >> I think there are at least two ways that would work:
>> >>
>> >> a) If KVM incremented version as advertised:
>> >
>> > All for it.
>> >
>> >> cpu = getcpu();
>> >> pvti = pvti for cpu;
>> >>
>> >> ver1 = pvti->version;
>> >> check stable bit;
>> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
>> >> if (getcpu() != cpu) retry;
>> >> if (pvti->version != ver1) retry;
>> >>
>> >> I think this is safe because, we're guaranteed that there was an
>> >> interval (between the two version reads) in which the vcpu we think
>> >> we're on was running and the kvmclock data was valid and marked
>> >> stable, and we know that the tsc we read came from that interval.
>> >>
>> >> Note: rdtscp isn't needed. If we're stable, is makes no difference
>> >> which cpu's tsc we actually read.
>> >
>> > Yes, can't see a problem with that.
>> >
>> >> b) If version remains buggy but we use this migrations_from hack:
>> >
>> > There is no reason for version to remain buggy.
>> >
>> >> cpu = getcpu();
>> >> pvti = pvti for cpu;
>> >> m1 = pvti->migrations_from;
>> >> barrier();
>> >>
>> >> ver1 = pvti->version;
>> >> check stable bit;
>> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
>> >> if (getcpu() != cpu) retry;
>> >> if (pvti->version != ver1) retry;  /* probably not really needed */
>> >>
>> >> barrier();
>> >> if (pvti->migrations_from != m1) retry;
>> >>
>> >> This is just like (a), except that we're using a guest kernel hack to
>> >> ensure that no one migrated off the vcpu during the version-protected
>> >> critical section and that we were, in fact, on that vcpu at some point
>> >> during that critical section.  Once we've ensured that we were on
>> >> pvti's associated vcpu for the entire time we were reading it, then we
>> >> are protected by the existing versioning in the host.
>> >>
>> >> >
>> >> >> If, on the other hand, we could rely on having all of these things in
>> >> >> sync, then this complication goes away, and we go down from two getcpu
>> >> >> ops to zero.
>> >> >
>> >> > (Yeah, we should look what are the drawbacks of doing it differently.)
>> >>
>> >> If the versioning were fixed, I think we could almost get away with:
>> >>
>> >> pvti = pvti for vcpu 0;
>> >>
>> >> ver1 = pvti->version;
>> >> check stable bit;
>> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
>> >> if (pvti->version != ver1) retry;
>> >>
>> >> This guarantees that the tsc came from an interval in which vcpu0's
>> >> kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
>> >> stable in that interval, then we'd be fine, but there's a race window
>> >> in which the kvmclock is *not* stable and vcpu 0 wasn't running.
>> >
>> > What is that window again ? Have no objections against using vcpu0's
>> > pvti (cacheline should be read-only 99.9% of time).
>>
>> This is based on my (mis?)understanding of the code.  Here goes.
>>
>> Suppose we transition from stable to unstable.  The host freezes all
>> vcpus and set a flag for each vcpu that the kvmclock data needs
>> updating.  There could then be a window in which vcpu 1 runs vdso code
>> and vcpu 0 hasn't updated its kvmclock data yet.
>>
>> I don't know whether this is actually possible.  Rik suspects it isn't.
>
> I don't see why its not possible. We can force vcpu0's kvmclock to be
> updated.
>
> Do you have an estimation of the performance gain of using vcpu0 pvti?

rdtscp is approximately 3 cycles worse than rdtsc_barrier(); rdtsc()
on my machine, but that doesn't help much, since the current scheme
requires that we get the cpu number before reading the time.  The
fastest way I know of to get the cpu number is 25-ish cycles, so this
ends up being a large fraction of the overall cost.

>
>> >> Why doesn't KVM just update all of the kvmclock data at once?
>> >
>> > Because it has not been necessary -- updating kvmclock data on vcpu
>> > entry was the previous method, so that was reused.
>> >
>> >> (For
>> >> that matter, why is the pvti in guest memory at all?  Wouldn't this
>> >> all be simpler if the kvmclock data were host-allocated so the host
>> >> could write it directly and maybe even share it between guests?)
>> >
>> > And use a 4K TLB entry for that kvmclock area rather than
>> > sharing one of kernel's 2MB (or 1GB) TLB entry?
>>
>> Exactly.  I'd also move it into the vvar area instead of using the
>> fixmap so 32-bit userspace could use it.
>
> Thats an obvious downside (an additional entry in the TLB occupied just
> for the kvmclock area?).

True.  But TLB misses are actually pretty cheap on new hardware, and
it's still only one TLB entry per process.

>
>> I'm more than happy to handle the vdso side of all of this, but I'd
>> like the host code to settle down first.
>> I'm also not quite sure whether it's okay to cause the vdso timing
>> code to regress on old hosts with new gusts.
>
> Must be backwards compatible.
>

Could we add a new feature bit that indicates that the host is
updated?  Then we could use the new code on new hosts and the old code
on old hosts and eventually deprecate accelerated vdso timing on old
hosts.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"
  2015-03-26 23:28                                   ` Andy Lutomirski
@ 2015-03-26 23:38                                     ` Marcelo Tosatti
  0 siblings, 0 replies; 32+ messages in thread
From: Marcelo Tosatti @ 2015-03-26 23:38 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Radim Krcmar, kvm list, stable, Paolo Bonzini, Rik van Riel

On Thu, Mar 26, 2015 at 04:28:37PM -0700, Andy Lutomirski wrote:
> On Thu, Mar 26, 2015 at 4:22 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Thu, Mar 26, 2015 at 04:09:53PM -0700, Andy Lutomirski wrote:
> >> On Thu, Mar 26, 2015 at 3:56 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> > On Thu, Mar 26, 2015 at 01:58:25PM -0700, Andy Lutomirski wrote:
> >> >> On Thu, Mar 26, 2015 at 1:31 PM, Radim Krcmar <rkrcmar@redhat.com> wrote:
> >> >> > 2015-03-26 11:51-0700, Andy Lutomirski:
> >> >> >> On Thu, Mar 26, 2015 at 4:29 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> >> >> > On Wed, Mar 25, 2015 at 04:22:03PM -0700, Andy Lutomirski wrote:
> >> >> >> >> Suppose we start out with all vcpus agreeing on their pvti and perfect
> >> >> >> >> invariant TSCs.  Now the host updates its frequency (due to NTP or
> >> >> >> >> whatever).  KVM updates vcpu 0's pvti.  Before KVM updates vcpu 1's
> >> >> >> >> pvti, guest code on vcpus 0 and 1 see synced TSCs but different pvti.
> >> >> >> >> They'll disagree on the time, and one of them will be ahead until vcpu
> >> >> >> >> 1's pvti gets updated.
> >> >> >> >
> >> >> >> > The masterclock scheme enforces the same system_timestamp/tsc_timestamp pairs
> >> >> >> > to be visible at one time, for all vcpus.
> >> >> >> >
> >> >> >> >
> >> >> >> >  * That is, when timespec0 != timespec1, M < N. Unfortunately that is
> >> >> >> >  * not
> >> >> >> >  * always the case (the difference between two distinct xtime instances
> >> >> >> >  * might be smaller then the difference between corresponding TSC reads,
> >> >> >> >  * when updating guest vcpus pvclock areas).
> >> >> >> >  *
> >> >> >> >  * To avoid that problem, do not allow visibility of distinct
> >> >> >> >  * system_timestamp/tsc_timestamp values simultaneously: use a master
> >> >> >> >  * copy of host monotonic time values. Update that master copy
> >> >> >> >  * in lockstep.
> >> >> >>
> >> >> >> Yuck.  So we have per cpu timing data, but the protocol is only usable
> >> >> >> for monotonic timing because we forcibly freeze all vcpus when we
> >> >> >> update the nominally per cpu data.
> >> >> >>
> >> >> >> The obvious guest implementations are still unnecessarily slow,
> >> >> >> though.  It would be nice if the guest could get away without using
> >> >> >> any getcpu operation at all.
> >> >> >>
> >> >> >> Even if we fixed the host to increment version as advertised, I think
> >> >> >> we can't avoid two getcpu ops.  We need one before rdtsc to figure out
> >> >> >> which pvti to look at,
> >> >> >
> >> >> > Yes.
> >> >> >
> >> >> >>                        and we need another to make sure that we were
> >> >> >> actually on that cpu at the time we did rdtsc.  (Rdtscp doesn't help
> >> >> >> -- we need to check version before rdtsc, and we don't know what
> >> >> >> version to check until we do a getcpu.).
> >> >> >
> >> >> > Exactly, reading cpuid after rdtsc doesn't do that though, we could have
> >> >> > migrated back between those reads.
> >> >> > rtdscp would allow us to check that we read tsc of pvti's cpu.
> >> >> > (It doesn't get rid of that first read.)
> >> >> >
> >> >> >>                                          The migration hook has the
> >> >> >> same issue -- we need to check the migration count, then confirm we're
> >> >> >> on that cpu, then check the migration count again, and we can't do
> >> >> >> that until we know what cpu we're on.
> >> >> >
> >> >> > True;  the revert has a bug -- we need to check cpuid for the second
> >> >> > time before rdtsc.  (Migration hook is there just because we don't know
> >> >> > which cpu executed rdtsc.)
> >> >>
> >> >> One way or another, I'm planning on completely rewriting the vdso
> >> >> code.  An early draft is here:
> >> >>
> >> >> https://git.kernel.org/cgit/linux/kernel/git/luto/linux.git/commit/?h=x86/vdso&id=57ace6e6e032afc4faf7b9ec52f78a8e6642c980
> >> >>
> >> >> but I can't finish it until the KVM side shakes out.
> >> >>
> >> >> I think there are at least two ways that would work:
> >> >>
> >> >> a) If KVM incremented version as advertised:
> >> >
> >> > All for it.
> >> >
> >> >> cpu = getcpu();
> >> >> pvti = pvti for cpu;
> >> >>
> >> >> ver1 = pvti->version;
> >> >> check stable bit;
> >> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
> >> >> if (getcpu() != cpu) retry;
> >> >> if (pvti->version != ver1) retry;
> >> >>
> >> >> I think this is safe because, we're guaranteed that there was an
> >> >> interval (between the two version reads) in which the vcpu we think
> >> >> we're on was running and the kvmclock data was valid and marked
> >> >> stable, and we know that the tsc we read came from that interval.
> >> >>
> >> >> Note: rdtscp isn't needed. If we're stable, is makes no difference
> >> >> which cpu's tsc we actually read.
> >> >
> >> > Yes, can't see a problem with that.
> >> >
> >> >> b) If version remains buggy but we use this migrations_from hack:
> >> >
> >> > There is no reason for version to remain buggy.
> >> >
> >> >> cpu = getcpu();
> >> >> pvti = pvti for cpu;
> >> >> m1 = pvti->migrations_from;
> >> >> barrier();
> >> >>
> >> >> ver1 = pvti->version;
> >> >> check stable bit;
> >> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
> >> >> if (getcpu() != cpu) retry;
> >> >> if (pvti->version != ver1) retry;  /* probably not really needed */
> >> >>
> >> >> barrier();
> >> >> if (pvti->migrations_from != m1) retry;
> >> >>
> >> >> This is just like (a), except that we're using a guest kernel hack to
> >> >> ensure that no one migrated off the vcpu during the version-protected
> >> >> critical section and that we were, in fact, on that vcpu at some point
> >> >> during that critical section.  Once we've ensured that we were on
> >> >> pvti's associated vcpu for the entire time we were reading it, then we
> >> >> are protected by the existing versioning in the host.
> >> >>
> >> >> >
> >> >> >> If, on the other hand, we could rely on having all of these things in
> >> >> >> sync, then this complication goes away, and we go down from two getcpu
> >> >> >> ops to zero.
> >> >> >
> >> >> > (Yeah, we should look what are the drawbacks of doing it differently.)
> >> >>
> >> >> If the versioning were fixed, I think we could almost get away with:
> >> >>
> >> >> pvti = pvti for vcpu 0;
> >> >>
> >> >> ver1 = pvti->version;
> >> >> check stable bit;
> >> >> rdtsc_barrier, rdtsc, read scale, shift, etc.
> >> >> if (pvti->version != ver1) retry;
> >> >>
> >> >> This guarantees that the tsc came from an interval in which vcpu0's
> >> >> kvmclock was *marked* stable.  If vcpu0's kvmclock were genuinely
> >> >> stable in that interval, then we'd be fine, but there's a race window
> >> >> in which the kvmclock is *not* stable and vcpu 0 wasn't running.
> >> >
> >> > What is that window again ? Have no objections against using vcpu0's
> >> > pvti (cacheline should be read-only 99.9% of time).
> >>
> >> This is based on my (mis?)understanding of the code.  Here goes.
> >>
> >> Suppose we transition from stable to unstable.  The host freezes all
> >> vcpus and set a flag for each vcpu that the kvmclock data needs
> >> updating.  There could then be a window in which vcpu 1 runs vdso code
> >> and vcpu 0 hasn't updated its kvmclock data yet.
> >>
> >> I don't know whether this is actually possible.  Rik suspects it isn't.
> >
> > I don't see why its not possible. We can force vcpu0's kvmclock to be
> > updated.
> >
> > Do you have an estimation of the performance gain of using vcpu0 pvti?
> 
> rdtscp is approximately 3 cycles worse than rdtsc_barrier(); rdtsc()
> on my machine, but that doesn't help much, since the current scheme
> requires that we get the cpu number before reading the time.  The
> fastest way I know of to get the cpu number is 25-ish cycles, so this
> ends up being a large fraction of the overall cost.

It should be possible to force vcpu0 kvmclock area to be updated before
vm-entry, on the update of any other vcpuN kvmclock area.

        kvm_make_mclock_inprogress_request(kvm);
        /* no guest entries from this point */
        pvclock_update_vm_gtod_copy(kvm);

	<---- HERE ------->

        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

        /* guest entries allowed */
        kvm_for_each_vcpu(i, vcpu, kvm)
                clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);

> >> >> Why doesn't KVM just update all of the kvmclock data at once?
> >> >
> >> > Because it has not been necessary -- updating kvmclock data on vcpu
> >> > entry was the previous method, so that was reused.
> >> >
> >> >> (For
> >> >> that matter, why is the pvti in guest memory at all?  Wouldn't this
> >> >> all be simpler if the kvmclock data were host-allocated so the host
> >> >> could write it directly and maybe even share it between guests?)
> >> >
> >> > And use a 4K TLB entry for that kvmclock area rather than
> >> > sharing one of kernel's 2MB (or 1GB) TLB entry?
> >>
> >> Exactly.  I'd also move it into the vvar area instead of using the
> >> fixmap so 32-bit userspace could use it.
> >
> > Thats an obvious downside (an additional entry in the TLB occupied just
> > for the kvmclock area?).
> 
> True.  But TLB misses are actually pretty cheap on new hardware, and
> it's still only one TLB entry per process.
> >> I'm more than happy to handle the vdso side of all of this, but I'd
> >> like the host code to settle down first.
> >> I'm also not quite sure whether it's okay to cause the vdso timing
> >> code to regress on old hosts with new gusts.
> >
> > Must be backwards compatible.
> >
> 
> Could we add a new feature bit that indicates that the host is
> updated?  Then we could use the new code on new hosts and the old code
> on old hosts and eventually deprecate accelerated vdso timing on old
> hosts.

Makes sense.

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2015-03-26 23:38 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-23 23:21 x86: kvm: Revert "remove sched notifier for cross-cpu migrations" Marcelo Tosatti
2015-03-23 23:30 ` Andy Lutomirski
2015-03-24 15:34 ` Radim Krčmář
2015-03-24 22:33   ` Andy Lutomirski
2015-03-25 11:08     ` Radim Krčmář
2015-03-25 12:52       ` Radim Krčmář
2015-03-25 21:28         ` Marcelo Tosatti
2015-03-25 22:33           ` Andy Lutomirski
2015-03-25 22:41             ` Marcelo Tosatti
2015-03-25 22:48               ` Andy Lutomirski
2015-03-25 23:13                 ` Marcelo Tosatti
2015-03-25 23:22                   ` Andy Lutomirski
2015-03-26 11:29                     ` Marcelo Tosatti
2015-03-26 18:51                       ` Andy Lutomirski
2015-03-26 20:31                         ` Radim Krcmar
2015-03-26 20:58                           ` Andy Lutomirski
2015-03-26 22:22                             ` Andy Lutomirski
2015-03-26 22:56                             ` Marcelo Tosatti
2015-03-26 23:09                               ` Andy Lutomirski
2015-03-26 23:22                                 ` Marcelo Tosatti
2015-03-26 23:28                                   ` Andy Lutomirski
2015-03-26 23:38                                     ` Marcelo Tosatti
2015-03-26 18:47       ` Andy Lutomirski
2015-03-26 20:10         ` Radim Krčmář
2015-03-26 20:52           ` Paolo Bonzini
2015-03-24 22:59   ` Marcelo Tosatti
2015-03-25 11:09     ` Radim Krčmář
2015-03-25 13:06 ` Radim Krčmář
2015-03-26 20:59 ` Radim Krčmář
2015-03-26 22:22   ` Marcelo Tosatti
2015-03-26 22:24     ` Andy Lutomirski
2015-03-26 22:40       ` Marcelo Tosatti

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.