All of lore.kernel.org
 help / color / mirror / Atom feed
* [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04  9:43 ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04  9:43 UTC (permalink / raw)
  To: kvm, qemu-devel
  Cc: Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost


This patch, relative to pre-copy migration codepath,
measures the time between vm_stop() and pre_save(), 
which includes copying the remaining RAM to destination,
and advances the clock by that amount.

In a VM with 5 seconds downtime, this reduces the guest 
clock difference on destination from 5s to 0.2s.

Please do not apply this yet as some codepaths still need
checking, submitting early for comments.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..1bd8fd6 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -22,9 +22,11 @@
 #include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
+#include "migration/migration.h"
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
+#include <time.h>
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
@@ -35,7 +37,11 @@ typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
+    uint64_t ns;
     bool clock_valid;
+
+    uint64_t advance_clock;
+    struct timespec t_aftervmstop;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -100,6 +106,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             s->clock = time_at_migration;
         }
 
+        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
+            s->clock += s->advance_clock;
+            s->advance_clock = 0;
+        }
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -135,6 +146,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             abort();
         }
         s->clock = data.clock;
+        /*
+         * Transition from VM-running to VM-stopped via migration?
+         * Record when the VM was stopped.
+         */
+
+        if (state == RUN_STATE_FINISH_MIGRATE &&
+            !migration_in_postcopy(migrate_get_current())) {
+            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
+        } else {
+            s->t_aftervmstop.tv_sec = 0;
+            s->t_aftervmstop.tv_nsec = 0;
+        }
 
         /*
          * If the VM is stopped, declare the clock state valid to
@@ -152,12 +175,66 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static uint64_t clock_delta(struct timespec *before, struct timespec *after)
+{
+    if (before->tv_sec > after->tv_sec ||
+        (before->tv_sec == after->tv_sec &&
+         before->tv_nsec > after->tv_nsec)) {
+        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
+                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
+                        before->tv_nsec, after->tv_sec, after->tv_nsec);
+        abort();
+    }
+
+    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
+            after->tv_nsec - before->tv_nsec;
+}
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct timespec now;
+    uint64_t ns;
+
+    if (s->t_aftervmstop.tv_sec == 0) {
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &now);
+
+    ns = clock_delta(&s->t_aftervmstop, &now);
+
+    /*
+     * Linux guests can overflow if time jumps
+     * forward in large increments.
+     * Cap maximum adjustment to 10 minutes.
+     */
+    ns = MIN(ns, 600000000000ULL);
+
+    if (s->clock + ns > s->clock) {
+        s->ns = ns;
+    }
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+
+    /* save the value from incoming migration */
+    s->advance_clock = s->ns;
+
+    return 0;
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
-    .version_id = 1,
+    .version_id = 2,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
+        VMSTATE_UINT64_V(ns, KVMClockState, 2),
         VMSTATE_END_OF_LIST()
     }
 };


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04  9:43 ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04  9:43 UTC (permalink / raw)
  To: kvm, qemu-devel
  Cc: Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost


This patch, relative to pre-copy migration codepath,
measures the time between vm_stop() and pre_save(), 
which includes copying the remaining RAM to destination,
and advances the clock by that amount.

In a VM with 5 seconds downtime, this reduces the guest 
clock difference on destination from 5s to 0.2s.

Please do not apply this yet as some codepaths still need
checking, submitting early for comments.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..1bd8fd6 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -22,9 +22,11 @@
 #include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
+#include "migration/migration.h"
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
+#include <time.h>
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
@@ -35,7 +37,11 @@ typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
+    uint64_t ns;
     bool clock_valid;
+
+    uint64_t advance_clock;
+    struct timespec t_aftervmstop;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -100,6 +106,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             s->clock = time_at_migration;
         }
 
+        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
+            s->clock += s->advance_clock;
+            s->advance_clock = 0;
+        }
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -135,6 +146,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             abort();
         }
         s->clock = data.clock;
+        /*
+         * Transition from VM-running to VM-stopped via migration?
+         * Record when the VM was stopped.
+         */
+
+        if (state == RUN_STATE_FINISH_MIGRATE &&
+            !migration_in_postcopy(migrate_get_current())) {
+            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
+        } else {
+            s->t_aftervmstop.tv_sec = 0;
+            s->t_aftervmstop.tv_nsec = 0;
+        }
 
         /*
          * If the VM is stopped, declare the clock state valid to
@@ -152,12 +175,66 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static uint64_t clock_delta(struct timespec *before, struct timespec *after)
+{
+    if (before->tv_sec > after->tv_sec ||
+        (before->tv_sec == after->tv_sec &&
+         before->tv_nsec > after->tv_nsec)) {
+        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
+                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
+                        before->tv_nsec, after->tv_sec, after->tv_nsec);
+        abort();
+    }
+
+    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
+            after->tv_nsec - before->tv_nsec;
+}
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct timespec now;
+    uint64_t ns;
+
+    if (s->t_aftervmstop.tv_sec == 0) {
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &now);
+
+    ns = clock_delta(&s->t_aftervmstop, &now);
+
+    /*
+     * Linux guests can overflow if time jumps
+     * forward in large increments.
+     * Cap maximum adjustment to 10 minutes.
+     */
+    ns = MIN(ns, 600000000000ULL);
+
+    if (s->clock + ns > s->clock) {
+        s->ns = ns;
+    }
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+
+    /* save the value from incoming migration */
+    s->advance_clock = s->ns;
+
+    return 0;
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
-    .version_id = 1,
+    .version_id = 2,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
+        VMSTATE_UINT64_V(ns, KVMClockState, 2),
         VMSTATE_END_OF_LIST()
     }
 };

^ permalink raw reply related	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04  9:43 ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 12:28   ` Juan Quintela
  -1 siblings, 0 replies; 80+ messages in thread
From: Juan Quintela @ 2016-11-04 12:28 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

Marcelo Tosatti <mtosatti@redhat.com> wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(), 
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
>
> In a VM with 5 seconds downtime, this reduces the guest 
> clock difference on destination from 5s to 0.2s.
>
> Please do not apply this yet as some codepaths still need
> checking, submitting early for comments.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

You can use an optional section, and then you don't need to increase the
version number.

I believe you that the clock manipulation is right, only talking about
the migration bits.

> +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> +{
> +    if (before->tv_sec > after->tv_sec ||
> +        (before->tv_sec == after->tv_sec &&
> +         before->tv_nsec > after->tv_nsec)) {
> +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> +        abort();
> +    }
> +
> +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> +            after->tv_nsec - before->tv_nsec;
> +}

I can't believe that we don't have a helper function already to
calculate this....


> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct timespec now;
> +    uint64_t ns;
> +
> +    if (s->t_aftervmstop.tv_sec == 0) {
> +        return;
> +    }

You have your test here.

> +
> +    clock_gettime(CLOCK_MONOTONIC, &now);
> +
> +    ns = clock_delta(&s->t_aftervmstop, &now);
> +
> +    /*
> +     * Linux guests can overflow if time jumps
> +     * forward in large increments.
> +     * Cap maximum adjustment to 10 minutes.
> +     */
> +    ns = MIN(ns, 600000000000ULL);
> +
> +    if (s->clock + ns > s->clock) {
> +        s->ns = ns;

Would it be a good idea to print an error message here?  If it has been more
than 10mins since we did the vmstop, something got wrong here.


> +    }
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /* save the value from incoming migration */
> +    s->advance_clock = s->ns;
> +
> +    return 0;
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
> -    .version_id = 1,
> +    .version_id = 2,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
> +        VMSTATE_UINT64_V(ns, KVMClockState, 2),
>          VMSTATE_END_OF_LIST()
>      }
>  };


If you need help with the subsection stuff, just ask.

Later, Juan.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 12:28   ` Juan Quintela
  0 siblings, 0 replies; 80+ messages in thread
From: Juan Quintela @ 2016-11-04 12:28 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

Marcelo Tosatti <mtosatti@redhat.com> wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(), 
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
>
> In a VM with 5 seconds downtime, this reduces the guest 
> clock difference on destination from 5s to 0.2s.
>
> Please do not apply this yet as some codepaths still need
> checking, submitting early for comments.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

You can use an optional section, and then you don't need to increase the
version number.

I believe you that the clock manipulation is right, only talking about
the migration bits.

> +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> +{
> +    if (before->tv_sec > after->tv_sec ||
> +        (before->tv_sec == after->tv_sec &&
> +         before->tv_nsec > after->tv_nsec)) {
> +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> +        abort();
> +    }
> +
> +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> +            after->tv_nsec - before->tv_nsec;
> +}

I can't believe that we don't have a helper function already to
calculate this....


> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct timespec now;
> +    uint64_t ns;
> +
> +    if (s->t_aftervmstop.tv_sec == 0) {
> +        return;
> +    }

You have your test here.

> +
> +    clock_gettime(CLOCK_MONOTONIC, &now);
> +
> +    ns = clock_delta(&s->t_aftervmstop, &now);
> +
> +    /*
> +     * Linux guests can overflow if time jumps
> +     * forward in large increments.
> +     * Cap maximum adjustment to 10 minutes.
> +     */
> +    ns = MIN(ns, 600000000000ULL);
> +
> +    if (s->clock + ns > s->clock) {
> +        s->ns = ns;

Would it be a good idea to print an error message here?  If it has been more
than 10mins since we did the vmstop, something got wrong here.


> +    }
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /* save the value from incoming migration */
> +    s->advance_clock = s->ns;
> +
> +    return 0;
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
> -    .version_id = 1,
> +    .version_id = 2,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
> +        VMSTATE_UINT64_V(ns, KVMClockState, 2),
>          VMSTATE_END_OF_LIST()
>      }
>  };


If you need help with the subsection stuff, just ask.

Later, Juan.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 12:28   ` [Qemu-devel] " Juan Quintela
@ 2016-11-04 12:35     ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 12:35 UTC (permalink / raw)
  To: Juan Quintela
  Cc: Eduardo Habkost, kvm, Radim Krčmář,
	qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini

On Fri, Nov 04, 2016 at 01:28:48PM +0100, Juan Quintela wrote:
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(), 
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> >
> > In a VM with 5 seconds downtime, this reduces the guest 
> > clock difference on destination from 5s to 0.2s.
> >
> > Please do not apply this yet as some codepaths still need
> > checking, submitting early for comments.
> >
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> You can use an optional section, and then you don't need to increase the
> version number.

Optional section is more appropriate, thanks.

> I believe you that the clock manipulation is right, only talking about
> the migration bits.
> 
> > +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> > +{
> > +    if (before->tv_sec > after->tv_sec ||
> > +        (before->tv_sec == after->tv_sec &&
> > +         before->tv_nsec > after->tv_nsec)) {
> > +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> > +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> > +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> > +        abort();
> > +    }
> > +
> > +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> > +            after->tv_nsec - before->tv_nsec;
> > +}
> 
> I can't believe that we don't have a helper function already to
> calculate this....

Couldnt find any...

> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +    struct timespec now;
> > +    uint64_t ns;
> > +
> > +    if (s->t_aftervmstop.tv_sec == 0) {
> > +        return;
> > +    }
> 
> You have your test here.
> 
> > +
> > +    clock_gettime(CLOCK_MONOTONIC, &now);
> > +
> > +    ns = clock_delta(&s->t_aftervmstop, &now);
> > +
> > +    /*
> > +     * Linux guests can overflow if time jumps
> > +     * forward in large increments.
> > +     * Cap maximum adjustment to 10 minutes.
> > +     */
> > +    ns = MIN(ns, 600000000000ULL);
> > +
> > +    if (s->clock + ns > s->clock) {
> > +        s->ns = ns;
> 
> Would it be a good idea to print an error message here?  If it has been more
> than 10mins since we did the vmstop, something got wrong here.

Not sure... is it not possible for the user to stop migration in some 
way? 

What if network is very slow and maxdowntime very high?

> > +    }
> > +}
> > +
> > +static int kvmclock_post_load(void *opaque, int version_id)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    /* save the value from incoming migration */
> > +    s->advance_clock = s->ns;
> > +
> > +    return 0;
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> > -    .version_id = 1,
> > +    .version_id = 2,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> > +    .post_load = kvmclock_post_load,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> > +        VMSTATE_UINT64_V(ns, KVMClockState, 2),
> >          VMSTATE_END_OF_LIST()
> >      }
> >  };
> 
> 
> If you need help with the subsection stuff, just ask.
> 
> Later, Juan.

Ok, i'll try to cook up an optional section and lets see what happens.

Thanks Juan.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 12:35     ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 12:35 UTC (permalink / raw)
  To: Juan Quintela
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

On Fri, Nov 04, 2016 at 01:28:48PM +0100, Juan Quintela wrote:
> Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(), 
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> >
> > In a VM with 5 seconds downtime, this reduces the guest 
> > clock difference on destination from 5s to 0.2s.
> >
> > Please do not apply this yet as some codepaths still need
> > checking, submitting early for comments.
> >
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> You can use an optional section, and then you don't need to increase the
> version number.

Optional section is more appropriate, thanks.

> I believe you that the clock manipulation is right, only talking about
> the migration bits.
> 
> > +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> > +{
> > +    if (before->tv_sec > after->tv_sec ||
> > +        (before->tv_sec == after->tv_sec &&
> > +         before->tv_nsec > after->tv_nsec)) {
> > +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> > +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> > +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> > +        abort();
> > +    }
> > +
> > +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> > +            after->tv_nsec - before->tv_nsec;
> > +}
> 
> I can't believe that we don't have a helper function already to
> calculate this....

Couldnt find any...

> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +    struct timespec now;
> > +    uint64_t ns;
> > +
> > +    if (s->t_aftervmstop.tv_sec == 0) {
> > +        return;
> > +    }
> 
> You have your test here.
> 
> > +
> > +    clock_gettime(CLOCK_MONOTONIC, &now);
> > +
> > +    ns = clock_delta(&s->t_aftervmstop, &now);
> > +
> > +    /*
> > +     * Linux guests can overflow if time jumps
> > +     * forward in large increments.
> > +     * Cap maximum adjustment to 10 minutes.
> > +     */
> > +    ns = MIN(ns, 600000000000ULL);
> > +
> > +    if (s->clock + ns > s->clock) {
> > +        s->ns = ns;
> 
> Would it be a good idea to print an error message here?  If it has been more
> than 10mins since we did the vmstop, something got wrong here.

Not sure... is it not possible for the user to stop migration in some 
way? 

What if network is very slow and maxdowntime very high?

> > +    }
> > +}
> > +
> > +static int kvmclock_post_load(void *opaque, int version_id)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    /* save the value from incoming migration */
> > +    s->advance_clock = s->ns;
> > +
> > +    return 0;
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> > -    .version_id = 1,
> > +    .version_id = 2,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> > +    .post_load = kvmclock_post_load,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> > +        VMSTATE_UINT64_V(ns, KVMClockState, 2),
> >          VMSTATE_END_OF_LIST()
> >      }
> >  };
> 
> 
> If you need help with the subsection stuff, just ask.
> 
> Later, Juan.

Ok, i'll try to cook up an optional section and lets see what happens.

Thanks Juan.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 12:35     ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 14:00       ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 14:00 UTC (permalink / raw)
  To: Juan Quintela
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

On Fri, Nov 04, 2016 at 10:35:39AM -0200, Marcelo Tosatti wrote:
> On Fri, Nov 04, 2016 at 01:28:48PM +0100, Juan Quintela wrote:
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > This patch, relative to pre-copy migration codepath,
> > > measures the time between vm_stop() and pre_save(), 
> > > which includes copying the remaining RAM to destination,
> > > and advances the clock by that amount.
> > >
> > > In a VM with 5 seconds downtime, this reduces the guest 
> > > clock difference on destination from 5s to 0.2s.
> > >
> > > Please do not apply this yet as some codepaths still need
> > > checking, submitting early for comments.
> > >
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > You can use an optional section, and then you don't need to increase the
> > version number.
> 
> Optional section is more appropriate, thanks.
> 
> > I believe you that the clock manipulation is right, only talking about
> > the migration bits.
> > 
> > > +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> > > +{
> > > +    if (before->tv_sec > after->tv_sec ||
> > > +        (before->tv_sec == after->tv_sec &&
> > > +         before->tv_nsec > after->tv_nsec)) {
> > > +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> > > +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> > > +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> > > +        abort();
> > > +    }
> > > +
> > > +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> > > +            after->tv_nsec - before->tv_nsec;
> > > +}
> > 
> > I can't believe that we don't have a helper function already to
> > calculate this....
> 
> Couldnt find any...
> 
> > > +
> > > +static void kvmclock_pre_save(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +    struct timespec now;
> > > +    uint64_t ns;
> > > +
> > > +    if (s->t_aftervmstop.tv_sec == 0) {
> > > +        return;
> > > +    }
> > 
> > You have your test here.
> > 
> > > +
> > > +    clock_gettime(CLOCK_MONOTONIC, &now);
> > > +
> > > +    ns = clock_delta(&s->t_aftervmstop, &now);
> > > +
> > > +    /*
> > > +     * Linux guests can overflow if time jumps
> > > +     * forward in large increments.
> > > +     * Cap maximum adjustment to 10 minutes.
> > > +     */
> > > +    ns = MIN(ns, 600000000000ULL);
> > > +
> > > +    if (s->clock + ns > s->clock) {
> > > +        s->ns = ns;
> > 
> > Would it be a good idea to print an error message here?  If it has been more
> > than 10mins since we did the vmstop, something got wrong here.
> 
> Not sure... is it not possible for the user to stop migration in some 
> way? 
> 
> What if network is very slow and maxdowntime very high?
> 
> > > +    }
> > > +}
> > > +
> > > +static int kvmclock_post_load(void *opaque, int version_id)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    /* save the value from incoming migration */
> > > +    s->advance_clock = s->ns;
> > > +
> > > +    return 0;
> > > +}
> > > +
> > >  static const VMStateDescription kvmclock_vmsd = {
> > >      .name = "kvmclock",
> > > -    .version_id = 1,
> > > +    .version_id = 2,
> > >      .minimum_version_id = 1,
> > > +    .pre_save = kvmclock_pre_save,
> > > +    .post_load = kvmclock_post_load,
> > >      .fields = (VMStateField[]) {
> > >          VMSTATE_UINT64(clock, KVMClockState),
> > > +        VMSTATE_UINT64_V(ns, KVMClockState, 2),
> > >          VMSTATE_END_OF_LIST()
> > >      }
> > >  };
> > 
> > 
> > If you need help with the subsection stuff, just ask.
> > 
> > Later, Juan.
> 
> Ok, i'll try to cook up an optional section and lets see what happens.
> 
> Thanks Juan.

Ok so by "optional section" i meant a section that when sent 
to destination, could be ignored and migration would succeed. 

The alternative (what this patch has now) is to increase migration
version so that:

    1. older machine types remain compatible. 
    2. newer machine types fail to migrate.

Because the data being sent, ns, is not really optional: if kvmclock or
hyper-v time is enabled (which should be 100% of the cases) we always
want to send that data.

That is, there is no difference between:

* Writing a subsection with needed=1 always (except when 
using an older machine types).
* Using old/new machine types with particular versions.

I think i missed the patch to switch current machine
types to kvmclock v1, BTW.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 14:00       ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 14:00 UTC (permalink / raw)
  To: Juan Quintela
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

On Fri, Nov 04, 2016 at 10:35:39AM -0200, Marcelo Tosatti wrote:
> On Fri, Nov 04, 2016 at 01:28:48PM +0100, Juan Quintela wrote:
> > Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > > This patch, relative to pre-copy migration codepath,
> > > measures the time between vm_stop() and pre_save(), 
> > > which includes copying the remaining RAM to destination,
> > > and advances the clock by that amount.
> > >
> > > In a VM with 5 seconds downtime, this reduces the guest 
> > > clock difference on destination from 5s to 0.2s.
> > >
> > > Please do not apply this yet as some codepaths still need
> > > checking, submitting early for comments.
> > >
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > You can use an optional section, and then you don't need to increase the
> > version number.
> 
> Optional section is more appropriate, thanks.
> 
> > I believe you that the clock manipulation is right, only talking about
> > the migration bits.
> > 
> > > +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> > > +{
> > > +    if (before->tv_sec > after->tv_sec ||
> > > +        (before->tv_sec == after->tv_sec &&
> > > +         before->tv_nsec > after->tv_nsec)) {
> > > +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> > > +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> > > +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> > > +        abort();
> > > +    }
> > > +
> > > +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> > > +            after->tv_nsec - before->tv_nsec;
> > > +}
> > 
> > I can't believe that we don't have a helper function already to
> > calculate this....
> 
> Couldnt find any...
> 
> > > +
> > > +static void kvmclock_pre_save(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +    struct timespec now;
> > > +    uint64_t ns;
> > > +
> > > +    if (s->t_aftervmstop.tv_sec == 0) {
> > > +        return;
> > > +    }
> > 
> > You have your test here.
> > 
> > > +
> > > +    clock_gettime(CLOCK_MONOTONIC, &now);
> > > +
> > > +    ns = clock_delta(&s->t_aftervmstop, &now);
> > > +
> > > +    /*
> > > +     * Linux guests can overflow if time jumps
> > > +     * forward in large increments.
> > > +     * Cap maximum adjustment to 10 minutes.
> > > +     */
> > > +    ns = MIN(ns, 600000000000ULL);
> > > +
> > > +    if (s->clock + ns > s->clock) {
> > > +        s->ns = ns;
> > 
> > Would it be a good idea to print an error message here?  If it has been more
> > than 10mins since we did the vmstop, something got wrong here.
> 
> Not sure... is it not possible for the user to stop migration in some 
> way? 
> 
> What if network is very slow and maxdowntime very high?
> 
> > > +    }
> > > +}
> > > +
> > > +static int kvmclock_post_load(void *opaque, int version_id)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    /* save the value from incoming migration */
> > > +    s->advance_clock = s->ns;
> > > +
> > > +    return 0;
> > > +}
> > > +
> > >  static const VMStateDescription kvmclock_vmsd = {
> > >      .name = "kvmclock",
> > > -    .version_id = 1,
> > > +    .version_id = 2,
> > >      .minimum_version_id = 1,
> > > +    .pre_save = kvmclock_pre_save,
> > > +    .post_load = kvmclock_post_load,
> > >      .fields = (VMStateField[]) {
> > >          VMSTATE_UINT64(clock, KVMClockState),
> > > +        VMSTATE_UINT64_V(ns, KVMClockState, 2),
> > >          VMSTATE_END_OF_LIST()
> > >      }
> > >  };
> > 
> > 
> > If you need help with the subsection stuff, just ask.
> > 
> > Later, Juan.
> 
> Ok, i'll try to cook up an optional section and lets see what happens.
> 
> Thanks Juan.

Ok so by "optional section" i meant a section that when sent 
to destination, could be ignored and migration would succeed. 

The alternative (what this patch has now) is to increase migration
version so that:

    1. older machine types remain compatible. 
    2. newer machine types fail to migrate.

Because the data being sent, ns, is not really optional: if kvmclock or
hyper-v time is enabled (which should be 100% of the cases) we always
want to send that data.

That is, there is no difference between:

* Writing a subsection with needed=1 always (except when 
using an older machine types).
* Using old/new machine types with particular versions.

I think i missed the patch to switch current machine
types to kvmclock v1, BTW.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04  9:43 ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 15:25   ` Radim Krčmář
  -1 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 15:25 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

2016-11-04 07:43-0200, Marcelo Tosatti:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(), 
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
> 
> In a VM with 5 seconds downtime, this reduces the guest 
> clock difference on destination from 5s to 0.2s.
> 
> Please do not apply this yet as some codepaths still need
> checking, submitting early for comments.

The time computation looks ok.

We could make it slightly more precise by returning the CLOCK_MONOTONIC
at which KVM_GET_CLOCK is read with the IOCTL, but we don't account the
migration time anyway, so that precision would be wasted.

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> @@ -100,6 +106,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              s->clock = time_at_migration;
>          }
>  
> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> +            s->clock += s->advance_clock;
> +            s->advance_clock = 0;
> +        }

Can't the advance_clock added to the migrated KVMClockState instead of
passing it as another parameter?

(It is sad that we can't just query KVMClockState in kvmclock_pre_save
 because of the Linux bug.)

> @@ -135,6 +146,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              abort();
>          }
>          s->clock = data.clock;
> +        /*
> +         * Transition from VM-running to VM-stopped via migration?
> +         * Record when the VM was stopped.
> +         */
> +
> +        if (state == RUN_STATE_FINISH_MIGRATE &&
> +            !migration_in_postcopy(migrate_get_current())) {
> +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);

How big (more like small) was the clock delta between here and
kvmclock_pre_save with postcopy?

Thanks.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 15:25   ` Radim Krčmář
  0 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 15:25 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

2016-11-04 07:43-0200, Marcelo Tosatti:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(), 
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
> 
> In a VM with 5 seconds downtime, this reduces the guest 
> clock difference on destination from 5s to 0.2s.
> 
> Please do not apply this yet as some codepaths still need
> checking, submitting early for comments.

The time computation looks ok.

We could make it slightly more precise by returning the CLOCK_MONOTONIC
at which KVM_GET_CLOCK is read with the IOCTL, but we don't account the
migration time anyway, so that precision would be wasted.

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> @@ -100,6 +106,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              s->clock = time_at_migration;
>          }
>  
> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> +            s->clock += s->advance_clock;
> +            s->advance_clock = 0;
> +        }

Can't the advance_clock added to the migrated KVMClockState instead of
passing it as another parameter?

(It is sad that we can't just query KVMClockState in kvmclock_pre_save
 because of the Linux bug.)

> @@ -135,6 +146,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              abort();
>          }
>          s->clock = data.clock;
> +        /*
> +         * Transition from VM-running to VM-stopped via migration?
> +         * Record when the VM was stopped.
> +         */
> +
> +        if (state == RUN_STATE_FINISH_MIGRATE &&
> +            !migration_in_postcopy(migrate_get_current())) {
> +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);

How big (more like small) was the clock delta between here and
kvmclock_pre_save with postcopy?

Thanks.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:25   ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 15:33     ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 15:33 UTC (permalink / raw)
  To: Radim Krčmář, Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela,
	Eduardo Habkost, Roman Kagan



On 04/11/2016 16:25, Radim Krčmář wrote:
>> >  
>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>> > +            s->clock += s->advance_clock;
>> > +            s->advance_clock = 0;
>> > +        }
> Can't the advance_clock added to the migrated KVMClockState instead of
> passing it as another parameter?
> 
> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>  because of the Linux bug.)

What Linux bug?  The one that makes us use kvmclock_current_nsec?

It should work with 4.9-rc (well, once Linus applies my pull request).
4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
the raw value from the kernel timekeeper.

I'm thinking that we should add a KVM capability for this, and skip
kvmclock_current_nsec if the capability is present.  The first part is
trivial, so we can do it even during Linux rc period.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 15:33     ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 15:33 UTC (permalink / raw)
  To: Radim Krčmář, Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela,
	Eduardo Habkost, Roman Kagan



On 04/11/2016 16:25, Radim Krčmář wrote:
>> >  
>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>> > +            s->clock += s->advance_clock;
>> > +            s->advance_clock = 0;
>> > +        }
> Can't the advance_clock added to the migrated KVMClockState instead of
> passing it as another parameter?
> 
> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>  because of the Linux bug.)

What Linux bug?  The one that makes us use kvmclock_current_nsec?

It should work with 4.9-rc (well, once Linus applies my pull request).
4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
the raw value from the kernel timekeeper.

I'm thinking that we should add a KVM capability for this, and skip
kvmclock_current_nsec if the capability is present.  The first part is
trivial, so we can do it even during Linux rc period.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:33     ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-04 15:48       ` Radim Krčmář
  -1 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 15:48 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 16:33+0100, Paolo Bonzini:
> On 04/11/2016 16:25, Radim Krčmář wrote:
>>> >  
>>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>>> > +            s->clock += s->advance_clock;
>>> > +            s->advance_clock = 0;
>>> > +        }
>> Can't the advance_clock added to the migrated KVMClockState instead of
>> passing it as another parameter?
>> 
>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>>  because of the Linux bug.)
> 
> What Linux bug?  The one that makes us use kvmclock_current_nsec?

No, the one that forced Marcelo to add the 10 minute limit to the
advance_clock.  We wouldn't need this advance_clock hack if we could
just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
clock should count only if vm is running").

> It should work with 4.9-rc (well, once Linus applies my pull request).
> 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
> the raw value from the kernel timekeeper.
> 
> I'm thinking that we should add a KVM capability for this, and skip
> kvmclock_current_nsec if the capability is present.  The first part is
> trivial, so we can do it even during Linux rc period.

I agree, that sounds like a nice improvement.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 15:48       ` Radim Krčmář
  0 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 15:48 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 16:33+0100, Paolo Bonzini:
> On 04/11/2016 16:25, Radim Krčmář wrote:
>>> >  
>>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>>> > +            s->clock += s->advance_clock;
>>> > +            s->advance_clock = 0;
>>> > +        }
>> Can't the advance_clock added to the migrated KVMClockState instead of
>> passing it as another parameter?
>> 
>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>>  because of the Linux bug.)
> 
> What Linux bug?  The one that makes us use kvmclock_current_nsec?

No, the one that forced Marcelo to add the 10 minute limit to the
advance_clock.  We wouldn't need this advance_clock hack if we could
just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
clock should count only if vm is running").

> It should work with 4.9-rc (well, once Linus applies my pull request).
> 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
> the raw value from the kernel timekeeper.
> 
> I'm thinking that we should add a KVM capability for this, and skip
> kvmclock_current_nsec if the capability is present.  The first part is
> trivial, so we can do it even during Linux rc period.

I agree, that sounds like a nice improvement.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:48       ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 15:57         ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 15:57 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan



On 04/11/2016 16:48, Radim Krčmář wrote:
> 2016-11-04 16:33+0100, Paolo Bonzini:
>> On 04/11/2016 16:25, Radim Krčmář wrote:
>>>>>  
>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>>>>> +            s->clock += s->advance_clock;
>>>>> +            s->advance_clock = 0;
>>>>> +        }
>>> Can't the advance_clock added to the migrated KVMClockState instead of
>>> passing it as another parameter?
>>>
>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>>>  because of the Linux bug.)
>>
>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
> 
> No, the one that forced Marcelo to add the 10 minute limit to the
> advance_clock.  We wouldn't need this advance_clock hack if we could
> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> clock should count only if vm is running").

There are two cases:

- migrating a paused guest

- pausing at the end of migration

In the first case, kvmclock_vm_state_change's !running branch will see
state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.

In the second case it should do nothing.  Then kvmclock_pre_save will
see !s->clock_valid and call KVM_GET_CLOCK.  A bonus is that, if
migration fails and migration_thread() calls vm_start(), then the guest
will not see the clock drift backwards.

>> It should work with 4.9-rc (well, once Linus applies my pull request).
>> 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>> the raw value from the kernel timekeeper.
>>
>> I'm thinking that we should add a KVM capability for this, and skip
>> kvmclock_current_nsec if the capability is present.  The first part is
>> trivial, so we can do it even during Linux rc period.
> 
> I agree, that sounds like a nice improvement.

Ok, I'll send the KVM patch then.  It can even be the value *returned*
for KVM_CAP_ADJUST_CLOCK---right now it returns 1, we can return 3 (bit
0 for KVM_CAP_ADJUST_CLOCK_RESENT and bit 1 for returning the raw
value).  Any suggestion for the name?

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 15:57         ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 15:57 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan



On 04/11/2016 16:48, Radim Krčmář wrote:
> 2016-11-04 16:33+0100, Paolo Bonzini:
>> On 04/11/2016 16:25, Radim Krčmář wrote:
>>>>>  
>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>>>>> +            s->clock += s->advance_clock;
>>>>> +            s->advance_clock = 0;
>>>>> +        }
>>> Can't the advance_clock added to the migrated KVMClockState instead of
>>> passing it as another parameter?
>>>
>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>>>  because of the Linux bug.)
>>
>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
> 
> No, the one that forced Marcelo to add the 10 minute limit to the
> advance_clock.  We wouldn't need this advance_clock hack if we could
> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> clock should count only if vm is running").

There are two cases:

- migrating a paused guest

- pausing at the end of migration

In the first case, kvmclock_vm_state_change's !running branch will see
state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.

In the second case it should do nothing.  Then kvmclock_pre_save will
see !s->clock_valid and call KVM_GET_CLOCK.  A bonus is that, if
migration fails and migration_thread() calls vm_start(), then the guest
will not see the clock drift backwards.

>> It should work with 4.9-rc (well, once Linus applies my pull request).
>> 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>> the raw value from the kernel timekeeper.
>>
>> I'm thinking that we should add a KVM capability for this, and skip
>> kvmclock_current_nsec if the capability is present.  The first part is
>> trivial, so we can do it even during Linux rc period.
> 
> I agree, that sounds like a nice improvement.

Ok, I'll send the KVM patch then.  It can even be the value *returned*
for KVM_CAP_ADJUST_CLOCK---right now it returns 1, we can return 3 (bit
0 for KVM_CAP_ADJUST_CLOCK_RESENT and bit 1 for returning the raw
value).  Any suggestion for the name?

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:25   ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 16:04     ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 16:04 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> 2016-11-04 07:43-0200, Marcelo Tosatti:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(), 
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> > 
> > In a VM with 5 seconds downtime, this reduces the guest 
> > clock difference on destination from 5s to 0.2s.
> > 
> > Please do not apply this yet as some codepaths still need
> > checking, submitting early for comments.
> 
> The time computation looks ok.
> 
> We could make it slightly more precise by returning the CLOCK_MONOTONIC
> at which KVM_GET_CLOCK is read with the IOCTL, but we don't account the
> migration time anyway, so that precision would be wasted.

Yes, can be enhanced later... That should be about 1us? (return to
userspace).

> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> > @@ -100,6 +106,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> >              s->clock = time_at_migration;
> >          }
> >  
> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> > +            s->clock += s->advance_clock;
> > +            s->advance_clock = 0;
> > +        }
> 
> Can't the advance_clock added to the migrated KVMClockState instead of
> passing it as another parameter?

Don't want to be fragile to ->ns being overwritten by pre_save executing
on the destination and overwriting the value passed in from
migration... 

> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>  because of the Linux bug.)
> 
> > @@ -135,6 +146,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> >              abort();
> >          }
> >          s->clock = data.clock;
> > +        /*
> > +         * Transition from VM-running to VM-stopped via migration?
> > +         * Record when the VM was stopped.
> > +         */
> > +
> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> > +            !migration_in_postcopy(migrate_get_current())) {
> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> 
> How big (more like small) was the clock delta between here and
> kvmclock_pre_save with postcopy?

Haven't checked, but the clock delta is now 0.2s. 
So i assume postcopy (stopping the VM, saving the device states)
should be about 0.2s.

I'll run and let you know.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 16:04     ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 16:04 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> 2016-11-04 07:43-0200, Marcelo Tosatti:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(), 
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> > 
> > In a VM with 5 seconds downtime, this reduces the guest 
> > clock difference on destination from 5s to 0.2s.
> > 
> > Please do not apply this yet as some codepaths still need
> > checking, submitting early for comments.
> 
> The time computation looks ok.
> 
> We could make it slightly more precise by returning the CLOCK_MONOTONIC
> at which KVM_GET_CLOCK is read with the IOCTL, but we don't account the
> migration time anyway, so that precision would be wasted.

Yes, can be enhanced later... That should be about 1us? (return to
userspace).

> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> > @@ -100,6 +106,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> >              s->clock = time_at_migration;
> >          }
> >  
> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> > +            s->clock += s->advance_clock;
> > +            s->advance_clock = 0;
> > +        }
> 
> Can't the advance_clock added to the migrated KVMClockState instead of
> passing it as another parameter?

Don't want to be fragile to ->ns being overwritten by pre_save executing
on the destination and overwriting the value passed in from
migration... 

> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>  because of the Linux bug.)
> 
> > @@ -135,6 +146,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
> >              abort();
> >          }
> >          s->clock = data.clock;
> > +        /*
> > +         * Transition from VM-running to VM-stopped via migration?
> > +         * Record when the VM was stopped.
> > +         */
> > +
> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> > +            !migration_in_postcopy(migrate_get_current())) {
> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> 
> How big (more like small) was the clock delta between here and
> kvmclock_pre_save with postcopy?

Haven't checked, but the clock delta is now 0.2s. 
So i assume postcopy (stopping the VM, saving the device states)
should be about 0.2s.

I'll run and let you know.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:48       ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 16:24         ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 16:24 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
> 2016-11-04 16:33+0100, Paolo Bonzini:
> > On 04/11/2016 16:25, Radim Krčmář wrote:
> >>> >  
> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> >>> > +            s->clock += s->advance_clock;
> >>> > +            s->advance_clock = 0;
> >>> > +        }
> >> Can't the advance_clock added to the migrated KVMClockState instead of
> >> passing it as another parameter?
> >> 
> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> >>  because of the Linux bug.)
> > 
> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
> 
> No, the one that forced Marcelo to add the 10 minute limit to the
> advance_clock. 

Overflow when reading clocksource, in the timer interrupt.

>  We wouldn't need this advance_clock hack if we could
> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> clock should count only if vm is running").

People have requested that CLOCK_MONOTONIC stops when 
vm suspends.

> > It should work with 4.9-rc (well, once Linus applies my pull request).
> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
> > the raw value from the kernel timekeeper.
> > 
> > I'm thinking that we should add a KVM capability for this, and skip
> > kvmclock_current_nsec if the capability is present.  The first part is
> > trivial, so we can do it even during Linux rc period.
> 
> I agree, that sounds like a nice improvement.

I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
relates to clocksource overflow and CLOCK_MONOTONIC stop requests.

-----

Todo list (collective???):

1) Implement suggestion to Roman: 
"Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
clocks" to handle the time backwards when updating masterclock 
from kernel clock.

2) Review TSC scaling support (make sure scaled TSC 
is propagated properly everywhere).

3) Enable migration with invariant TSC.

4) Fullvirt fix for local APIC deadline timer bug
(BTW Paolo Windows is also vulnerable right).



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 16:24         ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 16:24 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
> 2016-11-04 16:33+0100, Paolo Bonzini:
> > On 04/11/2016 16:25, Radim Krčmář wrote:
> >>> >  
> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> >>> > +            s->clock += s->advance_clock;
> >>> > +            s->advance_clock = 0;
> >>> > +        }
> >> Can't the advance_clock added to the migrated KVMClockState instead of
> >> passing it as another parameter?
> >> 
> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> >>  because of the Linux bug.)
> > 
> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
> 
> No, the one that forced Marcelo to add the 10 minute limit to the
> advance_clock. 

Overflow when reading clocksource, in the timer interrupt.

>  We wouldn't need this advance_clock hack if we could
> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> clock should count only if vm is running").

People have requested that CLOCK_MONOTONIC stops when 
vm suspends.

> > It should work with 4.9-rc (well, once Linus applies my pull request).
> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
> > the raw value from the kernel timekeeper.
> > 
> > I'm thinking that we should add a KVM capability for this, and skip
> > kvmclock_current_nsec if the capability is present.  The first part is
> > trivial, so we can do it even during Linux rc period.
> 
> I agree, that sounds like a nice improvement.

I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
relates to clocksource overflow and CLOCK_MONOTONIC stop requests.

-----

Todo list (collective???):

1) Implement suggestion to Roman: 
"Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
clocks" to handle the time backwards when updating masterclock 
from kernel clock.

2) Review TSC scaling support (make sure scaled TSC 
is propagated properly everywhere).

3) Enable migration with invariant TSC.

4) Fullvirt fix for local APIC deadline timer bug
(BTW Paolo Windows is also vulnerable right).

^ permalink raw reply	[flat|nested] 80+ messages in thread

* [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04  9:43 ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 16:59   ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 16:59 UTC (permalink / raw)
  To: kvm, qemu-devel
  Cc: Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

This patch, relative to pre-copy migration codepath,
measures the time between vm_stop() and pre_save(),
which includes copying the remaining RAM to destination,
and advances the clock by that amount.

In a VM with 5 seconds downtime, this reduces the guest
clock difference on destination from 5s to 0.2s.

Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---

v2: use subsection (Juan Quintela)
    fix older machine types support

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..a2a02ac 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -22,9 +22,11 @@
 #include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
+#include "migration/migration.h"
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
+#include <time.h>
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
@@ -35,7 +37,13 @@ typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
+    uint64_t ns;
     bool clock_valid;
+
+    uint64_t advance_clock;
+    struct timespec t_aftervmstop;
+
+    bool adv_clock_enabled;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             s->clock = time_at_migration;
         }
 
+        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
+            s->clock += s->advance_clock;
+            s->advance_clock = 0;
+        }
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             abort();
         }
         s->clock = data.clock;
+        /*
+         * Transition from VM-running to VM-stopped via migration?
+         * Record when the VM was stopped.
+         */
+
+        if (state == RUN_STATE_FINISH_MIGRATE &&
+            !migration_in_postcopy(migrate_get_current())) {
+            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
+        } else {
+            s->t_aftervmstop.tv_sec = 0;
+            s->t_aftervmstop.tv_nsec = 0;
+        }
 
         /*
          * If the VM is stopped, declare the clock state valid to
@@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static uint64_t clock_delta(struct timespec *before, struct timespec *after)
+{
+    if (before->tv_sec > after->tv_sec ||
+        (before->tv_sec == after->tv_sec &&
+         before->tv_nsec > after->tv_nsec)) {
+        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
+                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
+                        before->tv_nsec, after->tv_sec, after->tv_nsec);
+        abort();
+    }
+
+    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
+            after->tv_nsec - before->tv_nsec;
+}
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct timespec now;
+    uint64_t ns;
+
+    if (s->t_aftervmstop.tv_sec == 0) {
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &now);
+
+    ns = clock_delta(&s->t_aftervmstop, &now);
+
+    /*
+     * Linux guests can overflow if time jumps
+     * forward in large increments.
+     * Cap maximum adjustment to 10 minutes.
+     */
+    ns = MIN(ns, 600000000000ULL);
+
+    if (s->clock + ns > s->clock) {
+        s->ns = ns;
+    }
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+
+    /* save the value from incoming migration */
+    s->advance_clock = s->ns;
+
+    return 0;
+}
+
+static bool kvmclock_ns_needed(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->adv_clock_enabled;
+}
+
+static const VMStateDescription kvmclock_advance_ns = {
+    .name = "kvmclock/advance_ns",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_ns_needed,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(ns, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
@@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = {
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_advance_ns,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 98dc772..243352e 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "advance_clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\


^ permalink raw reply related	[flat|nested] 80+ messages in thread

* [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 16:59   ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 16:59 UTC (permalink / raw)
  To: kvm, qemu-devel
  Cc: Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

This patch, relative to pre-copy migration codepath,
measures the time between vm_stop() and pre_save(),
which includes copying the remaining RAM to destination,
and advances the clock by that amount.

In a VM with 5 seconds downtime, this reduces the guest
clock difference on destination from 5s to 0.2s.

Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---

v2: use subsection (Juan Quintela)
    fix older machine types support

diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 0f75dd3..a2a02ac 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -22,9 +22,11 @@
 #include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
+#include "migration/migration.h"
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
+#include <time.h>
 
 #define TYPE_KVM_CLOCK "kvmclock"
 #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
@@ -35,7 +37,13 @@ typedef struct KVMClockState {
     /*< public >*/
 
     uint64_t clock;
+    uint64_t ns;
     bool clock_valid;
+
+    uint64_t advance_clock;
+    struct timespec t_aftervmstop;
+
+    bool adv_clock_enabled;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             s->clock = time_at_migration;
         }
 
+        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
+            s->clock += s->advance_clock;
+            s->advance_clock = 0;
+        }
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             abort();
         }
         s->clock = data.clock;
+        /*
+         * Transition from VM-running to VM-stopped via migration?
+         * Record when the VM was stopped.
+         */
+
+        if (state == RUN_STATE_FINISH_MIGRATE &&
+            !migration_in_postcopy(migrate_get_current())) {
+            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
+        } else {
+            s->t_aftervmstop.tv_sec = 0;
+            s->t_aftervmstop.tv_nsec = 0;
+        }
 
         /*
          * If the VM is stopped, declare the clock state valid to
@@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static uint64_t clock_delta(struct timespec *before, struct timespec *after)
+{
+    if (before->tv_sec > after->tv_sec ||
+        (before->tv_sec == after->tv_sec &&
+         before->tv_nsec > after->tv_nsec)) {
+        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
+                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
+                        before->tv_nsec, after->tv_sec, after->tv_nsec);
+        abort();
+    }
+
+    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
+            after->tv_nsec - before->tv_nsec;
+}
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+    struct timespec now;
+    uint64_t ns;
+
+    if (s->t_aftervmstop.tv_sec == 0) {
+        return;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &now);
+
+    ns = clock_delta(&s->t_aftervmstop, &now);
+
+    /*
+     * Linux guests can overflow if time jumps
+     * forward in large increments.
+     * Cap maximum adjustment to 10 minutes.
+     */
+    ns = MIN(ns, 600000000000ULL);
+
+    if (s->clock + ns > s->clock) {
+        s->ns = ns;
+    }
+}
+
+static int kvmclock_post_load(void *opaque, int version_id)
+{
+    KVMClockState *s = opaque;
+
+    /* save the value from incoming migration */
+    s->advance_clock = s->ns;
+
+    return 0;
+}
+
+static bool kvmclock_ns_needed(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->adv_clock_enabled;
+}
+
+static const VMStateDescription kvmclock_advance_ns = {
+    .name = "kvmclock/advance_ns",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_ns_needed,
+    .pre_save = kvmclock_pre_save,
+    .post_load = kvmclock_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(ns, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
@@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = {
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_advance_ns,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 98dc772..243352e 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "advance_clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\

^ permalink raw reply related	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:25   ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 17:07     ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 17:07 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> > +        /*
> > +         * Transition from VM-running to VM-stopped via migration?
> > +         * Record when the VM was stopped.
> > +         */
> > +
> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> > +            !migration_in_postcopy(migrate_get_current())) {
> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> 
> How big (more like small) was the clock delta between here and
> kvmclock_pre_save with postcopy?
> 
> Thanks.

qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
available: Function not implemented

But should be about the same as precopy+this patch (guess as i don't 
know the postcopy path).


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 17:07     ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 17:07 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> > +        /*
> > +         * Transition from VM-running to VM-stopped via migration?
> > +         * Record when the VM was stopped.
> > +         */
> > +
> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> > +            !migration_in_postcopy(migrate_get_current())) {
> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> 
> How big (more like small) was the clock delta between here and
> kvmclock_pre_save with postcopy?
> 
> Thanks.

qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
available: Function not implemented

But should be about the same as precopy+this patch (guess as i don't 
know the postcopy path).

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 15:57         ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-04 17:16           ` Radim Krčmář
  -1 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 17:16 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 16:57+0100, Paolo Bonzini:
> On 04/11/2016 16:48, Radim Krčmář wrote:
>> 2016-11-04 16:33+0100, Paolo Bonzini:
>>> On 04/11/2016 16:25, Radim Krčmář wrote:
>>>>>>  
>>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>>>>>> +            s->clock += s->advance_clock;
>>>>>> +            s->advance_clock = 0;
>>>>>> +        }
>>>> Can't the advance_clock added to the migrated KVMClockState instead of
>>>> passing it as another parameter?
>>>>
>>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>>>>  because of the Linux bug.)
>>>
>>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
>> 
>> No, the one that forced Marcelo to add the 10 minute limit to the
>> advance_clock.  We wouldn't need this advance_clock hack if we could
>> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
>> clock should count only if vm is running").
> 
> There are two cases:
> 
> - migrating a paused guest
> 
> - pausing at the end of migration
> 
> In the first case, kvmclock_vm_state_change's !running branch will see
> state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.

I lift my case, marcelo's said that stopping the time is a feature ...
(*kittens die*)

> In the second case it should do nothing.  Then kvmclock_pre_save will
> see !s->clock_valid and call KVM_GET_CLOCK.  A bonus is that, if
> migration fails and migration_thread() calls vm_start(), then the guest
> will not see the clock drift backwards.

Yes, moving KVM_GET_CLOCK to kvmclock_pre_save() and doing nothing in
kvmclock_vm_state_change() to avoid the drift on failed migration would
be nicer.  We'd then also get rid of the ns migration parameter.

>>> It should work with 4.9-rc (well, once Linus applies my pull request).
>>> 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>>> the raw value from the kernel timekeeper.

Oh, and this does introduce a minor bug to this patch -- the time
counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
Not accounting for that is bearable.

>>> I'm thinking that we should add a KVM capability for this, and skip
>>> kvmclock_current_nsec if the capability is present.  The first part is
>>> trivial, so we can do it even during Linux rc period.
>> 
>> I agree, that sounds like a nice improvement.
> 
> Ok, I'll send the KVM patch then.  It can even be the value *returned*
> for KVM_CAP_ADJUST_CLOCK---right now it returns 1, we can return 3 (bit
> 0 for KVM_CAP_ADJUST_CLOCK_RESENT and bit 1 for returning the raw
> value).  Any suggestion for the name?

KVM_CAP_GET_CLOCK_MASTERCLOCK?

to show that the time is counted differently when using master clock.
Another idea would be KVM_CAP_GET_CLOCK_ACTUAL, because we should now
read what the guest is seeing.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 17:16           ` Radim Krčmář
  0 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 17:16 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 16:57+0100, Paolo Bonzini:
> On 04/11/2016 16:48, Radim Krčmář wrote:
>> 2016-11-04 16:33+0100, Paolo Bonzini:
>>> On 04/11/2016 16:25, Radim Krčmář wrote:
>>>>>>  
>>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>>>>>> +            s->clock += s->advance_clock;
>>>>>> +            s->advance_clock = 0;
>>>>>> +        }
>>>> Can't the advance_clock added to the migrated KVMClockState instead of
>>>> passing it as another parameter?
>>>>
>>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>>>>  because of the Linux bug.)
>>>
>>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
>> 
>> No, the one that forced Marcelo to add the 10 minute limit to the
>> advance_clock.  We wouldn't need this advance_clock hack if we could
>> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
>> clock should count only if vm is running").
> 
> There are two cases:
> 
> - migrating a paused guest
> 
> - pausing at the end of migration
> 
> In the first case, kvmclock_vm_state_change's !running branch will see
> state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.

I lift my case, marcelo's said that stopping the time is a feature ...
(*kittens die*)

> In the second case it should do nothing.  Then kvmclock_pre_save will
> see !s->clock_valid and call KVM_GET_CLOCK.  A bonus is that, if
> migration fails and migration_thread() calls vm_start(), then the guest
> will not see the clock drift backwards.

Yes, moving KVM_GET_CLOCK to kvmclock_pre_save() and doing nothing in
kvmclock_vm_state_change() to avoid the drift on failed migration would
be nicer.  We'd then also get rid of the ns migration parameter.

>>> It should work with 4.9-rc (well, once Linus applies my pull request).
>>> 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>>> the raw value from the kernel timekeeper.

Oh, and this does introduce a minor bug to this patch -- the time
counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
Not accounting for that is bearable.

>>> I'm thinking that we should add a KVM capability for this, and skip
>>> kvmclock_current_nsec if the capability is present.  The first part is
>>> trivial, so we can do it even during Linux rc period.
>> 
>> I agree, that sounds like a nice improvement.
> 
> Ok, I'll send the KVM patch then.  It can even be the value *returned*
> for KVM_CAP_ADJUST_CLOCK---right now it returns 1, we can return 3 (bit
> 0 for KVM_CAP_ADJUST_CLOCK_RESENT and bit 1 for returning the raw
> value).  Any suggestion for the name?

KVM_CAP_GET_CLOCK_MASTERCLOCK?

to show that the time is counted differently when using master clock.
Another idea would be KVM_CAP_GET_CLOCK_ACTUAL, because we should now
read what the guest is seeing.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 16:24         ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 17:34           ` Radim Krčmář
  -1 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 17:34 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 14:24-0200, Marcelo Tosatti:
> On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
>> 2016-11-04 16:33+0100, Paolo Bonzini:
>> > On 04/11/2016 16:25, Radim Krčmář wrote:
>> >>> >  
>> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>> >>> > +            s->clock += s->advance_clock;
>> >>> > +            s->advance_clock = 0;
>> >>> > +        }
>> >> Can't the advance_clock added to the migrated KVMClockState instead of
>> >> passing it as another parameter?
>> >> 
>> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>> >>  because of the Linux bug.)
>> > 
>> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
>> 
>> No, the one that forced Marcelo to add the 10 minute limit to the
>> advance_clock. 
> 
> Overflow when reading clocksource, in the timer interrupt.

Is it still there?  I wonder why 10 minutes is the limit. :)

>>  We wouldn't need this advance_clock hack if we could
>> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
>> clock should count only if vm is running").
> 
> People have requested that CLOCK_MONOTONIC stops when 
> vm suspends.

Ugh, we could have done that on the OS level, but stopping kvmclock
means that we sabotaged CLOCK_BOOTTIME as it doesn't return
CLOCK_MONOTONIC + time spent in suspend anymore.

And kvmclock is defined to count nanosecond (slightly different in
practice) since some point in time, so pausing it is not really valid.

>> > It should work with 4.9-rc (well, once Linus applies my pull request).
>> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>> > the raw value from the kernel timekeeper.
>> > 
>> > I'm thinking that we should add a KVM capability for this, and skip
>> > kvmclock_current_nsec if the capability is present.  The first part is
>> > trivial, so we can do it even during Linux rc period.
>> 
>> I agree, that sounds like a nice improvement.
> 
> I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
> relates to clocksource overflow and CLOCK_MONOTONIC stop requests.

If the guest uses master clock, then kvmclock drifts away from BOOTTIME,
so returning the value of kvmclock using kernel_ns() would introduce a
drift when setting the clock -- we must read the same way that the guest
does.

> -----
> 
> Todo list (collective???):
> 
> 1) Implement suggestion to Roman: 
> "Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
> clocks" to handle the time backwards when updating masterclock 
> from kernel clock.
> 
> 2) Review TSC scaling support (make sure scaled TSC 
> is propagated properly everywhere).
> 
> 3) Enable migration with invariant TSC.
> 
> 4) Fullvirt fix for local APIC deadline timer bug

The list looks good.

I'll resume work on (4) now that rework of APIC timers is merged.
It's going to be ugly on the guest side, because linux could be using it
even when not using kvmclock for sched clock ...

> (BTW Paolo Windows is also vulnerable right).

Ugh, we can't do much more than disabling TSC deadline there until QEMU
can migrate it.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 17:34           ` Radim Krčmář
  0 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 17:34 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 14:24-0200, Marcelo Tosatti:
> On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
>> 2016-11-04 16:33+0100, Paolo Bonzini:
>> > On 04/11/2016 16:25, Radim Krčmář wrote:
>> >>> >  
>> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>> >>> > +            s->clock += s->advance_clock;
>> >>> > +            s->advance_clock = 0;
>> >>> > +        }
>> >> Can't the advance_clock added to the migrated KVMClockState instead of
>> >> passing it as another parameter?
>> >> 
>> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>> >>  because of the Linux bug.)
>> > 
>> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
>> 
>> No, the one that forced Marcelo to add the 10 minute limit to the
>> advance_clock. 
> 
> Overflow when reading clocksource, in the timer interrupt.

Is it still there?  I wonder why 10 minutes is the limit. :)

>>  We wouldn't need this advance_clock hack if we could
>> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
>> clock should count only if vm is running").
> 
> People have requested that CLOCK_MONOTONIC stops when 
> vm suspends.

Ugh, we could have done that on the OS level, but stopping kvmclock
means that we sabotaged CLOCK_BOOTTIME as it doesn't return
CLOCK_MONOTONIC + time spent in suspend anymore.

And kvmclock is defined to count nanosecond (slightly different in
practice) since some point in time, so pausing it is not really valid.

>> > It should work with 4.9-rc (well, once Linus applies my pull request).
>> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>> > the raw value from the kernel timekeeper.
>> > 
>> > I'm thinking that we should add a KVM capability for this, and skip
>> > kvmclock_current_nsec if the capability is present.  The first part is
>> > trivial, so we can do it even during Linux rc period.
>> 
>> I agree, that sounds like a nice improvement.
> 
> I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
> relates to clocksource overflow and CLOCK_MONOTONIC stop requests.

If the guest uses master clock, then kvmclock drifts away from BOOTTIME,
so returning the value of kvmclock using kernel_ns() would introduce a
drift when setting the clock -- we must read the same way that the guest
does.

> -----
> 
> Todo list (collective???):
> 
> 1) Implement suggestion to Roman: 
> "Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
> clocks" to handle the time backwards when updating masterclock 
> from kernel clock.
> 
> 2) Review TSC scaling support (make sure scaled TSC 
> is propagated properly everywhere).
> 
> 3) Enable migration with invariant TSC.
> 
> 4) Fullvirt fix for local APIC deadline timer bug

The list looks good.

I'll resume work on (4) now that rework of APIC timers is merged.
It's going to be ugly on the guest side, because linux could be using it
even when not using kvmclock for sched clock ...

> (BTW Paolo Windows is also vulnerable right).

Ugh, we can't do much more than disabling TSC deadline there until QEMU
can migrate it.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 17:07     ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 17:39       ` Radim Krčmář
  -1 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 17:39 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

2016-11-04 15:07-0200, Marcelo Tosatti:
> On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
>> > +        /*
>> > +         * Transition from VM-running to VM-stopped via migration?
>> > +         * Record when the VM was stopped.
>> > +         */
>> > +
>> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
>> > +            !migration_in_postcopy(migrate_get_current())) {
>> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
>> 
>> How big (more like small) was the clock delta between here and
>> kvmclock_pre_save with postcopy?
>> 
>> Thanks.
> 
> qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
> available: Function not implemented
> 
> But should be about the same as precopy+this patch (guess as i don't 
> know the postcopy path).

I was wondering about the improvement we could achieve by not excluding
postcopy from the time fixup.  (i.e. how much time elapses between
pausing and migrating the vm in postcopy)

I would also guess that it is not significant.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 17:39       ` Radim Krčmář
  0 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 17:39 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

2016-11-04 15:07-0200, Marcelo Tosatti:
> On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
>> > +        /*
>> > +         * Transition from VM-running to VM-stopped via migration?
>> > +         * Record when the VM was stopped.
>> > +         */
>> > +
>> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
>> > +            !migration_in_postcopy(migrate_get_current())) {
>> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
>> 
>> How big (more like small) was the clock delta between here and
>> kvmclock_pre_save with postcopy?
>> 
>> Thanks.
> 
> qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
> available: Function not implemented
> 
> But should be about the same as precopy+this patch (guess as i don't 
> know the postcopy path).

I was wondering about the improvement we could achieve by not excluding
postcopy from the time fixup.  (i.e. how much time elapses between
pausing and migrating the vm in postcopy)

I would also guess that it is not significant.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 17:34           ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 18:29             ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 18:29 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

On Fri, Nov 04, 2016 at 06:34:20PM +0100, Radim Krčmář wrote:
> 2016-11-04 14:24-0200, Marcelo Tosatti:
> > On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
> >> 2016-11-04 16:33+0100, Paolo Bonzini:
> >> > On 04/11/2016 16:25, Radim Krčmář wrote:
> >> >>> >  
> >> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> >> >>> > +            s->clock += s->advance_clock;
> >> >>> > +            s->advance_clock = 0;
> >> >>> > +        }
> >> >> Can't the advance_clock added to the migrated KVMClockState instead of
> >> >> passing it as another parameter?
> >> >> 
> >> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> >> >>  because of the Linux bug.)
> >> > 
> >> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
> >> 
> >> No, the one that forced Marcelo to add the 10 minute limit to the
> >> advance_clock. 
> > 
> > Overflow when reading clocksource, in the timer interrupt.
> 
> Is it still there?  I wonder why 10 minutes is the limit. :)

Second question: I don't know, i guess it started as a 
"lets clamp this to values that make sense to catch
any potential offenders" but now i think it makes 
sense to say:

"because it does not make sense to search for the smaller
overflow of Linux guests and use that".

So 10minutes works for me using the sentence:
"migration should not take longer than 10 minutes".

You prefer to drop that 10 minutes check?

First question:

I suppose so: disable CLOCK_MONOTONIC counting on pause, 
pause your VM for two days, and resume.

timekeeping_resume

        cycle_now = tk->tkr_mono.read(clock);
        if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
                cycle_now > tk->tkr_mono.cycle_last) {
                u64 num, max = ULLONG_MAX;
                u32 mult = clock->mult;
                u32 shift = clock->shift;
                s64 nsec = 0;

                cycle_delta = clocksource_delta(cycle_now,
tk->tkr_mono.cycle_last,
                                                tk->tkr_mono.mask);

                /*
                 * "cycle_delta * mutl" may cause 64 bits overflow, if
                 * the
                 * suspended time is too long. In that case we need do
                 * the
                 * 64 bits math carefully
                 */
                do_div(max, mult);
                if (cycle_delta > max) {
                        num = div64_u64(cycle_delta, max);
                        nsec = (((u64) max * mult) >> shift) * num;
                        cycle_delta -= num * max;
                }
                nsec += ((u64) cycle_delta * mult) >> shift;

timekeeping_forward_now
/**
 * timekeeping_forward_now - update clock to the current time
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        struct clocksource *clock = tk->tkr_mono.clock;
        cycle_t cycle_now, delta;
        s64 nsec;

        cycle_now = tk->tkr_mono.read(clock);
        delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
tk->tkr_mono.mask);
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;

For example.

> >>  We wouldn't need this advance_clock hack if we could
> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> >> clock should count only if vm is running").
> > 
> > People have requested that CLOCK_MONOTONIC stops when 
> > vm suspends.
> 
> Ugh, we could have done that on the OS level, 

Done it how at the OS level ?

> but stopping kvmclock
> means that we sabotaged CLOCK_BOOTTIME as it doesn't return
> CLOCK_MONOTONIC + time spent in suspend anymore.

It does:
So this patchset introduces CLOCK_BOOTTIME, which is identical
to CLOCK_MONOTONIC, but includes any time spent in suspend.

"suspend" refers to suspend-to-RAM, not "virtual machine stopped".

CLOCK_BOOTTIME fails to work when you extend "suspend" 
to "suspend AND virtual machine pause or restoration".

(if you want that, can be fixed easily i supposed, but 
nobody ever asked for it).

> And kvmclock is defined to count nanosecond (slightly different in
> practice) since some point in time, so pausing it is not really valid.

Sorry, where is that defined? Nobody makes that assumption
(that kvmclock should be correlated to some physical time
and that it can't stop counting).

> >> > It should work with 4.9-rc (well, once Linus applies my pull request).
> >> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
> >> > the raw value from the kernel timekeeper.
> >> > 
> >> > I'm thinking that we should add a KVM capability for this, and skip
> >> > kvmclock_current_nsec if the capability is present.  The first part is
> >> > trivial, so we can do it even during Linux rc period.
> >> 
> >> I agree, that sounds like a nice improvement.
> > 
> > I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
> > relates to clocksource overflow and CLOCK_MONOTONIC stop requests.
> 
> If the guest uses master clock, then kvmclock drifts away from BOOTTIME,
> so returning the value of kvmclock using kernel_ns() would introduce a
> drift when setting the clock -- we must read the same way that the guest
> does.

Does not work:

    (periodic incremental reads from TSC, using the offset from
     previous TSC read, storing to scaling to a memory location)
    and reads interpolating with TSC (what kernel does)

        != (different values than)

    (TSC scaled to nanoseconds)

Its not obvious to me this should be the case, perhaps if done 
carefully can work.

(See the thread with Roman, this was determined empirically).

> > -----
> > 
> > Todo list (collective???):
> > 
> > 1) Implement suggestion to Roman: 
> > "Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
> > clocks" to handle the time backwards when updating masterclock 
> > from kernel clock.
> > 
> > 2) Review TSC scaling support (make sure scaled TSC 
> > is propagated properly everywhere).
> > 
> > 3) Enable migration with invariant TSC.
> > 
> > 4) Fullvirt fix for local APIC deadline timer bug
> 
> The list looks good.
> 
> I'll resume work on (4) now that rework of APIC timers is merged.
> It's going to be ugly on the guest side, because linux could be using it
> even when not using kvmclock for sched clock ...

>From previous discussion on the thread Eduardo started:

>> > Actually, a nicer fix would be to check the different
>> > frequencies and scale the deadline relative to the difference.
>>
>> You cannot know what exactly the guest was thinking when it set the
>> TSC
>> deadline.  Perhaps it wanted an interrupt when the TSC was exactly
>> 9876543210.

Yep, the spec just says that the timer fires when TSC >= deadline.

> You can't expect the correlation between TSC value and timer interrupt
> execution to be precise, because of the delay between HW timer
> expiration and interrupt execution.

Yes, this is valid.

> So you have to live with the fact that the TSC deadline timer can be
> late (which is the same thing as with your paravirt solution, in case
> of migration to host with faster TSC freq) (which to me renders the
> argument above invalid).
>
> Solution for old guests and new guests:
> Just save how far ahead in the future the TSC deadline timer is
> supposed
> to expire on the source (in ns), in the destination save all registers
> (but disable TSC deadline timer injection), arm a timer in QEMU
> for expiration time, reenable TSC deadline timer reinjection.

The interrupt can also fire early after migrating to a TSC with lower
frequency, which violates the definition of TSC deadline timer when an
OS could even RDTSC a value lower than the deadline in the interrupt
handler.  (An OS doesn't need to expect that.)

We already have vcpu->arch.virtual_tsc_khz that ought to remain constant
for a lifetime of the guest and KVM uses it to convert TSC delta into
correct nanoseconds.

The main problem is that QEMU changes virtual_tsc_khz when migrating
without hardware scaling, so KVM is forced to get nanoseconds wrong ...

If QEMU doesn't want to keep the TSC frequency constant, then it would
be better if it didn't expose TSC in CPUID -- guest would just use
kvmclock without being tempted by direct TSC accesses.


> > (BTW Paolo Windows is also vulnerable right).
> 
> Ugh, we can't do much more than disabling TSC deadline there until QEMU
> can migrate it.

So the idea was to convert to nanoseconds, count a timer
in nanoseconds, and when it expires inject immediatelly.
(Yes its ugly).

(btw i suppose you can talk to destination via the command interface 
on the postcopy patches now), say to get the destination tsc frequency.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 18:29             ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 18:29 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

On Fri, Nov 04, 2016 at 06:34:20PM +0100, Radim Krčmář wrote:
> 2016-11-04 14:24-0200, Marcelo Tosatti:
> > On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
> >> 2016-11-04 16:33+0100, Paolo Bonzini:
> >> > On 04/11/2016 16:25, Radim Krčmář wrote:
> >> >>> >  
> >> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> >> >>> > +            s->clock += s->advance_clock;
> >> >>> > +            s->advance_clock = 0;
> >> >>> > +        }
> >> >> Can't the advance_clock added to the migrated KVMClockState instead of
> >> >> passing it as another parameter?
> >> >> 
> >> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> >> >>  because of the Linux bug.)
> >> > 
> >> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
> >> 
> >> No, the one that forced Marcelo to add the 10 minute limit to the
> >> advance_clock. 
> > 
> > Overflow when reading clocksource, in the timer interrupt.
> 
> Is it still there?  I wonder why 10 minutes is the limit. :)

Second question: I don't know, i guess it started as a 
"lets clamp this to values that make sense to catch
any potential offenders" but now i think it makes 
sense to say:

"because it does not make sense to search for the smaller
overflow of Linux guests and use that".

So 10minutes works for me using the sentence:
"migration should not take longer than 10 minutes".

You prefer to drop that 10 minutes check?

First question:

I suppose so: disable CLOCK_MONOTONIC counting on pause, 
pause your VM for two days, and resume.

timekeeping_resume

        cycle_now = tk->tkr_mono.read(clock);
        if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
                cycle_now > tk->tkr_mono.cycle_last) {
                u64 num, max = ULLONG_MAX;
                u32 mult = clock->mult;
                u32 shift = clock->shift;
                s64 nsec = 0;

                cycle_delta = clocksource_delta(cycle_now,
tk->tkr_mono.cycle_last,
                                                tk->tkr_mono.mask);

                /*
                 * "cycle_delta * mutl" may cause 64 bits overflow, if
                 * the
                 * suspended time is too long. In that case we need do
                 * the
                 * 64 bits math carefully
                 */
                do_div(max, mult);
                if (cycle_delta > max) {
                        num = div64_u64(cycle_delta, max);
                        nsec = (((u64) max * mult) >> shift) * num;
                        cycle_delta -= num * max;
                }
                nsec += ((u64) cycle_delta * mult) >> shift;

timekeeping_forward_now
/**
 * timekeeping_forward_now - update clock to the current time
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        struct clocksource *clock = tk->tkr_mono.clock;
        cycle_t cycle_now, delta;
        s64 nsec;

        cycle_now = tk->tkr_mono.read(clock);
        delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
tk->tkr_mono.mask);
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;

For example.

> >>  We wouldn't need this advance_clock hack if we could
> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> >> clock should count only if vm is running").
> > 
> > People have requested that CLOCK_MONOTONIC stops when 
> > vm suspends.
> 
> Ugh, we could have done that on the OS level, 

Done it how at the OS level ?

> but stopping kvmclock
> means that we sabotaged CLOCK_BOOTTIME as it doesn't return
> CLOCK_MONOTONIC + time spent in suspend anymore.

It does:
So this patchset introduces CLOCK_BOOTTIME, which is identical
to CLOCK_MONOTONIC, but includes any time spent in suspend.

"suspend" refers to suspend-to-RAM, not "virtual machine stopped".

CLOCK_BOOTTIME fails to work when you extend "suspend" 
to "suspend AND virtual machine pause or restoration".

(if you want that, can be fixed easily i supposed, but 
nobody ever asked for it).

> And kvmclock is defined to count nanosecond (slightly different in
> practice) since some point in time, so pausing it is not really valid.

Sorry, where is that defined? Nobody makes that assumption
(that kvmclock should be correlated to some physical time
and that it can't stop counting).

> >> > It should work with 4.9-rc (well, once Linus applies my pull request).
> >> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
> >> > the raw value from the kernel timekeeper.
> >> > 
> >> > I'm thinking that we should add a KVM capability for this, and skip
> >> > kvmclock_current_nsec if the capability is present.  The first part is
> >> > trivial, so we can do it even during Linux rc period.
> >> 
> >> I agree, that sounds like a nice improvement.
> > 
> > I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
> > relates to clocksource overflow and CLOCK_MONOTONIC stop requests.
> 
> If the guest uses master clock, then kvmclock drifts away from BOOTTIME,
> so returning the value of kvmclock using kernel_ns() would introduce a
> drift when setting the clock -- we must read the same way that the guest
> does.

Does not work:

    (periodic incremental reads from TSC, using the offset from
     previous TSC read, storing to scaling to a memory location)
    and reads interpolating with TSC (what kernel does)

        != (different values than)

    (TSC scaled to nanoseconds)

Its not obvious to me this should be the case, perhaps if done 
carefully can work.

(See the thread with Roman, this was determined empirically).

> > -----
> > 
> > Todo list (collective???):
> > 
> > 1) Implement suggestion to Roman: 
> > "Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
> > clocks" to handle the time backwards when updating masterclock 
> > from kernel clock.
> > 
> > 2) Review TSC scaling support (make sure scaled TSC 
> > is propagated properly everywhere).
> > 
> > 3) Enable migration with invariant TSC.
> > 
> > 4) Fullvirt fix for local APIC deadline timer bug
> 
> The list looks good.
> 
> I'll resume work on (4) now that rework of APIC timers is merged.
> It's going to be ugly on the guest side, because linux could be using it
> even when not using kvmclock for sched clock ...

>From previous discussion on the thread Eduardo started:

>> > Actually, a nicer fix would be to check the different
>> > frequencies and scale the deadline relative to the difference.
>>
>> You cannot know what exactly the guest was thinking when it set the
>> TSC
>> deadline.  Perhaps it wanted an interrupt when the TSC was exactly
>> 9876543210.

Yep, the spec just says that the timer fires when TSC >= deadline.

> You can't expect the correlation between TSC value and timer interrupt
> execution to be precise, because of the delay between HW timer
> expiration and interrupt execution.

Yes, this is valid.

> So you have to live with the fact that the TSC deadline timer can be
> late (which is the same thing as with your paravirt solution, in case
> of migration to host with faster TSC freq) (which to me renders the
> argument above invalid).
>
> Solution for old guests and new guests:
> Just save how far ahead in the future the TSC deadline timer is
> supposed
> to expire on the source (in ns), in the destination save all registers
> (but disable TSC deadline timer injection), arm a timer in QEMU
> for expiration time, reenable TSC deadline timer reinjection.

The interrupt can also fire early after migrating to a TSC with lower
frequency, which violates the definition of TSC deadline timer when an
OS could even RDTSC a value lower than the deadline in the interrupt
handler.  (An OS doesn't need to expect that.)

We already have vcpu->arch.virtual_tsc_khz that ought to remain constant
for a lifetime of the guest and KVM uses it to convert TSC delta into
correct nanoseconds.

The main problem is that QEMU changes virtual_tsc_khz when migrating
without hardware scaling, so KVM is forced to get nanoseconds wrong ...

If QEMU doesn't want to keep the TSC frequency constant, then it would
be better if it didn't expose TSC in CPUID -- guest would just use
kvmclock without being tempted by direct TSC accesses.


> > (BTW Paolo Windows is also vulnerable right).
> 
> Ugh, we can't do much more than disabling TSC deadline there until QEMU
> can migrate it.

So the idea was to convert to nanoseconds, count a timer
in nanoseconds, and when it expires inject immediatelly.
(Yes its ugly).

(btw i suppose you can talk to destination via the command interface 
on the postcopy patches now), say to get the destination tsc frequency.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 17:39       ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 18:31         ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 18:31 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 06:39:18PM +0100, Radim Krčmář wrote:
> 2016-11-04 15:07-0200, Marcelo Tosatti:
> > On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> >> > +        /*
> >> > +         * Transition from VM-running to VM-stopped via migration?
> >> > +         * Record when the VM was stopped.
> >> > +         */
> >> > +
> >> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> >> > +            !migration_in_postcopy(migrate_get_current())) {
> >> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> >> 
> >> How big (more like small) was the clock delta between here and
> >> kvmclock_pre_save with postcopy?
> >> 
> >> Thanks.
> > 
> > qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
> > available: Function not implemented
> > 
> > But should be about the same as precopy+this patch (guess as i don't 
> > know the postcopy path).
> 
> I was wondering about the improvement we could achieve by not excluding
> postcopy from the time fixup.  (i.e. how much time elapses between
> pausing and migrating the vm in postcopy)
> 
> I would also guess that it is not significant.

Ideally the total should be zero because for certain workloads
any change is problematic... 


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 18:31         ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 18:31 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 06:39:18PM +0100, Radim Krčmář wrote:
> 2016-11-04 15:07-0200, Marcelo Tosatti:
> > On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> >> > +        /*
> >> > +         * Transition from VM-running to VM-stopped via migration?
> >> > +         * Record when the VM was stopped.
> >> > +         */
> >> > +
> >> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> >> > +            !migration_in_postcopy(migrate_get_current())) {
> >> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> >> 
> >> How big (more like small) was the clock delta between here and
> >> kvmclock_pre_save with postcopy?
> >> 
> >> Thanks.
> > 
> > qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
> > available: Function not implemented
> > 
> > But should be about the same as precopy+this patch (guess as i don't 
> > know the postcopy path).
> 
> I was wondering about the improvement we could achieve by not excluding
> postcopy from the time fixup.  (i.e. how much time elapses between
> pausing and migrating the vm in postcopy)
> 
> I would also guess that it is not significant.

Ideally the total should be zero because for certain workloads
any change is problematic... 

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 16:59   ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 18:57     ` Juan Quintela
  -1 siblings, 0 replies; 80+ messages in thread
From: Juan Quintela @ 2016-11-04 18:57 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

Marcelo Tosatti <mtosatti@redhat.com> wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(),
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
>
> In a VM with 5 seconds downtime, this reduces the guest
> clock difference on destination from 5s to 0.2s.
>
> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> ---
>
> v2: use subsection (Juan Quintela)
>     fix older machine types support
>

Reviewed-by: Juan Quintela <quintela@redhat.com>

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 18:57     ` Juan Quintela
  0 siblings, 0 replies; 80+ messages in thread
From: Juan Quintela @ 2016-11-04 18:57 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Radim Krčmář,
	Eduardo Habkost

Marcelo Tosatti <mtosatti@redhat.com> wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(),
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
>
> In a VM with 5 seconds downtime, this reduces the guest
> clock difference on destination from 5s to 0.2s.
>
> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> ---
>
> v2: use subsection (Juan Quintela)
>     fix older machine types support
>

Reviewed-by: Juan Quintela <quintela@redhat.com>

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 18:29             ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 20:07               ` Radim Krčmář
  -1 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 20:07 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Eduardo Habkost, kvm, Juan Quintela, qemu-devel,
	Dr. David Alan Gilbert, Roman Kagan, Paolo Bonzini

2016-11-04 16:29-0200, Marcelo Tosatti:
> On Fri, Nov 04, 2016 at 06:34:20PM +0100, Radim Krčmář wrote:
>> 2016-11-04 14:24-0200, Marcelo Tosatti:
>> > On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
>> >> 2016-11-04 16:33+0100, Paolo Bonzini:
>> >> > On 04/11/2016 16:25, Radim Krčmář wrote:
>> >> >>> >  
>> >> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>> >> >>> > +            s->clock += s->advance_clock;
>> >> >>> > +            s->advance_clock = 0;
>> >> >>> > +        }
>> >> >> Can't the advance_clock added to the migrated KVMClockState instead of
>> >> >> passing it as another parameter?
>> >> >> 
>> >> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>> >> >>  because of the Linux bug.)
>> >> > 
>> >> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
>> >> 
>> >> No, the one that forced Marcelo to add the 10 minute limit to the
>> >> advance_clock. 
>> > 
>> > Overflow when reading clocksource, in the timer interrupt.
>> 
>> Is it still there?  I wonder why 10 minutes is the limit. :)
> 
> Second question: I don't know, i guess it started as a 
> "lets clamp this to values that make sense to catch
> any potential offenders" but now i think it makes 
> sense to say:
> 
> "because it does not make sense to search for the smaller
> overflow of Linux guests and use that".
> 
> So 10minutes works for me using the sentence:
> "migration should not take longer than 10 minutes".
> 
> You prefer to drop that 10 minutes check?

I would.  The overflow should mean that the value will be off, but it
would already be, so being a little more wrong doesn't seem worth the
extra code.
It is minor and v2 has r-b, so let's just forget about it. :)

> First question:
> 
> I suppose so: disable CLOCK_MONOTONIC counting on pause, 
> pause your VM for two days, and resume.
> 
> timekeeping_resume
> 
>         cycle_now = tk->tkr_mono.read(clock);
>         if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
>                 cycle_now > tk->tkr_mono.cycle_last) {
>                 u64 num, max = ULLONG_MAX;
>                 u32 mult = clock->mult;
>                 u32 shift = clock->shift;
>                 s64 nsec = 0;
> 
>                 cycle_delta = clocksource_delta(cycle_now,
> tk->tkr_mono.cycle_last,
>                                                 tk->tkr_mono.mask);
> 
>                 /*
>                  * "cycle_delta * mutl" may cause 64 bits overflow, if
>                  * the
>                  * suspended time is too long. In that case we need do
>                  * the
>                  * 64 bits math carefully
>                  */
>                 do_div(max, mult);
>                 if (cycle_delta > max) {
>                         num = div64_u64(cycle_delta, max);
>                         nsec = (((u64) max * mult) >> shift) * num;
>                         cycle_delta -= num * max;
>                 }
>                 nsec += ((u64) cycle_delta * mult) >> shift;

This seems like it handles the overflow.  And does it tragically
ineffectively, so I can understand that they didn't want to do that in
interrupt context.  Using 128 bit multiplication (on x86_64) and shift
would get it done in a fraction of the time.

[...]
> For example.

I found the 10 minues in __clocksource_update_freq_scale():

	if (freq) {
		/*
		 * Calc the maximum number of seconds which we can run before
		 * wrapping around. For clocksources which have a mask > 32-bit
		 * we need to limit the max sleep time to have a good
		 * conversion precision. 10 minutes is still a reasonable
		 * amount. That results in a shift value of 24 for a
		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
		 * ~ 0.06ppm granularity for NTP.
		 */
		sec = cs->mask;
		do_div(sec, freq);
		do_div(sec, scale);
		if (!sec)
			sec = 1;
		else if (sec > 600 && cs->mask > UINT_MAX)
			sec = 600;

		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
				       NSEC_PER_SEC / scale, sec * scale);
	}

>> >>  We wouldn't need this advance_clock hack if we could
>> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
>> >> clock should count only if vm is running").
>> > 
>> > People have requested that CLOCK_MONOTONIC stops when 
>> > vm suspends.
>> 
>> Ugh, we could have done that on the OS level, 
> 
> Done it how at the OS level ?

With the use of another interface steal-time like interface.

>> but stopping kvmclock
>> means that we sabotaged CLOCK_BOOTTIME as it doesn't return
>> CLOCK_MONOTONIC + time spent in suspend anymore.
> 
> It does:
> So this patchset introduces CLOCK_BOOTTIME, which is identical
> to CLOCK_MONOTONIC, but includes any time spent in suspend.
> 
> "suspend" refers to suspend-to-RAM, not "virtual machine stopped".
> 
> CLOCK_BOOTTIME fails to work when you extend "suspend" 
> to "suspend AND virtual machine pause or restoration".
> 
> (if you want that, can be fixed easily i supposed, but 
> nobody ever asked for it).

Yeah, it's just weird, because the reason why people wanted
CLOCK_MONOTONIC was quite stupid (to see how long it was unpaused ...)
while having a useable timesource seems quite important.

>> And kvmclock is defined to count nanosecond (slightly different in
>> practice) since some point in time, so pausing it is not really valid.
> 
> Sorry, where is that defined? Nobody makes that assumption
> (that kvmclock should be correlated to some physical time
> and that it can't stop counting).

Documentation/virtual/kvm/msr.txt

  system_time: a host notion of monotonic time, including sleep
  time at the time this structure was last updated. Unit is
  nanoseconds.

and arch/x86/include/asm/pvclock-abi.h:

  * pvclock_vcpu_time_info holds the system time and the tsc timestamp
  * of the last update. So the guest can use the tsc delta to get a
  * more precise system time.  There is one per virtual cpu.
  *
  * pvclock_wall_clock references the point in time when the system
  * time was zero (usually boot time), thus the guest calculates the
  * current wall clock by adding the system time.

>> >> > It should work with 4.9-rc (well, once Linus applies my pull request).
>> >> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>> >> > the raw value from the kernel timekeeper.
>> >> > 
>> >> > I'm thinking that we should add a KVM capability for this, and skip
>> >> > kvmclock_current_nsec if the capability is present.  The first part is
>> >> > trivial, so we can do it even during Linux rc period.
>> >> 
>> >> I agree, that sounds like a nice improvement.
>> > 
>> > I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
>> > relates to clocksource overflow and CLOCK_MONOTONIC stop requests.
>> 
>> If the guest uses master clock, then kvmclock drifts away from BOOTTIME,
>> so returning the value of kvmclock using kernel_ns() would introduce a
>> drift when setting the clock -- we must read the same way that the guest
>> does.
> 
> Does not work:
> 
>     (periodic incremental reads from TSC, using the offset from
>      previous TSC read, storing to scaling to a memory location)
>     and reads interpolating with TSC (what kernel does)
> 
>         != (different values than)
> 
>     (TSC scaled to nanoseconds)
> 
> Its not obvious to me this should be the case, perhaps if done 
> carefully can work.

This can't be equal if the TSC frequency is not constant, and it is not.
KVM master clock is using different TSC frequency than the kernel.
The current state is weird, because if we ever recompute the master
clock, we use kvm_get_time_and_clockread() for the new base, which
shifts the time. :/

We could match CLOCK_BOOTTIME with kvmclock if we wanted ...  It would
be much saner, but we'd recomputing the master clock every time the host
time gets updated.

> (See the thread with Roman, this was determined empirically).
> 
>> > -----
>> > 
>> > Todo list (collective???):
>> > 
>> > 1) Implement suggestion to Roman: 
>> > "Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
>> > clocks" to handle the time backwards when updating masterclock 
>> > from kernel clock.
>> > 
>> > 2) Review TSC scaling support (make sure scaled TSC 
>> > is propagated properly everywhere).
>> > 
>> > 3) Enable migration with invariant TSC.
>> > 
>> > 4) Fullvirt fix for local APIC deadline timer bug
>> 
>> The list looks good.
>> 
>> I'll resume work on (4) now that rework of APIC timers is merged.
>> It's going to be ugly on the guest side, because linux could be using it
>> even when not using kvmclock for sched clock ...
> 
> From previous discussion on the thread Eduardo started:
|> 
|>>> > Actually, a nicer fix would be to check the different
|>>> > frequencies and scale the deadline relative to the difference.
|>>>
|>>> You cannot know what exactly the guest was thinking when it set the
|>>> TSC
|>>> deadline.  Perhaps it wanted an interrupt when the TSC was exactly
|>>> 9876543210.
|> 
|> Yep, the spec just says that the timer fires when TSC >= deadline.
|> 
|>> You can't expect the correlation between TSC value and timer interrupt
|>> execution to be precise, because of the delay between HW timer
|>> expiration and interrupt execution.
|> 
|> Yes, this is valid.
|> 
|>> So you have to live with the fact that the TSC deadline timer can be
|>> late (which is the same thing as with your paravirt solution, in case
|>> of migration to host with faster TSC freq) (which to me renders the
|>> argument above invalid).
|>>
|>> Solution for old guests and new guests:
|>> Just save how far ahead in the future the TSC deadline timer is
|>> supposed
|>> to expire on the source (in ns), in the destination save all registers
|>> (but disable TSC deadline timer injection), arm a timer in QEMU
|>> for expiration time, reenable TSC deadline timer reinjection.
|> 
|> The interrupt can also fire early after migrating to a TSC with lower
|> frequency, which violates the definition of TSC deadline timer when an
|> OS could even RDTSC a value lower than the deadline in the interrupt
|> handler.  (An OS doesn't need to expect that.)
|> 
|> We already have vcpu->arch.virtual_tsc_khz that ought to remain constant
|> for a lifetime of the guest and KVM uses it to convert TSC delta into
|> correct nanoseconds.
|> 
|> The main problem is that QEMU changes virtual_tsc_khz when migrating
|> without hardware scaling, so KVM is forced to get nanoseconds wrong ...
|> 
|> If QEMU doesn't want to keep the TSC frequency constant, then it would
|> be better if it didn't expose TSC in CPUID -- guest would just use
|> kvmclock without being tempted by direct TSC accesses.
|> 
|> 
>> > (BTW Paolo Windows is also vulnerable right).
>> 
>> Ugh, we can't do much more than disabling TSC deadline there until QEMU
>> can migrate it.
> 
> So the idea was to convert to nanoseconds, count a timer
> in nanoseconds, and when it expires inject immediatelly.
> (Yes its ugly).

Problematic, because we can't know that the timer was used to count real
time: maybe the guest really wanted to count TSC cycles.  And it is just
wrong if we would fire the timer while the guest could rdtsc() a value
before the deadline in its handler, which might happen if we migrated to
a TSC with a lower frequency.

> (btw i suppose you can talk to destination via the command interface 
> on the postcopy patches now), say to get the destination tsc frequency.

The problem is that changing TSC frequency on the host affects all
following interrupts, because KVM is not informed of the new frequency.
The one timer armed during migration is relatively insignificant, so
Paolo's patches even ignored it.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 20:07               ` Radim Krčmář
  0 siblings, 0 replies; 80+ messages in thread
From: Radim Krčmář @ 2016-11-04 20:07 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan

2016-11-04 16:29-0200, Marcelo Tosatti:
> On Fri, Nov 04, 2016 at 06:34:20PM +0100, Radim Krčmář wrote:
>> 2016-11-04 14:24-0200, Marcelo Tosatti:
>> > On Fri, Nov 04, 2016 at 04:48:28PM +0100, Radim Krčmář wrote:
>> >> 2016-11-04 16:33+0100, Paolo Bonzini:
>> >> > On 04/11/2016 16:25, Radim Krčmář wrote:
>> >> >>> >  
>> >> >>> > +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
>> >> >>> > +            s->clock += s->advance_clock;
>> >> >>> > +            s->advance_clock = 0;
>> >> >>> > +        }
>> >> >> Can't the advance_clock added to the migrated KVMClockState instead of
>> >> >> passing it as another parameter?
>> >> >> 
>> >> >> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
>> >> >>  because of the Linux bug.)
>> >> > 
>> >> > What Linux bug?  The one that makes us use kvmclock_current_nsec?
>> >> 
>> >> No, the one that forced Marcelo to add the 10 minute limit to the
>> >> advance_clock. 
>> > 
>> > Overflow when reading clocksource, in the timer interrupt.
>> 
>> Is it still there?  I wonder why 10 minutes is the limit. :)
> 
> Second question: I don't know, i guess it started as a 
> "lets clamp this to values that make sense to catch
> any potential offenders" but now i think it makes 
> sense to say:
> 
> "because it does not make sense to search for the smaller
> overflow of Linux guests and use that".
> 
> So 10minutes works for me using the sentence:
> "migration should not take longer than 10 minutes".
> 
> You prefer to drop that 10 minutes check?

I would.  The overflow should mean that the value will be off, but it
would already be, so being a little more wrong doesn't seem worth the
extra code.
It is minor and v2 has r-b, so let's just forget about it. :)

> First question:
> 
> I suppose so: disable CLOCK_MONOTONIC counting on pause, 
> pause your VM for two days, and resume.
> 
> timekeeping_resume
> 
>         cycle_now = tk->tkr_mono.read(clock);
>         if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
>                 cycle_now > tk->tkr_mono.cycle_last) {
>                 u64 num, max = ULLONG_MAX;
>                 u32 mult = clock->mult;
>                 u32 shift = clock->shift;
>                 s64 nsec = 0;
> 
>                 cycle_delta = clocksource_delta(cycle_now,
> tk->tkr_mono.cycle_last,
>                                                 tk->tkr_mono.mask);
> 
>                 /*
>                  * "cycle_delta * mutl" may cause 64 bits overflow, if
>                  * the
>                  * suspended time is too long. In that case we need do
>                  * the
>                  * 64 bits math carefully
>                  */
>                 do_div(max, mult);
>                 if (cycle_delta > max) {
>                         num = div64_u64(cycle_delta, max);
>                         nsec = (((u64) max * mult) >> shift) * num;
>                         cycle_delta -= num * max;
>                 }
>                 nsec += ((u64) cycle_delta * mult) >> shift;

This seems like it handles the overflow.  And does it tragically
ineffectively, so I can understand that they didn't want to do that in
interrupt context.  Using 128 bit multiplication (on x86_64) and shift
would get it done in a fraction of the time.

[...]
> For example.

I found the 10 minues in __clocksource_update_freq_scale():

	if (freq) {
		/*
		 * Calc the maximum number of seconds which we can run before
		 * wrapping around. For clocksources which have a mask > 32-bit
		 * we need to limit the max sleep time to have a good
		 * conversion precision. 10 minutes is still a reasonable
		 * amount. That results in a shift value of 24 for a
		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
		 * ~ 0.06ppm granularity for NTP.
		 */
		sec = cs->mask;
		do_div(sec, freq);
		do_div(sec, scale);
		if (!sec)
			sec = 1;
		else if (sec > 600 && cs->mask > UINT_MAX)
			sec = 600;

		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
				       NSEC_PER_SEC / scale, sec * scale);
	}

>> >>  We wouldn't need this advance_clock hack if we could
>> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
>> >> clock should count only if vm is running").
>> > 
>> > People have requested that CLOCK_MONOTONIC stops when 
>> > vm suspends.
>> 
>> Ugh, we could have done that on the OS level, 
> 
> Done it how at the OS level ?

With the use of another interface steal-time like interface.

>> but stopping kvmclock
>> means that we sabotaged CLOCK_BOOTTIME as it doesn't return
>> CLOCK_MONOTONIC + time spent in suspend anymore.
> 
> It does:
> So this patchset introduces CLOCK_BOOTTIME, which is identical
> to CLOCK_MONOTONIC, but includes any time spent in suspend.
> 
> "suspend" refers to suspend-to-RAM, not "virtual machine stopped".
> 
> CLOCK_BOOTTIME fails to work when you extend "suspend" 
> to "suspend AND virtual machine pause or restoration".
> 
> (if you want that, can be fixed easily i supposed, but 
> nobody ever asked for it).

Yeah, it's just weird, because the reason why people wanted
CLOCK_MONOTONIC was quite stupid (to see how long it was unpaused ...)
while having a useable timesource seems quite important.

>> And kvmclock is defined to count nanosecond (slightly different in
>> practice) since some point in time, so pausing it is not really valid.
> 
> Sorry, where is that defined? Nobody makes that assumption
> (that kvmclock should be correlated to some physical time
> and that it can't stop counting).

Documentation/virtual/kvm/msr.txt

  system_time: a host notion of monotonic time, including sleep
  time at the time this structure was last updated. Unit is
  nanoseconds.

and arch/x86/include/asm/pvclock-abi.h:

  * pvclock_vcpu_time_info holds the system time and the tsc timestamp
  * of the last update. So the guest can use the tsc delta to get a
  * more precise system time.  There is one per virtual cpu.
  *
  * pvclock_wall_clock references the point in time when the system
  * time was zero (usually boot time), thus the guest calculates the
  * current wall clock by adding the system time.

>> >> > It should work with 4.9-rc (well, once Linus applies my pull request).
>> >> > 4.9-rc will not return ktime_get_ns for KVM_GET_CLOCK; it will return
>> >> > the raw value from the kernel timekeeper.
>> >> > 
>> >> > I'm thinking that we should add a KVM capability for this, and skip
>> >> > kvmclock_current_nsec if the capability is present.  The first part is
>> >> > trivial, so we can do it even during Linux rc period.
>> >> 
>> >> I agree, that sounds like a nice improvement.
>> > 
>> > I fail to see how this (KVM_GET_CLOCK using raw value (vs NTP corrected)
>> > relates to clocksource overflow and CLOCK_MONOTONIC stop requests.
>> 
>> If the guest uses master clock, then kvmclock drifts away from BOOTTIME,
>> so returning the value of kvmclock using kernel_ns() would introduce a
>> drift when setting the clock -- we must read the same way that the guest
>> does.
> 
> Does not work:
> 
>     (periodic incremental reads from TSC, using the offset from
>      previous TSC read, storing to scaling to a memory location)
>     and reads interpolating with TSC (what kernel does)
> 
>         != (different values than)
> 
>     (TSC scaled to nanoseconds)
> 
> Its not obvious to me this should be the case, perhaps if done 
> carefully can work.

This can't be equal if the TSC frequency is not constant, and it is not.
KVM master clock is using different TSC frequency than the kernel.
The current state is weird, because if we ever recompute the master
clock, we use kvm_get_time_and_clockread() for the new base, which
shifts the time. :/

We could match CLOCK_BOOTTIME with kvmclock if we wanted ...  It would
be much saner, but we'd recomputing the master clock every time the host
time gets updated.

> (See the thread with Roman, this was determined empirically).
> 
>> > -----
>> > 
>> > Todo list (collective???):
>> > 
>> > 1) Implement suggestion to Roman: 
>> > "Subject: Re: [PATCH] x86/kvm: fix condition to update kvm master
>> > clocks" to handle the time backwards when updating masterclock 
>> > from kernel clock.
>> > 
>> > 2) Review TSC scaling support (make sure scaled TSC 
>> > is propagated properly everywhere).
>> > 
>> > 3) Enable migration with invariant TSC.
>> > 
>> > 4) Fullvirt fix for local APIC deadline timer bug
>> 
>> The list looks good.
>> 
>> I'll resume work on (4) now that rework of APIC timers is merged.
>> It's going to be ugly on the guest side, because linux could be using it
>> even when not using kvmclock for sched clock ...
> 
> From previous discussion on the thread Eduardo started:
|> 
|>>> > Actually, a nicer fix would be to check the different
|>>> > frequencies and scale the deadline relative to the difference.
|>>>
|>>> You cannot know what exactly the guest was thinking when it set the
|>>> TSC
|>>> deadline.  Perhaps it wanted an interrupt when the TSC was exactly
|>>> 9876543210.
|> 
|> Yep, the spec just says that the timer fires when TSC >= deadline.
|> 
|>> You can't expect the correlation between TSC value and timer interrupt
|>> execution to be precise, because of the delay between HW timer
|>> expiration and interrupt execution.
|> 
|> Yes, this is valid.
|> 
|>> So you have to live with the fact that the TSC deadline timer can be
|>> late (which is the same thing as with your paravirt solution, in case
|>> of migration to host with faster TSC freq) (which to me renders the
|>> argument above invalid).
|>>
|>> Solution for old guests and new guests:
|>> Just save how far ahead in the future the TSC deadline timer is
|>> supposed
|>> to expire on the source (in ns), in the destination save all registers
|>> (but disable TSC deadline timer injection), arm a timer in QEMU
|>> for expiration time, reenable TSC deadline timer reinjection.
|> 
|> The interrupt can also fire early after migrating to a TSC with lower
|> frequency, which violates the definition of TSC deadline timer when an
|> OS could even RDTSC a value lower than the deadline in the interrupt
|> handler.  (An OS doesn't need to expect that.)
|> 
|> We already have vcpu->arch.virtual_tsc_khz that ought to remain constant
|> for a lifetime of the guest and KVM uses it to convert TSC delta into
|> correct nanoseconds.
|> 
|> The main problem is that QEMU changes virtual_tsc_khz when migrating
|> without hardware scaling, so KVM is forced to get nanoseconds wrong ...
|> 
|> If QEMU doesn't want to keep the TSC frequency constant, then it would
|> be better if it didn't expose TSC in CPUID -- guest would just use
|> kvmclock without being tempted by direct TSC accesses.
|> 
|> 
>> > (BTW Paolo Windows is also vulnerable right).
>> 
>> Ugh, we can't do much more than disabling TSC deadline there until QEMU
>> can migrate it.
> 
> So the idea was to convert to nanoseconds, count a timer
> in nanoseconds, and when it expires inject immediatelly.
> (Yes its ugly).

Problematic, because we can't know that the timer was used to count real
time: maybe the guest really wanted to count TSC cycles.  And it is just
wrong if we would fire the timer while the guest could rdtsc() a value
before the deadline in its handler, which might happen if we migrated to
a TSC with a lower frequency.

> (btw i suppose you can talk to destination via the command interface 
> on the postcopy patches now), say to get the destination tsc frequency.

The problem is that changing TSC frequency on the host affects all
following interrupts, because KVM is not informed of the new frequency.
The one timer armed during migration is relatively insignificant, so
Paolo's patches even ignored it.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 17:16           ` [Qemu-devel] " Radim Krčmář
@ 2016-11-04 21:29             ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 21:29 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan


> >> No, the one that forced Marcelo to add the 10 minute limit to the
> >> advance_clock.  We wouldn't need this advance_clock hack if we could
> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> >> clock should count only if vm is running").
> > 
> > There are two cases:
> > 
> > - migrating a paused guest
> > 
> > - pausing at the end of migration
> > 
> > In the first case, kvmclock_vm_state_change's !running branch will see
> > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> 
> I lift my case, marcelo's said that stopping the time is a feature ...
> (*kittens die*)

But that's why separating the two cases brings us the best of both worlds.
If migrating a paused guest, there's no need for any adjustment, so no
advance_clock hack.  If pausing at the end of migration, there's no need
to pause kvmclock (this patch is effectively working around 00f4d64ee76e)
and if we don't do that we can just call KVM_GET_CLOCK at pre_save time.

> Oh, and this does introduce a minor bug to this patch -- the time
> counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
> Not accounting for that is bearable.

Not really, I was going to point that out when I got to replying with
a review. :)

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 21:29             ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 21:29 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost, Roman Kagan


> >> No, the one that forced Marcelo to add the 10 minute limit to the
> >> advance_clock.  We wouldn't need this advance_clock hack if we could
> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> >> clock should count only if vm is running").
> > 
> > There are two cases:
> > 
> > - migrating a paused guest
> > 
> > - pausing at the end of migration
> > 
> > In the first case, kvmclock_vm_state_change's !running branch will see
> > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> 
> I lift my case, marcelo's said that stopping the time is a feature ...
> (*kittens die*)

But that's why separating the two cases brings us the best of both worlds.
If migrating a paused guest, there's no need for any adjustment, so no
advance_clock hack.  If pausing at the end of migration, there's no need
to pause kvmclock (this patch is effectively working around 00f4d64ee76e)
and if we don't do that we can just call KVM_GET_CLOCK at pre_save time.

> Oh, and this does introduce a minor bug to this patch -- the time
> counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
> Not accounting for that is bearable.

Not really, I was going to point that out when I got to replying with
a review. :)

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 21:29             ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-04 21:47               ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 21:47 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Radim Krčmář,
	kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela,
	Eduardo Habkost, Roman Kagan

On Fri, Nov 04, 2016 at 05:29:36PM -0400, Paolo Bonzini wrote:
> 
> > >> No, the one that forced Marcelo to add the 10 minute limit to the
> > >> advance_clock.  We wouldn't need this advance_clock hack if we could
> > >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> > >> clock should count only if vm is running").
> > > 
> > > There are two cases:
> > > 
> > > - migrating a paused guest
> > > 
> > > - pausing at the end of migration
> > > 
> > > In the first case, kvmclock_vm_state_change's !running branch will see
> > > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> > 
> > I lift my case, marcelo's said that stopping the time is a feature ...
> > (*kittens die*)
> 
> But that's why separating the two cases brings us the best of both worlds.
> If migrating a paused guest, there's no need for any adjustment, so no
> advance_clock hack.  If pausing at the end of migration, there's no need
> to pause kvmclock (this patch is effectively working around 00f4d64ee76e)
> and if we don't do that we can just call KVM_GET_CLOCK at pre_save time.

That was my internal v1. But then, the destination ignores s->clock
as follows:

    if (running) {
        struct kvm_clock_data data = {};
        uint64_t time_at_migration = kvmclock_current_nsec(s);

        s->clock_valid = false;

        /* We can't rely on the migrated clock value, just discard it */
        if (time_at_migration) {
            s->clock = time_at_migration;
        }

        data.clock = s->clock;
        ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);

So you need to send that "ns" value (difference of two clock reads)
separately.

> 
> > Oh, and this does introduce a minor bug to this patch -- the time
> > counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
> > Not accounting for that is bearable.
> 
> Not really, I was going to point that out when I got to replying with
> a review. :)
> 
> Paolo


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 21:47               ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-04 21:47 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Radim Krčmář,
	kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela,
	Eduardo Habkost, Roman Kagan

On Fri, Nov 04, 2016 at 05:29:36PM -0400, Paolo Bonzini wrote:
> 
> > >> No, the one that forced Marcelo to add the 10 minute limit to the
> > >> advance_clock.  We wouldn't need this advance_clock hack if we could
> > >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> > >> clock should count only if vm is running").
> > > 
> > > There are two cases:
> > > 
> > > - migrating a paused guest
> > > 
> > > - pausing at the end of migration
> > > 
> > > In the first case, kvmclock_vm_state_change's !running branch will see
> > > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> > 
> > I lift my case, marcelo's said that stopping the time is a feature ...
> > (*kittens die*)
> 
> But that's why separating the two cases brings us the best of both worlds.
> If migrating a paused guest, there's no need for any adjustment, so no
> advance_clock hack.  If pausing at the end of migration, there's no need
> to pause kvmclock (this patch is effectively working around 00f4d64ee76e)
> and if we don't do that we can just call KVM_GET_CLOCK at pre_save time.

That was my internal v1. But then, the destination ignores s->clock
as follows:

    if (running) {
        struct kvm_clock_data data = {};
        uint64_t time_at_migration = kvmclock_current_nsec(s);

        s->clock_valid = false;

        /* We can't rely on the migrated clock value, just discard it */
        if (time_at_migration) {
            s->clock = time_at_migration;
        }

        data.clock = s->clock;
        ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);

So you need to send that "ns" value (difference of two clock reads)
separately.

> 
> > Oh, and this does introduce a minor bug to this patch -- the time
> > counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
> > Not accounting for that is bearable.
> 
> Not really, I was going to point that out when I got to replying with
> a review. :)
> 
> Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 21:47               ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-04 22:35                 ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 22:35 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Radim Krčmář,
	kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela,
	Eduardo Habkost, Roman Kagan

> > But that's why separating the two cases brings us the best of both worlds.
> > If migrating a paused guest, there's no need for any adjustment, so no
> > advance_clock hack.  If pausing at the end of migration, there's no need
> > to pause kvmclock (this patch is effectively working around 00f4d64ee76e)
> > and if we don't do that we can just call KVM_GET_CLOCK at pre_save time.
> 
> That was my internal v1. But then, the destination ignores s->clock
> as follows:
> 
>     if (running) {
>         struct kvm_clock_data data = {};
>         uint64_t time_at_migration = kvmclock_current_nsec(s);

Right, but this is unnecessary in 4.9-rc, where KVM_GET_CLOCK *already*
returns the same as QEMU's kvmclock_current_nsec.  So this is the other
half of my suggestion, where we need to check the behavior of KVM_GET_CLOCK
via KVM_CHECK_EXTENSION.

I think it's okay to only fix the bug with master clock enabled and
for new KVM with sensible KVM_GET_CLOCK semantics.

Paolo

>         s->clock_valid = false;
> 
>         /* We can't rely on the migrated clock value, just discard it */
>         if (time_at_migration) {
>             s->clock = time_at_migration;
>         }
> 
>         data.clock = s->clock;
>         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> 
> So you need to send that "ns" value (difference of two clock reads)
> separately.
> 
> > 
> > > Oh, and this does introduce a minor bug to this patch -- the time
> > > counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
> > > Not accounting for that is bearable.
> > 
> > Not really, I was going to point that out when I got to replying with
> > a review. :)
> > 
> > Paolo
> 
> 

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-04 22:35                 ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-04 22:35 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Radim Krčmář,
	kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela,
	Eduardo Habkost, Roman Kagan

> > But that's why separating the two cases brings us the best of both worlds.
> > If migrating a paused guest, there's no need for any adjustment, so no
> > advance_clock hack.  If pausing at the end of migration, there's no need
> > to pause kvmclock (this patch is effectively working around 00f4d64ee76e)
> > and if we don't do that we can just call KVM_GET_CLOCK at pre_save time.
> 
> That was my internal v1. But then, the destination ignores s->clock
> as follows:
> 
>     if (running) {
>         struct kvm_clock_data data = {};
>         uint64_t time_at_migration = kvmclock_current_nsec(s);

Right, but this is unnecessary in 4.9-rc, where KVM_GET_CLOCK *already*
returns the same as QEMU's kvmclock_current_nsec.  So this is the other
half of my suggestion, where we need to check the behavior of KVM_GET_CLOCK
via KVM_CHECK_EXTENSION.

I think it's okay to only fix the bug with master clock enabled and
for new KVM with sensible KVM_GET_CLOCK semantics.

Paolo

>         s->clock_valid = false;
> 
>         /* We can't rely on the migrated clock value, just discard it */
>         if (time_at_migration) {
>             s->clock = time_at_migration;
>         }
> 
>         data.clock = s->clock;
>         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> 
> So you need to send that "ns" value (difference of two clock reads)
> separately.
> 
> > 
> > > Oh, and this does introduce a minor bug to this patch -- the time
> > > counted by KVM_GET_CLOCK is has different frequency CLOCK_MONOTONIC.
> > > Not accounting for that is bearable.
> > 
> > Not really, I was going to point that out when I got to replying with
> > a review. :)
> > 
> > Paolo
> 
> 

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 17:39       ` [Qemu-devel] " Radim Krčmář
@ 2016-11-07 13:08         ` Dr. David Alan Gilbert
  -1 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-07 13:08 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm, qemu-devel, Paolo Bonzini, Juan Quintela,
	Eduardo Habkost

* Radim Krčmář (rkrcmar@redhat.com) wrote:
> 2016-11-04 15:07-0200, Marcelo Tosatti:
> > On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> >> > +        /*
> >> > +         * Transition from VM-running to VM-stopped via migration?
> >> > +         * Record when the VM was stopped.
> >> > +         */
> >> > +
> >> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> >> > +            !migration_in_postcopy(migrate_get_current())) {
> >> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> >> 
> >> How big (more like small) was the clock delta between here and
> >> kvmclock_pre_save with postcopy?
> >> 
> >> Thanks.
> > 
> > qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
> > available: Function not implemented
> > 
> > But should be about the same as precopy+this patch (guess as i don't 
> > know the postcopy path).
> 
> I was wondering about the improvement we could achieve by not excluding
> postcopy from the time fixup.  (i.e. how much time elapses between
> pausing and migrating the vm in postcopy)
> 
> I would also guess that it is not significant.

It can add up, so it would be useful to be able to do this trick.
We have to calculate and send some 'discard maps' that can take
some time, especially on larger VMs, and we also have to serialise
the state of all non-RAM devices, so if you have a lot of other
emulated devices it can take a bit more time.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-07 13:08         ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-07 13:08 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Marcelo Tosatti, kvm, qemu-devel, Paolo Bonzini, Juan Quintela,
	Eduardo Habkost

* Radim Krčmář (rkrcmar@redhat.com) wrote:
> 2016-11-04 15:07-0200, Marcelo Tosatti:
> > On Fri, Nov 04, 2016 at 04:25:23PM +0100, Radim Krčmář wrote:
> >> > +        /*
> >> > +         * Transition from VM-running to VM-stopped via migration?
> >> > +         * Record when the VM was stopped.
> >> > +         */
> >> > +
> >> > +        if (state == RUN_STATE_FINISH_MIGRATE &&
> >> > +            !migration_in_postcopy(migrate_get_current())) {
> >> > +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> >> 
> >> How big (more like small) was the clock delta between here and
> >> kvmclock_pre_save with postcopy?
> >> 
> >> Thanks.
> > 
> > qemu-system-x86_64: postcopy_ram_supported_by_host: userfaultfd not
> > available: Function not implemented
> > 
> > But should be about the same as precopy+this patch (guess as i don't 
> > know the postcopy path).
> 
> I was wondering about the improvement we could achieve by not excluding
> postcopy from the time fixup.  (i.e. how much time elapses between
> pausing and migrating the vm in postcopy)
> 
> I would also guess that it is not significant.

It can add up, so it would be useful to be able to do this trick.
We have to calculate and send some 'discard maps' that can take
some time, especially on larger VMs, and we also have to serialise
the state of all non-RAM devices, so if you have a lot of other
emulated devices it can take a bit more time.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 17:16           ` [Qemu-devel] " Radim Krčmář
@ 2016-11-07 14:31             ` Roman Kagan
  -1 siblings, 0 replies; 80+ messages in thread
From: Roman Kagan @ 2016-11-07 14:31 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Eduardo Habkost, kvm, Juan Quintela, Marcelo Tosatti, qemu-devel,
	Dr. David Alan Gilbert, Paolo Bonzini

On Fri, Nov 04, 2016 at 06:16:06PM +0100, Radim Krčmář wrote:
> 2016-11-04 16:57+0100, Paolo Bonzini:
> > On 04/11/2016 16:48, Radim Krčmář wrote:
> >> 2016-11-04 16:33+0100, Paolo Bonzini:
> >>> On 04/11/2016 16:25, Radim Krčmář wrote:
> >>>>>>  
> >>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> >>>>>> +            s->clock += s->advance_clock;
> >>>>>> +            s->advance_clock = 0;
> >>>>>> +        }
> >>>> Can't the advance_clock added to the migrated KVMClockState instead of
> >>>> passing it as another parameter?
> >>>>
> >>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> >>>>  because of the Linux bug.)
> >>>
> >>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
> >> 
> >> No, the one that forced Marcelo to add the 10 minute limit to the
> >> advance_clock.  We wouldn't need this advance_clock hack if we could
> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> >> clock should count only if vm is running").
> > 
> > There are two cases:
> > 
> > - migrating a paused guest
> > 
> > - pausing at the end of migration
> > 
> > In the first case, kvmclock_vm_state_change's !running branch will see
> > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> 
> I lift my case, marcelo's said that stopping the time is a feature ...
> (*kittens die*)

Sorry to chime in in the middle of the thread, but I wonder how happy
the guests are with this behavior.  Intuitively pausing or snapshotting
feels like closing the lid of a laptop, so every time I see the guest
waking up in the past after a pause I get confused.  It may also be
unexpected by Windows guests who never had this overflow problem but
now, being tied up with kvmclock, have to stop the time while in pause,
too.

Roman.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-07 14:31             ` Roman Kagan
  0 siblings, 0 replies; 80+ messages in thread
From: Roman Kagan @ 2016-11-07 14:31 UTC (permalink / raw)
  To: Radim Krčmář
  Cc: Paolo Bonzini, Marcelo Tosatti, kvm, qemu-devel,
	Dr. David Alan Gilbert, Juan Quintela, Eduardo Habkost

On Fri, Nov 04, 2016 at 06:16:06PM +0100, Radim Krčmář wrote:
> 2016-11-04 16:57+0100, Paolo Bonzini:
> > On 04/11/2016 16:48, Radim Krčmář wrote:
> >> 2016-11-04 16:33+0100, Paolo Bonzini:
> >>> On 04/11/2016 16:25, Radim Krčmář wrote:
> >>>>>>  
> >>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> >>>>>> +            s->clock += s->advance_clock;
> >>>>>> +            s->advance_clock = 0;
> >>>>>> +        }
> >>>> Can't the advance_clock added to the migrated KVMClockState instead of
> >>>> passing it as another parameter?
> >>>>
> >>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> >>>>  because of the Linux bug.)
> >>>
> >>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
> >> 
> >> No, the one that forced Marcelo to add the 10 minute limit to the
> >> advance_clock.  We wouldn't need this advance_clock hack if we could
> >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> >> clock should count only if vm is running").
> > 
> > There are two cases:
> > 
> > - migrating a paused guest
> > 
> > - pausing at the end of migration
> > 
> > In the first case, kvmclock_vm_state_change's !running branch will see
> > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> 
> I lift my case, marcelo's said that stopping the time is a feature ...
> (*kittens die*)

Sorry to chime in in the middle of the thread, but I wonder how happy
the guests are with this behavior.  Intuitively pausing or snapshotting
feels like closing the lid of a laptop, so every time I see the guest
waking up in the past after a pause I get confused.  It may also be
unexpected by Windows guests who never had this overflow problem but
now, being tied up with kvmclock, have to stop the time while in pause,
too.

Roman.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-04 16:59   ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-07 15:46     ` Dr. David Alan Gilbert
  -1 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-07 15:46 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(),
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
> 
> In a VM with 5 seconds downtime, this reduces the guest
> clock difference on destination from 5s to 0.2s.
> 
> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.

One thing that bothers me is that it's only this clock that's
getting corrected; doesn't it cause things to get upset when
one clock moves and the others dont?
Shouldn't the pause delay be recorded somewhere architecturally
independent and then be a thing that kvm-clock happens to use and
other clocks might as well?

Dave

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
> 
> v2: use subsection (Juan Quintela)
>     fix older machine types support
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> index 0f75dd3..a2a02ac 100644
> --- a/hw/i386/kvm/clock.c
> +++ b/hw/i386/kvm/clock.c
> @@ -22,9 +22,11 @@
>  #include "kvm_i386.h"
>  #include "hw/sysbus.h"
>  #include "hw/kvm/clock.h"
> +#include "migration/migration.h"
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> +#include <time.h>
>  
>  #define TYPE_KVM_CLOCK "kvmclock"
>  #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
> @@ -35,7 +37,13 @@ typedef struct KVMClockState {
>      /*< public >*/
>  
>      uint64_t clock;
> +    uint64_t ns;
>      bool clock_valid;
> +
> +    uint64_t advance_clock;
> +    struct timespec t_aftervmstop;
> +
> +    bool adv_clock_enabled;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              s->clock = time_at_migration;
>          }
>  
> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> +            s->clock += s->advance_clock;
> +            s->advance_clock = 0;
> +        }
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              abort();
>          }
>          s->clock = data.clock;
> +        /*
> +         * Transition from VM-running to VM-stopped via migration?
> +         * Record when the VM was stopped.
> +         */
> +
> +        if (state == RUN_STATE_FINISH_MIGRATE &&
> +            !migration_in_postcopy(migrate_get_current())) {
> +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> +        } else {
> +            s->t_aftervmstop.tv_sec = 0;
> +            s->t_aftervmstop.tv_nsec = 0;
> +        }
>  
>          /*
>           * If the VM is stopped, declare the clock state valid to
> @@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> +{
> +    if (before->tv_sec > after->tv_sec ||
> +        (before->tv_sec == after->tv_sec &&
> +         before->tv_nsec > after->tv_nsec)) {
> +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> +        abort();
> +    }
> +
> +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> +            after->tv_nsec - before->tv_nsec;
> +}
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct timespec now;
> +    uint64_t ns;
> +
> +    if (s->t_aftervmstop.tv_sec == 0) {
> +        return;
> +    }
> +
> +    clock_gettime(CLOCK_MONOTONIC, &now);
> +
> +    ns = clock_delta(&s->t_aftervmstop, &now);
> +
> +    /*
> +     * Linux guests can overflow if time jumps
> +     * forward in large increments.
> +     * Cap maximum adjustment to 10 minutes.
> +     */
> +    ns = MIN(ns, 600000000000ULL);
> +
> +    if (s->clock + ns > s->clock) {
> +        s->ns = ns;
> +    }
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /* save the value from incoming migration */
> +    s->advance_clock = s->ns;
> +
> +    return 0;
> +}
> +
> +static bool kvmclock_ns_needed(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->adv_clock_enabled;
> +}
> +
> +static const VMStateDescription kvmclock_advance_ns = {
> +    .name = "kvmclock/advance_ns",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_ns_needed,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(ns, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
> @@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = {
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_advance_ns,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 98dc772..243352e 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "advance_clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-07 15:46     ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-07 15:46 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> This patch, relative to pre-copy migration codepath,
> measures the time between vm_stop() and pre_save(),
> which includes copying the remaining RAM to destination,
> and advances the clock by that amount.
> 
> In a VM with 5 seconds downtime, this reduces the guest
> clock difference on destination from 5s to 0.2s.
> 
> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.

One thing that bothers me is that it's only this clock that's
getting corrected; doesn't it cause things to get upset when
one clock moves and the others dont?
Shouldn't the pause delay be recorded somewhere architecturally
independent and then be a thing that kvm-clock happens to use and
other clocks might as well?

Dave

> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
> 
> v2: use subsection (Juan Quintela)
>     fix older machine types support
> 
> diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
> index 0f75dd3..a2a02ac 100644
> --- a/hw/i386/kvm/clock.c
> +++ b/hw/i386/kvm/clock.c
> @@ -22,9 +22,11 @@
>  #include "kvm_i386.h"
>  #include "hw/sysbus.h"
>  #include "hw/kvm/clock.h"
> +#include "migration/migration.h"
>  
>  #include <linux/kvm.h>
>  #include <linux/kvm_para.h>
> +#include <time.h>
>  
>  #define TYPE_KVM_CLOCK "kvmclock"
>  #define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
> @@ -35,7 +37,13 @@ typedef struct KVMClockState {
>      /*< public >*/
>  
>      uint64_t clock;
> +    uint64_t ns;
>      bool clock_valid;
> +
> +    uint64_t advance_clock;
> +    struct timespec t_aftervmstop;
> +
> +    bool adv_clock_enabled;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -100,6 +108,11 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              s->clock = time_at_migration;
>          }
>  
> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> +            s->clock += s->advance_clock;
> +            s->advance_clock = 0;
> +        }
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -135,6 +148,18 @@ static void kvmclock_vm_state_change(void *opaque, int running,
>              abort();
>          }
>          s->clock = data.clock;
> +        /*
> +         * Transition from VM-running to VM-stopped via migration?
> +         * Record when the VM was stopped.
> +         */
> +
> +        if (state == RUN_STATE_FINISH_MIGRATE &&
> +            !migration_in_postcopy(migrate_get_current())) {
> +            clock_gettime(CLOCK_MONOTONIC, &s->t_aftervmstop);
> +        } else {
> +            s->t_aftervmstop.tv_sec = 0;
> +            s->t_aftervmstop.tv_nsec = 0;
> +        }
>  
>          /*
>           * If the VM is stopped, declare the clock state valid to
> @@ -152,6 +177,77 @@ static void kvmclock_realize(DeviceState *dev, Error **errp)
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static uint64_t clock_delta(struct timespec *before, struct timespec *after)
> +{
> +    if (before->tv_sec > after->tv_sec ||
> +        (before->tv_sec == after->tv_sec &&
> +         before->tv_nsec > after->tv_nsec)) {
> +        fprintf(stderr, "clock_delta failed: before=(%ld sec, %ld nsec),"
> +                        "after=(%ld sec, %ld nsec)\n", before->tv_sec,
> +                        before->tv_nsec, after->tv_sec, after->tv_nsec);
> +        abort();
> +    }
> +
> +    return (after->tv_sec - before->tv_sec) * 1000000000ULL +
> +            after->tv_nsec - before->tv_nsec;
> +}
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +    struct timespec now;
> +    uint64_t ns;
> +
> +    if (s->t_aftervmstop.tv_sec == 0) {
> +        return;
> +    }
> +
> +    clock_gettime(CLOCK_MONOTONIC, &now);
> +
> +    ns = clock_delta(&s->t_aftervmstop, &now);
> +
> +    /*
> +     * Linux guests can overflow if time jumps
> +     * forward in large increments.
> +     * Cap maximum adjustment to 10 minutes.
> +     */
> +    ns = MIN(ns, 600000000000ULL);
> +
> +    if (s->clock + ns > s->clock) {
> +        s->ns = ns;
> +    }
> +}
> +
> +static int kvmclock_post_load(void *opaque, int version_id)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /* save the value from incoming migration */
> +    s->advance_clock = s->ns;
> +
> +    return 0;
> +}
> +
> +static bool kvmclock_ns_needed(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->adv_clock_enabled;
> +}
> +
> +static const VMStateDescription kvmclock_advance_ns = {
> +    .name = "kvmclock/advance_ns",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_ns_needed,
> +    .pre_save = kvmclock_pre_save,
> +    .post_load = kvmclock_post_load,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT64(ns, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
> @@ -159,15 +255,25 @@ static const VMStateDescription kvmclock_vmsd = {
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_advance_ns,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("advance_clock", KVMClockState, adv_clock_enabled, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 98dc772..243352e 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -370,6 +370,11 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "advance_clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-07 14:31             ` [Qemu-devel] " Roman Kagan
@ 2016-11-07 19:31               ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-07 19:31 UTC (permalink / raw)
  To: Roman Kagan, Radim Krčmář,
	Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost

On Mon, Nov 07, 2016 at 05:31:49PM +0300, Roman Kagan wrote:
> On Fri, Nov 04, 2016 at 06:16:06PM +0100, Radim Krčmář wrote:
> > 2016-11-04 16:57+0100, Paolo Bonzini:
> > > On 04/11/2016 16:48, Radim Krčmář wrote:
> > >> 2016-11-04 16:33+0100, Paolo Bonzini:
> > >>> On 04/11/2016 16:25, Radim Krčmář wrote:
> > >>>>>>  
> > >>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> > >>>>>> +            s->clock += s->advance_clock;
> > >>>>>> +            s->advance_clock = 0;
> > >>>>>> +        }
> > >>>> Can't the advance_clock added to the migrated KVMClockState instead of
> > >>>> passing it as another parameter?
> > >>>>
> > >>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> > >>>>  because of the Linux bug.)
> > >>>
> > >>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
> > >> 
> > >> No, the one that forced Marcelo to add the 10 minute limit to the
> > >> advance_clock.  We wouldn't need this advance_clock hack if we could
> > >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> > >> clock should count only if vm is running").
> > > 
> > > There are two cases:
> > > 
> > > - migrating a paused guest
> > > 
> > > - pausing at the end of migration
> > > 
> > > In the first case, kvmclock_vm_state_change's !running branch will see
> > > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> > 
> > I lift my case, marcelo's said that stopping the time is a feature ...
> > (*kittens die*)
> 
> Sorry to chime in in the middle of the thread, but I wonder how happy
> the guests are with this behavior.  Intuitively pausing or snapshotting
> feels like closing the lid of a laptop, so every time I see the guest
> waking up in the past after a pause I get confused.  It may also be
> unexpected by Windows guests who never had this overflow problem but
> now, being tied up with kvmclock, have to stop the time while in pause,
> too.
> 
> Roman.

Waking up should be using guest-set-time QGA API: 

http://bugzilla.redhat.com/show_bug.cgi?id=1102411

Check "virsh domtime".


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-07 19:31               ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-07 19:31 UTC (permalink / raw)
  To: Roman Kagan, Radim Krčmář,
	Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Eduardo Habkost

On Mon, Nov 07, 2016 at 05:31:49PM +0300, Roman Kagan wrote:
> On Fri, Nov 04, 2016 at 06:16:06PM +0100, Radim Krčmář wrote:
> > 2016-11-04 16:57+0100, Paolo Bonzini:
> > > On 04/11/2016 16:48, Radim Krčmář wrote:
> > >> 2016-11-04 16:33+0100, Paolo Bonzini:
> > >>> On 04/11/2016 16:25, Radim Krčmář wrote:
> > >>>>>>  
> > >>>>>> +        if (s->advance_clock && s->clock + s->advance_clock > s->clock) {
> > >>>>>> +            s->clock += s->advance_clock;
> > >>>>>> +            s->advance_clock = 0;
> > >>>>>> +        }
> > >>>> Can't the advance_clock added to the migrated KVMClockState instead of
> > >>>> passing it as another parameter?
> > >>>>
> > >>>> (It is sad that we can't just query KVMClockState in kvmclock_pre_save
> > >>>>  because of the Linux bug.)
> > >>>
> > >>> What Linux bug?  The one that makes us use kvmclock_current_nsec?
> > >> 
> > >> No, the one that forced Marcelo to add the 10 minute limit to the
> > >> advance_clock.  We wouldn't need this advance_clock hack if we could
> > >> just call KVM_GET_CLOCK like we did before 00f4d64ee76e ("kvmclock:
> > >> clock should count only if vm is running").
> > > 
> > > There are two cases:
> > > 
> > > - migrating a paused guest
> > > 
> > > - pausing at the end of migration
> > > 
> > > In the first case, kvmclock_vm_state_change's !running branch will see
> > > state == RUN_STATE_FINISH_MIGRATE && s->clock_valid.  In the second
> > > case, it will see state == RUN_STATE_FINISH_MIGRATE && !s->clock_valid.
> > 
> > I lift my case, marcelo's said that stopping the time is a feature ...
> > (*kittens die*)
> 
> Sorry to chime in in the middle of the thread, but I wonder how happy
> the guests are with this behavior.  Intuitively pausing or snapshotting
> feels like closing the lid of a laptop, so every time I see the guest
> waking up in the past after a pause I get confused.  It may also be
> unexpected by Windows guests who never had this overflow problem but
> now, being tied up with kvmclock, have to stop the time while in pause,
> too.
> 
> Roman.

Waking up should be using guest-set-time QGA API: 

http://bugzilla.redhat.com/show_bug.cgi?id=1102411

Check "virsh domtime".

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-07 15:46     ` [Qemu-devel] " Dr. David Alan Gilbert
@ 2016-11-07 19:41       ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-07 19:41 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(),
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> > 
> > In a VM with 5 seconds downtime, this reduces the guest
> > clock difference on destination from 5s to 0.2s.
> > 
> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> 
> One thing that bothers me is that it's only this clock that's
> getting corrected; doesn't it cause things to get upset when
> one clock moves and the others dont?

If you are correlating the clocks, then yes.

Older Linux guests get upset (marking the TSC clocksource unstable
because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
in newer guests
(kvmclock interface to notify watchdog to not complain).

Note marking TSC clocksource unstable on older guests is harmless
because kvmclock is the standard clocksource.

For Windows guests, i don't know that Windows correlates between different
clocks.

That is, there is relative control as to which software reads kvmclock 
or Windows TIMER MSR, so i don't see the need to advance every clock 
exposed.

> Shouldn't the pause delay be recorded somewhere architecturally
> independent and then be a thing that kvm-clock happens to use and
> other clocks might as well?

In theory, yes. In practice, i don't see the need for this... 


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-07 19:41       ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-07 19:41 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > This patch, relative to pre-copy migration codepath,
> > measures the time between vm_stop() and pre_save(),
> > which includes copying the remaining RAM to destination,
> > and advances the clock by that amount.
> > 
> > In a VM with 5 seconds downtime, this reduces the guest
> > clock difference on destination from 5s to 0.2s.
> > 
> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> 
> One thing that bothers me is that it's only this clock that's
> getting corrected; doesn't it cause things to get upset when
> one clock moves and the others dont?

If you are correlating the clocks, then yes.

Older Linux guests get upset (marking the TSC clocksource unstable
because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
in newer guests
(kvmclock interface to notify watchdog to not complain).

Note marking TSC clocksource unstable on older guests is harmless
because kvmclock is the standard clocksource.

For Windows guests, i don't know that Windows correlates between different
clocks.

That is, there is relative control as to which software reads kvmclock 
or Windows TIMER MSR, so i don't see the need to advance every clock 
exposed.

> Shouldn't the pause delay be recorded somewhere architecturally
> independent and then be a thing that kvm-clock happens to use and
> other clocks might as well?

In theory, yes. In practice, i don't see the need for this... 

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-07 19:41       ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-07 20:03         ` Dr. David Alan Gilbert
  -1 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-07 20:03 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > This patch, relative to pre-copy migration codepath,
> > > measures the time between vm_stop() and pre_save(),
> > > which includes copying the remaining RAM to destination,
> > > and advances the clock by that amount.
> > > 
> > > In a VM with 5 seconds downtime, this reduces the guest
> > > clock difference on destination from 5s to 0.2s.
> > > 
> > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > 
> > One thing that bothers me is that it's only this clock that's
> > getting corrected; doesn't it cause things to get upset when
> > one clock moves and the others dont?
> 
> If you are correlating the clocks, then yes.
> 
> Older Linux guests get upset (marking the TSC clocksource unstable
> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> in newer guests
> (kvmclock interface to notify watchdog to not complain).
> 
> Note marking TSC clocksource unstable on older guests is harmless
> because kvmclock is the standard clocksource.
> 
> For Windows guests, i don't know that Windows correlates between different
> clocks.
> 
> That is, there is relative control as to which software reads kvmclock 
> or Windows TIMER MSR, so i don't see the need to advance every clock 
> exposed.
> 
> > Shouldn't the pause delay be recorded somewhere architecturally
> > independent and then be a thing that kvm-clock happens to use and
> > other clocks might as well?
> 
> In theory, yes. In practice, i don't see the need for this... 

It seems unlikely to me that x86 is the only one that will want
to do something similar.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-07 20:03         ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-07 20:03 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > This patch, relative to pre-copy migration codepath,
> > > measures the time between vm_stop() and pre_save(),
> > > which includes copying the remaining RAM to destination,
> > > and advances the clock by that amount.
> > > 
> > > In a VM with 5 seconds downtime, this reduces the guest
> > > clock difference on destination from 5s to 0.2s.
> > > 
> > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > 
> > One thing that bothers me is that it's only this clock that's
> > getting corrected; doesn't it cause things to get upset when
> > one clock moves and the others dont?
> 
> If you are correlating the clocks, then yes.
> 
> Older Linux guests get upset (marking the TSC clocksource unstable
> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> in newer guests
> (kvmclock interface to notify watchdog to not complain).
> 
> Note marking TSC clocksource unstable on older guests is harmless
> because kvmclock is the standard clocksource.
> 
> For Windows guests, i don't know that Windows correlates between different
> clocks.
> 
> That is, there is relative control as to which software reads kvmclock 
> or Windows TIMER MSR, so i don't see the need to advance every clock 
> exposed.
> 
> > Shouldn't the pause delay be recorded somewhere architecturally
> > independent and then be a thing that kvm-clock happens to use and
> > other clocks might as well?
> 
> In theory, yes. In practice, i don't see the need for this... 

It seems unlikely to me that x86 is the only one that will want
to do something similar.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-07 20:03         ` [Qemu-devel] " Dr. David Alan Gilbert
@ 2016-11-08  0:06           ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-08  0:06 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Eduardo Habkost, kvm, Juan Quintela, Radim Krčmář,
	qemu-devel, Paolo Bonzini

On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > This patch, relative to pre-copy migration codepath,
> > > > measures the time between vm_stop() and pre_save(),
> > > > which includes copying the remaining RAM to destination,
> > > > and advances the clock by that amount.
> > > > 
> > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > clock difference on destination from 5s to 0.2s.
> > > > 
> > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > 
> > > One thing that bothers me is that it's only this clock that's
> > > getting corrected; doesn't it cause things to get upset when
> > > one clock moves and the others dont?
> > 
> > If you are correlating the clocks, then yes.
> > 
> > Older Linux guests get upset (marking the TSC clocksource unstable
> > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > in newer guests
> > (kvmclock interface to notify watchdog to not complain).
> > 
> > Note marking TSC clocksource unstable on older guests is harmless
> > because kvmclock is the standard clocksource.
> > 
> > For Windows guests, i don't know that Windows correlates between different
> > clocks.
> > 
> > That is, there is relative control as to which software reads kvmclock 
> > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > exposed.
> > 
> > > Shouldn't the pause delay be recorded somewhere architecturally
> > > independent and then be a thing that kvm-clock happens to use and
> > > other clocks might as well?
> > 
> > In theory, yes. In practice, i don't see the need for this... 
> 
> It seems unlikely to me that x86 is the only one that will want
> to do something similar.

Can't they copy what kvmclock is doing today? 

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-08  0:06           ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-08  0:06 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > This patch, relative to pre-copy migration codepath,
> > > > measures the time between vm_stop() and pre_save(),
> > > > which includes copying the remaining RAM to destination,
> > > > and advances the clock by that amount.
> > > > 
> > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > clock difference on destination from 5s to 0.2s.
> > > > 
> > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > 
> > > One thing that bothers me is that it's only this clock that's
> > > getting corrected; doesn't it cause things to get upset when
> > > one clock moves and the others dont?
> > 
> > If you are correlating the clocks, then yes.
> > 
> > Older Linux guests get upset (marking the TSC clocksource unstable
> > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > in newer guests
> > (kvmclock interface to notify watchdog to not complain).
> > 
> > Note marking TSC clocksource unstable on older guests is harmless
> > because kvmclock is the standard clocksource.
> > 
> > For Windows guests, i don't know that Windows correlates between different
> > clocks.
> > 
> > That is, there is relative control as to which software reads kvmclock 
> > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > exposed.
> > 
> > > Shouldn't the pause delay be recorded somewhere architecturally
> > > independent and then be a thing that kvm-clock happens to use and
> > > other clocks might as well?
> > 
> > In theory, yes. In practice, i don't see the need for this... 
> 
> It seems unlikely to me that x86 is the only one that will want
> to do something similar.

Can't they copy what kvmclock is doing today? 

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-08  0:06           ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-08 10:22             ` Dr. David Alan Gilbert
  -1 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-08 10:22 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > This patch, relative to pre-copy migration codepath,
> > > > > measures the time between vm_stop() and pre_save(),
> > > > > which includes copying the remaining RAM to destination,
> > > > > and advances the clock by that amount.
> > > > > 
> > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > clock difference on destination from 5s to 0.2s.
> > > > > 
> > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > 
> > > > One thing that bothers me is that it's only this clock that's
> > > > getting corrected; doesn't it cause things to get upset when
> > > > one clock moves and the others dont?
> > > 
> > > If you are correlating the clocks, then yes.
> > > 
> > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > in newer guests
> > > (kvmclock interface to notify watchdog to not complain).
> > > 
> > > Note marking TSC clocksource unstable on older guests is harmless
> > > because kvmclock is the standard clocksource.
> > > 
> > > For Windows guests, i don't know that Windows correlates between different
> > > clocks.
> > > 
> > > That is, there is relative control as to which software reads kvmclock 
> > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > exposed.
> > > 
> > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > independent and then be a thing that kvm-clock happens to use and
> > > > other clocks might as well?
> > > 
> > > In theory, yes. In practice, i don't see the need for this... 
> > 
> > It seems unlikely to me that x86 is the only one that will want
> > to do something similar.
> 
> Can't they copy what kvmclock is doing today? 

We shouldn't have copies of code all over should we?

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-08 10:22             ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-08 10:22 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Marcelo Tosatti (mtosatti@redhat.com) wrote:
> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > This patch, relative to pre-copy migration codepath,
> > > > > measures the time between vm_stop() and pre_save(),
> > > > > which includes copying the remaining RAM to destination,
> > > > > and advances the clock by that amount.
> > > > > 
> > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > clock difference on destination from 5s to 0.2s.
> > > > > 
> > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > 
> > > > One thing that bothers me is that it's only this clock that's
> > > > getting corrected; doesn't it cause things to get upset when
> > > > one clock moves and the others dont?
> > > 
> > > If you are correlating the clocks, then yes.
> > > 
> > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > in newer guests
> > > (kvmclock interface to notify watchdog to not complain).
> > > 
> > > Note marking TSC clocksource unstable on older guests is harmless
> > > because kvmclock is the standard clocksource.
> > > 
> > > For Windows guests, i don't know that Windows correlates between different
> > > clocks.
> > > 
> > > That is, there is relative control as to which software reads kvmclock 
> > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > exposed.
> > > 
> > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > independent and then be a thing that kvm-clock happens to use and
> > > > other clocks might as well?
> > > 
> > > In theory, yes. In practice, i don't see the need for this... 
> > 
> > It seems unlikely to me that x86 is the only one that will want
> > to do something similar.
> 
> Can't they copy what kvmclock is doing today? 

We shouldn't have copies of code all over should we?

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-08 10:22             ` [Qemu-devel] " Dr. David Alan Gilbert
@ 2016-11-08 13:32               ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-08 13:32 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > > This patch, relative to pre-copy migration codepath,
> > > > > > measures the time between vm_stop() and pre_save(),
> > > > > > which includes copying the remaining RAM to destination,
> > > > > > and advances the clock by that amount.
> > > > > > 
> > > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > > clock difference on destination from 5s to 0.2s.
> > > > > > 
> > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > > 
> > > > > One thing that bothers me is that it's only this clock that's
> > > > > getting corrected; doesn't it cause things to get upset when
> > > > > one clock moves and the others dont?
> > > > 
> > > > If you are correlating the clocks, then yes.
> > > > 
> > > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > > in newer guests
> > > > (kvmclock interface to notify watchdog to not complain).
> > > > 
> > > > Note marking TSC clocksource unstable on older guests is harmless
> > > > because kvmclock is the standard clocksource.
> > > > 
> > > > For Windows guests, i don't know that Windows correlates between different
> > > > clocks.
> > > > 
> > > > That is, there is relative control as to which software reads kvmclock 
> > > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > > exposed.
> > > > 
> > > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > > independent and then be a thing that kvm-clock happens to use and
> > > > > other clocks might as well?
> > > > 
> > > > In theory, yes. In practice, i don't see the need for this... 
> > > 
> > > It seems unlikely to me that x86 is the only one that will want
> > > to do something similar.
> > 
> > Can't they copy what kvmclock is doing today? 
> 
> We shouldn't have copies of code all over should we?
> 
> Dave

Fine i'll add a notifier.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-08 13:32               ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-08 13:32 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > > This patch, relative to pre-copy migration codepath,
> > > > > > measures the time between vm_stop() and pre_save(),
> > > > > > which includes copying the remaining RAM to destination,
> > > > > > and advances the clock by that amount.
> > > > > > 
> > > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > > clock difference on destination from 5s to 0.2s.
> > > > > > 
> > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > > 
> > > > > One thing that bothers me is that it's only this clock that's
> > > > > getting corrected; doesn't it cause things to get upset when
> > > > > one clock moves and the others dont?
> > > > 
> > > > If you are correlating the clocks, then yes.
> > > > 
> > > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > > in newer guests
> > > > (kvmclock interface to notify watchdog to not complain).
> > > > 
> > > > Note marking TSC clocksource unstable on older guests is harmless
> > > > because kvmclock is the standard clocksource.
> > > > 
> > > > For Windows guests, i don't know that Windows correlates between different
> > > > clocks.
> > > > 
> > > > That is, there is relative control as to which software reads kvmclock 
> > > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > > exposed.
> > > > 
> > > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > > independent and then be a thing that kvm-clock happens to use and
> > > > > other clocks might as well?
> > > > 
> > > > In theory, yes. In practice, i don't see the need for this... 
> > > 
> > > It seems unlikely to me that x86 is the only one that will want
> > > to do something similar.
> > 
> > Can't they copy what kvmclock is doing today? 
> 
> We shouldn't have copies of code all over should we?
> 
> Dave

Fine i'll add a notifier.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-08 10:22             ` [Qemu-devel] " Dr. David Alan Gilbert
@ 2016-11-09 16:23               ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-09 16:23 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, Marcelo Tosatti
  Cc: kvm, qemu-devel, Radim Krčmář,
	Juan Quintela, Eduardo Habkost



On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>> This patch, relative to pre-copy migration codepath,
>>>>>> measures the time between vm_stop() and pre_save(),
>>>>>> which includes copying the remaining RAM to destination,
>>>>>> and advances the clock by that amount.
>>>>>>
>>>>>> In a VM with 5 seconds downtime, this reduces the guest
>>>>>> clock difference on destination from 5s to 0.2s.
>>>>>>
>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>>>>
>>>>> One thing that bothers me is that it's only this clock that's
>>>>> getting corrected; doesn't it cause things to get upset when
>>>>> one clock moves and the others dont?
>>>>
>>>> If you are correlating the clocks, then yes.
>>>>
>>>> Older Linux guests get upset (marking the TSC clocksource unstable
>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
>>>> in newer guests
>>>> (kvmclock interface to notify watchdog to not complain).
>>>>
>>>> Note marking TSC clocksource unstable on older guests is harmless
>>>> because kvmclock is the standard clocksource.
>>>>
>>>> For Windows guests, i don't know that Windows correlates between different
>>>> clocks.
>>>>
>>>> That is, there is relative control as to which software reads kvmclock 
>>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
>>>> exposed.
>>>>
>>>>> Shouldn't the pause delay be recorded somewhere architecturally
>>>>> independent and then be a thing that kvm-clock happens to use and
>>>>> other clocks might as well?
>>>>
>>>> In theory, yes. In practice, i don't see the need for this... 
>>>
>>> It seems unlikely to me that x86 is the only one that will want
>>> to do something similar.
>>
>> Can't they copy what kvmclock is doing today? 
> 
> We shouldn't have copies of code all over should we?

Let's cross the bridge when we get there.

For now I'm more interested in having a version of the patch that is
clean and doesn't need advance_clock.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-09 16:23               ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-09 16:23 UTC (permalink / raw)
  To: Dr. David Alan Gilbert, Marcelo Tosatti
  Cc: kvm, qemu-devel, Radim Krčmář,
	Juan Quintela, Eduardo Habkost



On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>> This patch, relative to pre-copy migration codepath,
>>>>>> measures the time between vm_stop() and pre_save(),
>>>>>> which includes copying the remaining RAM to destination,
>>>>>> and advances the clock by that amount.
>>>>>>
>>>>>> In a VM with 5 seconds downtime, this reduces the guest
>>>>>> clock difference on destination from 5s to 0.2s.
>>>>>>
>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>>>>
>>>>> One thing that bothers me is that it's only this clock that's
>>>>> getting corrected; doesn't it cause things to get upset when
>>>>> one clock moves and the others dont?
>>>>
>>>> If you are correlating the clocks, then yes.
>>>>
>>>> Older Linux guests get upset (marking the TSC clocksource unstable
>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
>>>> in newer guests
>>>> (kvmclock interface to notify watchdog to not complain).
>>>>
>>>> Note marking TSC clocksource unstable on older guests is harmless
>>>> because kvmclock is the standard clocksource.
>>>>
>>>> For Windows guests, i don't know that Windows correlates between different
>>>> clocks.
>>>>
>>>> That is, there is relative control as to which software reads kvmclock 
>>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
>>>> exposed.
>>>>
>>>>> Shouldn't the pause delay be recorded somewhere architecturally
>>>>> independent and then be a thing that kvm-clock happens to use and
>>>>> other clocks might as well?
>>>>
>>>> In theory, yes. In practice, i don't see the need for this... 
>>>
>>> It seems unlikely to me that x86 is the only one that will want
>>> to do something similar.
>>
>> Can't they copy what kvmclock is doing today? 
> 
> We shouldn't have copies of code all over should we?

Let's cross the bridge when we get there.

For now I'm more interested in having a version of the patch that is
clean and doesn't need advance_clock.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-09 16:23               ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-09 16:28                 ` Dr. David Alan Gilbert
  -1 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-09 16:28 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm, qemu-devel, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>>>> This patch, relative to pre-copy migration codepath,
> >>>>>> measures the time between vm_stop() and pre_save(),
> >>>>>> which includes copying the remaining RAM to destination,
> >>>>>> and advances the clock by that amount.
> >>>>>>
> >>>>>> In a VM with 5 seconds downtime, this reduces the guest
> >>>>>> clock difference on destination from 5s to 0.2s.
> >>>>>>
> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>>>>
> >>>>> One thing that bothers me is that it's only this clock that's
> >>>>> getting corrected; doesn't it cause things to get upset when
> >>>>> one clock moves and the others dont?
> >>>>
> >>>> If you are correlating the clocks, then yes.
> >>>>
> >>>> Older Linux guests get upset (marking the TSC clocksource unstable
> >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> >>>> in newer guests
> >>>> (kvmclock interface to notify watchdog to not complain).
> >>>>
> >>>> Note marking TSC clocksource unstable on older guests is harmless
> >>>> because kvmclock is the standard clocksource.
> >>>>
> >>>> For Windows guests, i don't know that Windows correlates between different
> >>>> clocks.
> >>>>
> >>>> That is, there is relative control as to which software reads kvmclock 
> >>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
> >>>> exposed.
> >>>>
> >>>>> Shouldn't the pause delay be recorded somewhere architecturally
> >>>>> independent and then be a thing that kvm-clock happens to use and
> >>>>> other clocks might as well?
> >>>>
> >>>> In theory, yes. In practice, i don't see the need for this... 
> >>>
> >>> It seems unlikely to me that x86 is the only one that will want
> >>> to do something similar.
> >>
> >> Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> 
> Let's cross the bridge when we get there.

That will mean it has the migration data in the wrong place
and any other clocks that need to be incremented by the same offset
will need a hook or be inconsistent with this calculation.

Dave

> For now I'm more interested in having a version of the patch that is
> clean and doesn't need advance_clock.
> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-09 16:28                 ` Dr. David Alan Gilbert
  0 siblings, 0 replies; 80+ messages in thread
From: Dr. David Alan Gilbert @ 2016-11-09 16:28 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Marcelo Tosatti, kvm, qemu-devel, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>>>> This patch, relative to pre-copy migration codepath,
> >>>>>> measures the time between vm_stop() and pre_save(),
> >>>>>> which includes copying the remaining RAM to destination,
> >>>>>> and advances the clock by that amount.
> >>>>>>
> >>>>>> In a VM with 5 seconds downtime, this reduces the guest
> >>>>>> clock difference on destination from 5s to 0.2s.
> >>>>>>
> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>>>>
> >>>>> One thing that bothers me is that it's only this clock that's
> >>>>> getting corrected; doesn't it cause things to get upset when
> >>>>> one clock moves and the others dont?
> >>>>
> >>>> If you are correlating the clocks, then yes.
> >>>>
> >>>> Older Linux guests get upset (marking the TSC clocksource unstable
> >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> >>>> in newer guests
> >>>> (kvmclock interface to notify watchdog to not complain).
> >>>>
> >>>> Note marking TSC clocksource unstable on older guests is harmless
> >>>> because kvmclock is the standard clocksource.
> >>>>
> >>>> For Windows guests, i don't know that Windows correlates between different
> >>>> clocks.
> >>>>
> >>>> That is, there is relative control as to which software reads kvmclock 
> >>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
> >>>> exposed.
> >>>>
> >>>>> Shouldn't the pause delay be recorded somewhere architecturally
> >>>>> independent and then be a thing that kvm-clock happens to use and
> >>>>> other clocks might as well?
> >>>>
> >>>> In theory, yes. In practice, i don't see the need for this... 
> >>>
> >>> It seems unlikely to me that x86 is the only one that will want
> >>> to do something similar.
> >>
> >> Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> 
> Let's cross the bridge when we get there.

That will mean it has the migration data in the wrong place
and any other clocks that need to be incremented by the same offset
will need a hook or be inconsistent with this calculation.

Dave

> For now I'm more interested in having a version of the patch that is
> clean and doesn't need advance_clock.
> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-09 16:28                 ` [Qemu-devel] " Dr. David Alan Gilbert
@ 2016-11-09 16:33                   ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-09 16:33 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Marcelo Tosatti, kvm, qemu-devel, Radim Krčmář,
	Juan Quintela, Eduardo Habkost



On 09/11/2016 17:28, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonzini@redhat.com) wrote:
>>
>>
>> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>>>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>>>> This patch, relative to pre-copy migration codepath,
>>>>>>>> measures the time between vm_stop() and pre_save(),
>>>>>>>> which includes copying the remaining RAM to destination,
>>>>>>>> and advances the clock by that amount.
>>>>>>>>
>>>>>>>> In a VM with 5 seconds downtime, this reduces the guest
>>>>>>>> clock difference on destination from 5s to 0.2s.
>>>>>>>>
>>>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>>>>>>
>>>>>>> One thing that bothers me is that it's only this clock that's
>>>>>>> getting corrected; doesn't it cause things to get upset when
>>>>>>> one clock moves and the others dont?
>>>>>>
>>>>>> If you are correlating the clocks, then yes.
>>>>>>
>>>>>> Older Linux guests get upset (marking the TSC clocksource unstable
>>>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
>>>>>> in newer guests
>>>>>> (kvmclock interface to notify watchdog to not complain).
>>>>>>
>>>>>> Note marking TSC clocksource unstable on older guests is harmless
>>>>>> because kvmclock is the standard clocksource.
>>>>>>
>>>>>> For Windows guests, i don't know that Windows correlates between different
>>>>>> clocks.
>>>>>>
>>>>>> That is, there is relative control as to which software reads kvmclock 
>>>>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
>>>>>> exposed.
>>>>>>
>>>>>>> Shouldn't the pause delay be recorded somewhere architecturally
>>>>>>> independent and then be a thing that kvm-clock happens to use and
>>>>>>> other clocks might as well?
>>>>>>
>>>>>> In theory, yes. In practice, i don't see the need for this... 
>>>>>
>>>>> It seems unlikely to me that x86 is the only one that will want
>>>>> to do something similar.
>>>>
>>>> Can't they copy what kvmclock is doing today? 
>>>
>>> We shouldn't have copies of code all over should we?
>>
>> Let's cross the bridge when we get there.
> 
> That will mean it has the migration data in the wrong place
> and any other clocks that need to be incremented by the same offset
> will need a hook or be inconsistent with this calculation.

No, there is no additional migration data that is needed.  This is just
a bug in how the pausing of CLOCK_MONOTONIC was implemented for the
kvmclock clocksource.

Right now, x86 is the only case where we have the problem, and x86 is
using a single "backend" for both kvmclock and the Hyper-V TSC reference
page.

For everyone else, there is no clocksource paravirtualization going on
(luckily, considering what a mess is kvmclock).  They can just use
QEMU_CLOCK_VIRTUAL if they want something that pauses during the VM.
Now, QEMU_CLOCK_VIRTUAL actually has the same bug that Marcelo is
fixing, so we may indeed want a common solution if possible.  But again,
let's see first what the code looks like for _one_ clocksource, before
writing a generalized (and thus more complex) solution.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-09 16:33                   ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-09 16:33 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: Marcelo Tosatti, kvm, qemu-devel, Radim Krčmář,
	Juan Quintela, Eduardo Habkost



On 09/11/2016 17:28, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonzini@redhat.com) wrote:
>>
>>
>> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>>>>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>>>>>>>> This patch, relative to pre-copy migration codepath,
>>>>>>>> measures the time between vm_stop() and pre_save(),
>>>>>>>> which includes copying the remaining RAM to destination,
>>>>>>>> and advances the clock by that amount.
>>>>>>>>
>>>>>>>> In a VM with 5 seconds downtime, this reduces the guest
>>>>>>>> clock difference on destination from 5s to 0.2s.
>>>>>>>>
>>>>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>>>>>>
>>>>>>> One thing that bothers me is that it's only this clock that's
>>>>>>> getting corrected; doesn't it cause things to get upset when
>>>>>>> one clock moves and the others dont?
>>>>>>
>>>>>> If you are correlating the clocks, then yes.
>>>>>>
>>>>>> Older Linux guests get upset (marking the TSC clocksource unstable
>>>>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
>>>>>> in newer guests
>>>>>> (kvmclock interface to notify watchdog to not complain).
>>>>>>
>>>>>> Note marking TSC clocksource unstable on older guests is harmless
>>>>>> because kvmclock is the standard clocksource.
>>>>>>
>>>>>> For Windows guests, i don't know that Windows correlates between different
>>>>>> clocks.
>>>>>>
>>>>>> That is, there is relative control as to which software reads kvmclock 
>>>>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
>>>>>> exposed.
>>>>>>
>>>>>>> Shouldn't the pause delay be recorded somewhere architecturally
>>>>>>> independent and then be a thing that kvm-clock happens to use and
>>>>>>> other clocks might as well?
>>>>>>
>>>>>> In theory, yes. In practice, i don't see the need for this... 
>>>>>
>>>>> It seems unlikely to me that x86 is the only one that will want
>>>>> to do something similar.
>>>>
>>>> Can't they copy what kvmclock is doing today? 
>>>
>>> We shouldn't have copies of code all over should we?
>>
>> Let's cross the bridge when we get there.
> 
> That will mean it has the migration data in the wrong place
> and any other clocks that need to be incremented by the same offset
> will need a hook or be inconsistent with this calculation.

No, there is no additional migration data that is needed.  This is just
a bug in how the pausing of CLOCK_MONOTONIC was implemented for the
kvmclock clocksource.

Right now, x86 is the only case where we have the problem, and x86 is
using a single "backend" for both kvmclock and the Hyper-V TSC reference
page.

For everyone else, there is no clocksource paravirtualization going on
(luckily, considering what a mess is kvmclock).  They can just use
QEMU_CLOCK_VIRTUAL if they want something that pauses during the VM.
Now, QEMU_CLOCK_VIRTUAL actually has the same bug that Marcelo is
fixing, so we may indeed want a common solution if possible.  But again,
let's see first what the code looks like for _one_ clocksource, before
writing a generalized (and thus more complex) solution.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-08 13:32               ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-09 19:32                 ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-09 19:32 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Tue, Nov 08, 2016 at 11:32:30AM -0200, Marcelo Tosatti wrote:
> On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > > > This patch, relative to pre-copy migration codepath,
> > > > > > > measures the time between vm_stop() and pre_save(),
> > > > > > > which includes copying the remaining RAM to destination,
> > > > > > > and advances the clock by that amount.
> > > > > > > 
> > > > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > > > clock difference on destination from 5s to 0.2s.
> > > > > > > 
> > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > > > 
> > > > > > One thing that bothers me is that it's only this clock that's
> > > > > > getting corrected; doesn't it cause things to get upset when
> > > > > > one clock moves and the others dont?
> > > > > 
> > > > > If you are correlating the clocks, then yes.
> > > > > 
> > > > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > > > in newer guests
> > > > > (kvmclock interface to notify watchdog to not complain).
> > > > > 
> > > > > Note marking TSC clocksource unstable on older guests is harmless
> > > > > because kvmclock is the standard clocksource.
> > > > > 
> > > > > For Windows guests, i don't know that Windows correlates between different
> > > > > clocks.
> > > > > 
> > > > > That is, there is relative control as to which software reads kvmclock 
> > > > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > > > exposed.
> > > > > 
> > > > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > > > independent and then be a thing that kvm-clock happens to use and
> > > > > > other clocks might as well?
> > > > > 
> > > > > In theory, yes. In practice, i don't see the need for this... 
> > > > 
> > > > It seems unlikely to me that x86 is the only one that will want
> > > > to do something similar.
> > > 
> > > Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> > 
> > Dave
> 
> Fine i'll add a notifier.
> 

KVM Linux guests all have to: 

Make CLOCK_MONOTONIC not count during vmpause
(because it mimicks behaviour under suspend/resume 
of baremetal). And because time overflows.

Assuming the HW clock counts while the VM is paused,
it means they have to hook into vmstate change
notifiers to:

        event           action
        vmstop          KVM_GET_CLOCK
        vmstart         KVM_SET_CLOCK (that earlier value)

For x86 we want to start counting the time there 
because while the VM is running, the host TSC 
is keeping track of time. 
So you measure the amount of time between:

        -> Guest VM clock stops ticking.
        -> clock_gettime(CLOCK_MONOTONIC, &pointA);
        ...
        -> presave: clock_gettime(CLOCK_MONOTONIC, &pointB);

I measured the additional time between presave and EOF: its about
5ms.

On destination, there is an additional 30ms between EOF receival 
and restoration of TSC.

Now, the clock difference is 130ms, and i am not sure where it 
comes from, trying to figure out. But the patch should give 35ms 
difference which is pretty good. Chasing that down...

So in summary: i don't see the point of making the code
"generic" without knowing what the other arches 
want to do.









^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-09 19:32                 ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-09 19:32 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: kvm, qemu-devel, Paolo Bonzini, Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Tue, Nov 08, 2016 at 11:32:30AM -0200, Marcelo Tosatti wrote:
> On Tue, Nov 08, 2016 at 10:22:56AM +0000, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> > > > > > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> > > > > > > This patch, relative to pre-copy migration codepath,
> > > > > > > measures the time between vm_stop() and pre_save(),
> > > > > > > which includes copying the remaining RAM to destination,
> > > > > > > and advances the clock by that amount.
> > > > > > > 
> > > > > > > In a VM with 5 seconds downtime, this reduces the guest
> > > > > > > clock difference on destination from 5s to 0.2s.
> > > > > > > 
> > > > > > > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> > > > > > 
> > > > > > One thing that bothers me is that it's only this clock that's
> > > > > > getting corrected; doesn't it cause things to get upset when
> > > > > > one clock moves and the others dont?
> > > > > 
> > > > > If you are correlating the clocks, then yes.
> > > > > 
> > > > > Older Linux guests get upset (marking the TSC clocksource unstable
> > > > > because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> > > > > in newer guests
> > > > > (kvmclock interface to notify watchdog to not complain).
> > > > > 
> > > > > Note marking TSC clocksource unstable on older guests is harmless
> > > > > because kvmclock is the standard clocksource.
> > > > > 
> > > > > For Windows guests, i don't know that Windows correlates between different
> > > > > clocks.
> > > > > 
> > > > > That is, there is relative control as to which software reads kvmclock 
> > > > > or Windows TIMER MSR, so i don't see the need to advance every clock 
> > > > > exposed.
> > > > > 
> > > > > > Shouldn't the pause delay be recorded somewhere architecturally
> > > > > > independent and then be a thing that kvm-clock happens to use and
> > > > > > other clocks might as well?
> > > > > 
> > > > > In theory, yes. In practice, i don't see the need for this... 
> > > > 
> > > > It seems unlikely to me that x86 is the only one that will want
> > > > to do something similar.
> > > 
> > > Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> > 
> > Dave
> 
> Fine i'll add a notifier.
> 

KVM Linux guests all have to: 

Make CLOCK_MONOTONIC not count during vmpause
(because it mimicks behaviour under suspend/resume 
of baremetal). And because time overflows.

Assuming the HW clock counts while the VM is paused,
it means they have to hook into vmstate change
notifiers to:

        event           action
        vmstop          KVM_GET_CLOCK
        vmstart         KVM_SET_CLOCK (that earlier value)

For x86 we want to start counting the time there 
because while the VM is running, the host TSC 
is keeping track of time. 
So you measure the amount of time between:

        -> Guest VM clock stops ticking.
        -> clock_gettime(CLOCK_MONOTONIC, &pointA);
        ...
        -> presave: clock_gettime(CLOCK_MONOTONIC, &pointB);

I measured the additional time between presave and EOF: its about
5ms.

On destination, there is an additional 30ms between EOF receival 
and restoration of TSC.

Now, the clock difference is 130ms, and i am not sure where it 
comes from, trying to figure out. But the patch should give 35ms 
difference which is pretty good. Chasing that down...

So in summary: i don't see the point of making the code
"generic" without knowing what the other arches 
want to do.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-09 16:23               ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-10 11:48                 ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-10 11:48 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

GOn Wed, Nov 09, 2016 at 05:23:50PM +0100, Paolo Bonzini wrote:
> 
> 
> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>>>> This patch, relative to pre-copy migration codepath,
> >>>>>> measures the time between vm_stop() and pre_save(),
> >>>>>> which includes copying the remaining RAM to destination,
> >>>>>> and advances the clock by that amount.
> >>>>>>
> >>>>>> In a VM with 5 seconds downtime, this reduces the guest
> >>>>>> clock difference on destination from 5s to 0.2s.
> >>>>>>
> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>>>>
> >>>>> One thing that bothers me is that it's only this clock that's
> >>>>> getting corrected; doesn't it cause things to get upset when
> >>>>> one clock moves and the others dont?
> >>>>
> >>>> If you are correlating the clocks, then yes.
> >>>>
> >>>> Older Linux guests get upset (marking the TSC clocksource unstable
> >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> >>>> in newer guests
> >>>> (kvmclock interface to notify watchdog to not complain).
> >>>>
> >>>> Note marking TSC clocksource unstable on older guests is harmless
> >>>> because kvmclock is the standard clocksource.
> >>>>
> >>>> For Windows guests, i don't know that Windows correlates between different
> >>>> clocks.
> >>>>
> >>>> That is, there is relative control as to which software reads kvmclock 
> >>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
> >>>> exposed.
> >>>>
> >>>>> Shouldn't the pause delay be recorded somewhere architecturally
> >>>>> independent and then be a thing that kvm-clock happens to use and
> >>>>> other clocks might as well?
> >>>>
> >>>> In theory, yes. In practice, i don't see the need for this... 
> >>>
> >>> It seems unlikely to me that x86 is the only one that will want
> >>> to do something similar.
> >>
> >> Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> 
> Let's cross the bridge when we get there.
> 
> For now I'm more interested in having a version of the patch that is
> clean and doesn't need advance_clock.
> 
> Paolo


Destination has to run the following logic:

If (source has KVM_CAP_ADVANCE_CLOCK)
    use KVM_GET_CLOCK value
Else
   read pvclock from guest

To support migration from older QEMU versions which do not have
KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
hosts without KVM_CAP_ADVANCE_CLOCK).

I don't see any clean way to give that information, except changing
the migration format to pass "host: kvm_cap_advance_clock=true/false"
information.

Ideas?


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-10 11:48                 ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-10 11:48 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

GOn Wed, Nov 09, 2016 at 05:23:50PM +0100, Paolo Bonzini wrote:
> 
> 
> On 08/11/2016 11:22, Dr. David Alan Gilbert wrote:
> > * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> On Mon, Nov 07, 2016 at 08:03:50PM +0000, Dr. David Alan Gilbert wrote:
> >>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >>>>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >>>>>> This patch, relative to pre-copy migration codepath,
> >>>>>> measures the time between vm_stop() and pre_save(),
> >>>>>> which includes copying the remaining RAM to destination,
> >>>>>> and advances the clock by that amount.
> >>>>>>
> >>>>>> In a VM with 5 seconds downtime, this reduces the guest
> >>>>>> clock difference on destination from 5s to 0.2s.
> >>>>>>
> >>>>>> Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>>>>
> >>>>> One thing that bothers me is that it's only this clock that's
> >>>>> getting corrected; doesn't it cause things to get upset when
> >>>>> one clock moves and the others dont?
> >>>>
> >>>> If you are correlating the clocks, then yes.
> >>>>
> >>>> Older Linux guests get upset (marking the TSC clocksource unstable
> >>>> because the watchdog checks TSC vs kvmclock), but there is a workaround for it 
> >>>> in newer guests
> >>>> (kvmclock interface to notify watchdog to not complain).
> >>>>
> >>>> Note marking TSC clocksource unstable on older guests is harmless
> >>>> because kvmclock is the standard clocksource.
> >>>>
> >>>> For Windows guests, i don't know that Windows correlates between different
> >>>> clocks.
> >>>>
> >>>> That is, there is relative control as to which software reads kvmclock 
> >>>> or Windows TIMER MSR, so i don't see the need to advance every clock 
> >>>> exposed.
> >>>>
> >>>>> Shouldn't the pause delay be recorded somewhere architecturally
> >>>>> independent and then be a thing that kvm-clock happens to use and
> >>>>> other clocks might as well?
> >>>>
> >>>> In theory, yes. In practice, i don't see the need for this... 
> >>>
> >>> It seems unlikely to me that x86 is the only one that will want
> >>> to do something similar.
> >>
> >> Can't they copy what kvmclock is doing today? 
> > 
> > We shouldn't have copies of code all over should we?
> 
> Let's cross the bridge when we get there.
> 
> For now I'm more interested in having a version of the patch that is
> clean and doesn't need advance_clock.
> 
> Paolo


Destination has to run the following logic:

If (source has KVM_CAP_ADVANCE_CLOCK)
    use KVM_GET_CLOCK value
Else
   read pvclock from guest

To support migration from older QEMU versions which do not have
KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
hosts without KVM_CAP_ADVANCE_CLOCK).

I don't see any clean way to give that information, except changing
the migration format to pass "host: kvm_cap_advance_clock=true/false"
information.

Ideas?

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-10 11:48                 ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-10 17:57                   ` Paolo Bonzini
  -1 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-10 17:57 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost



On 10/11/2016 12:48, Marcelo Tosatti wrote:
> Destination has to run the following logic:
> 
> If (source has KVM_CAP_ADVANCE_CLOCK)
>     use KVM_GET_CLOCK value
> Else
>    read pvclock from guest
> 
> To support migration from older QEMU versions which do not have
> KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
> hosts without KVM_CAP_ADVANCE_CLOCK).
> 
> I don't see any clean way to give that information, except changing
> the migration format to pass "host: kvm_cap_advance_clock=true/false"
> information.

If you make it only affect new machine types, you could transmit a dummy
clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-10 17:57                   ` Paolo Bonzini
  0 siblings, 0 replies; 80+ messages in thread
From: Paolo Bonzini @ 2016-11-10 17:57 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost



On 10/11/2016 12:48, Marcelo Tosatti wrote:
> Destination has to run the following logic:
> 
> If (source has KVM_CAP_ADVANCE_CLOCK)
>     use KVM_GET_CLOCK value
> Else
>    read pvclock from guest
> 
> To support migration from older QEMU versions which do not have
> KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
> hosts without KVM_CAP_ADVANCE_CLOCK).
> 
> I don't see any clean way to give that information, except changing
> the migration format to pass "host: kvm_cap_advance_clock=true/false"
> information.

If you make it only affect new machine types, you could transmit a dummy
clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE.

Paolo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-10 17:57                   ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-11 14:23                     ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-11 14:23 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Thu, Nov 10, 2016 at 06:57:21PM +0100, Paolo Bonzini wrote:
> 
> 
> On 10/11/2016 12:48, Marcelo Tosatti wrote:
> > Destination has to run the following logic:
> > 
> > If (source has KVM_CAP_ADVANCE_CLOCK)
> >     use KVM_GET_CLOCK value
> > Else
> >    read pvclock from guest
> > 
> > To support migration from older QEMU versions which do not have
> > KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
> > hosts without KVM_CAP_ADVANCE_CLOCK).
> > 
> > I don't see any clean way to give that information, except changing
> > the migration format to pass "host: kvm_cap_advance_clock=true/false"
> > information.
> 
> If you make it only affect new machine types, you could transmit a dummy
> clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE.
> 
> Paolo

Prefer a new subsection (which is fine since migration is broken
anyway), because otherwise you have to deal with restoring
s->clock from -1 to what was read at KVM_GET_CLOCK (in case
migration fails).


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2016-11-11 14:23                     ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2016-11-11 14:23 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Thu, Nov 10, 2016 at 06:57:21PM +0100, Paolo Bonzini wrote:
> 
> 
> On 10/11/2016 12:48, Marcelo Tosatti wrote:
> > Destination has to run the following logic:
> > 
> > If (source has KVM_CAP_ADVANCE_CLOCK)
> >     use KVM_GET_CLOCK value
> > Else
> >    read pvclock from guest
> > 
> > To support migration from older QEMU versions which do not have
> > KVM_CAP_ADVANCE_CLOCK (or new QEMU versions running on old
> > hosts without KVM_CAP_ADVANCE_CLOCK).
> > 
> > I don't see any clean way to give that information, except changing
> > the migration format to pass "host: kvm_cap_advance_clock=true/false"
> > information.
> 
> If you make it only affect new machine types, you could transmit a dummy
> clock value such as -1 if the source does not have KVM_CLOCK_TSC_STABLE.
> 
> Paolo

Prefer a new subsection (which is fine since migration is broken
anyway), because otherwise you have to deal with restoring
s->clock from -1 to what was read at KVM_GET_CLOCK (in case
migration fails).

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2016-11-07 19:41       ` [Qemu-devel] " Marcelo Tosatti
@ 2017-02-07 10:02         ` Wanpeng Li
  -1 siblings, 0 replies; 80+ messages in thread
From: Wanpeng Li @ 2017-02-07 10:02 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>:
> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>> > This patch, relative to pre-copy migration codepath,
>> > measures the time between vm_stop() and pre_save(),
>> > which includes copying the remaining RAM to destination,
>> > and advances the clock by that amount.
>> >
>> > In a VM with 5 seconds downtime, this reduces the guest
>> > clock difference on destination from 5s to 0.2s.
>> >
>> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>
>> One thing that bothers me is that it's only this clock that's
>> getting corrected; doesn't it cause things to get upset when
>> one clock moves and the others dont?
>
> If you are correlating the clocks, then yes.
>
> Older Linux guests get upset (marking the TSC clocksource unstable
> because the watchdog checks TSC vs kvmclock), but there is a workaround for it
> in newer guests
> (kvmclock interface to notify watchdog to not complain).

Could you point out which interface? I didn't find it.

Regards,
Wanpeng Li

>
> Note marking TSC clocksource unstable on older guests is harmless
> because kvmclock is the standard clocksource.
>
> For Windows guests, i don't know that Windows correlates between different
> clocks.
>
> That is, there is relative control as to which software reads kvmclock
> or Windows TIMER MSR, so i don't see the need to advance every clock
> exposed.
>
>> Shouldn't the pause delay be recorded somewhere architecturally
>> independent and then be a thing that kvm-clock happens to use and
>> other clocks might as well?
>
> In theory, yes. In practice, i don't see the need for this...
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2017-02-07 10:02         ` Wanpeng Li
  0 siblings, 0 replies; 80+ messages in thread
From: Wanpeng Li @ 2017-02-07 10:02 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>:
> On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
>> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
>> > This patch, relative to pre-copy migration codepath,
>> > measures the time between vm_stop() and pre_save(),
>> > which includes copying the remaining RAM to destination,
>> > and advances the clock by that amount.
>> >
>> > In a VM with 5 seconds downtime, this reduces the guest
>> > clock difference on destination from 5s to 0.2s.
>> >
>> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
>>
>> One thing that bothers me is that it's only this clock that's
>> getting corrected; doesn't it cause things to get upset when
>> one clock moves and the others dont?
>
> If you are correlating the clocks, then yes.
>
> Older Linux guests get upset (marking the TSC clocksource unstable
> because the watchdog checks TSC vs kvmclock), but there is a workaround for it
> in newer guests
> (kvmclock interface to notify watchdog to not complain).

Could you point out which interface? I didn't find it.

Regards,
Wanpeng Li

>
> Note marking TSC clocksource unstable on older guests is harmless
> because kvmclock is the standard clocksource.
>
> For Windows guests, i don't know that Windows correlates between different
> clocks.
>
> That is, there is relative control as to which software reads kvmclock
> or Windows TIMER MSR, so i don't see the need to advance every clock
> exposed.
>
>> Shouldn't the pause delay be recorded somewhere architecturally
>> independent and then be a thing that kvm-clock happens to use and
>> other clocks might as well?
>
> In theory, yes. In practice, i don't see the need for this...
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
  2017-02-07 10:02         ` [Qemu-devel] " Wanpeng Li
@ 2017-02-07 12:18           ` Marcelo Tosatti
  -1 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2017-02-07 12:18 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Tue, Feb 07, 2017 at 06:02:13PM +0800, Wanpeng Li wrote:
> 2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>:
> > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> > This patch, relative to pre-copy migration codepath,
> >> > measures the time between vm_stop() and pre_save(),
> >> > which includes copying the remaining RAM to destination,
> >> > and advances the clock by that amount.
> >> >
> >> > In a VM with 5 seconds downtime, this reduces the guest
> >> > clock difference on destination from 5s to 0.2s.
> >> >
> >> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>
> >> One thing that bothers me is that it's only this clock that's
> >> getting corrected; doesn't it cause things to get upset when
> >> one clock moves and the others dont?
> >
> > If you are correlating the clocks, then yes.
> >
> > Older Linux guests get upset (marking the TSC clocksource unstable
> > because the watchdog checks TSC vs kvmclock), but there is a workaround for it
> > in newer guests
> > (kvmclock interface to notify watchdog to not complain).
> 
> Could you point out which interface? I didn't find it.
> 
> Regards,
> Wanpeng Li

PVCLOCK_GUEST_STOPPED flag.

> 
> >
> > Note marking TSC clocksource unstable on older guests is harmless
> > because kvmclock is the standard clocksource.
> >
> > For Windows guests, i don't know that Windows correlates between different
> > clocks.
> >
> > That is, there is relative control as to which software reads kvmclock
> > or Windows TIMER MSR, so i don't see the need to advance every clock
> > exposed.
> >
> >> Shouldn't the pause delay be recorded somewhere architecturally
> >> independent and then be a thing that kvm-clock happens to use and
> >> other clocks might as well?
> >
> > In theory, yes. In practice, i don't see the need for this...
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [Qemu-devel] [QEMU PATCH v2] kvmclock: advance clock by time window between vm_stop and pre_save
@ 2017-02-07 12:18           ` Marcelo Tosatti
  0 siblings, 0 replies; 80+ messages in thread
From: Marcelo Tosatti @ 2017-02-07 12:18 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: Dr. David Alan Gilbert, kvm, qemu-devel, Paolo Bonzini,
	Radim Krčmář,
	Juan Quintela, Eduardo Habkost

On Tue, Feb 07, 2017 at 06:02:13PM +0800, Wanpeng Li wrote:
> 2016-11-08 3:41 GMT+08:00 Marcelo Tosatti <mtosatti@redhat.com>:
> > On Mon, Nov 07, 2016 at 03:46:11PM +0000, Dr. David Alan Gilbert wrote:
> >> * Marcelo Tosatti (mtosatti@redhat.com) wrote:
> >> > This patch, relative to pre-copy migration codepath,
> >> > measures the time between vm_stop() and pre_save(),
> >> > which includes copying the remaining RAM to destination,
> >> > and advances the clock by that amount.
> >> >
> >> > In a VM with 5 seconds downtime, this reduces the guest
> >> > clock difference on destination from 5s to 0.2s.
> >> >
> >> > Tested with Linux and Windows 2012 R2 guests with -cpu XXX,+hv-time.
> >>
> >> One thing that bothers me is that it's only this clock that's
> >> getting corrected; doesn't it cause things to get upset when
> >> one clock moves and the others dont?
> >
> > If you are correlating the clocks, then yes.
> >
> > Older Linux guests get upset (marking the TSC clocksource unstable
> > because the watchdog checks TSC vs kvmclock), but there is a workaround for it
> > in newer guests
> > (kvmclock interface to notify watchdog to not complain).
> 
> Could you point out which interface? I didn't find it.
> 
> Regards,
> Wanpeng Li

PVCLOCK_GUEST_STOPPED flag.

> 
> >
> > Note marking TSC clocksource unstable on older guests is harmless
> > because kvmclock is the standard clocksource.
> >
> > For Windows guests, i don't know that Windows correlates between different
> > clocks.
> >
> > That is, there is relative control as to which software reads kvmclock
> > or Windows TIMER MSR, so i don't see the need to advance every clock
> > exposed.
> >
> >> Shouldn't the pause delay be recorded somewhere architecturally
> >> independent and then be a thing that kvm-clock happens to use and
> >> other clocks might as well?
> >
> > In theory, yes. In practice, i don't see the need for this...
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 80+ messages in thread

end of thread, other threads:[~2017-02-07 12:18 UTC | newest]

Thread overview: 80+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-04  9:43 [QEMU PATCH] kvmclock: advance clock by time window between vm_stop and pre_save Marcelo Tosatti
2016-11-04  9:43 ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 12:28 ` Juan Quintela
2016-11-04 12:28   ` [Qemu-devel] " Juan Quintela
2016-11-04 12:35   ` Marcelo Tosatti
2016-11-04 12:35     ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 14:00     ` Marcelo Tosatti
2016-11-04 14:00       ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 15:25 ` Radim Krčmář
2016-11-04 15:25   ` [Qemu-devel] " Radim Krčmář
2016-11-04 15:33   ` Paolo Bonzini
2016-11-04 15:33     ` [Qemu-devel] " Paolo Bonzini
2016-11-04 15:48     ` Radim Krčmář
2016-11-04 15:48       ` [Qemu-devel] " Radim Krčmář
2016-11-04 15:57       ` Paolo Bonzini
2016-11-04 15:57         ` [Qemu-devel] " Paolo Bonzini
2016-11-04 17:16         ` Radim Krčmář
2016-11-04 17:16           ` [Qemu-devel] " Radim Krčmář
2016-11-04 21:29           ` Paolo Bonzini
2016-11-04 21:29             ` [Qemu-devel] " Paolo Bonzini
2016-11-04 21:47             ` Marcelo Tosatti
2016-11-04 21:47               ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 22:35               ` Paolo Bonzini
2016-11-04 22:35                 ` [Qemu-devel] " Paolo Bonzini
2016-11-07 14:31           ` Roman Kagan
2016-11-07 14:31             ` [Qemu-devel] " Roman Kagan
2016-11-07 19:31             ` Marcelo Tosatti
2016-11-07 19:31               ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 16:24       ` Marcelo Tosatti
2016-11-04 16:24         ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 17:34         ` Radim Krčmář
2016-11-04 17:34           ` [Qemu-devel] " Radim Krčmář
2016-11-04 18:29           ` Marcelo Tosatti
2016-11-04 18:29             ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 20:07             ` Radim Krčmář
2016-11-04 20:07               ` [Qemu-devel] " Radim Krčmář
2016-11-04 16:04   ` Marcelo Tosatti
2016-11-04 16:04     ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 17:07   ` Marcelo Tosatti
2016-11-04 17:07     ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 17:39     ` Radim Krčmář
2016-11-04 17:39       ` [Qemu-devel] " Radim Krčmář
2016-11-04 18:31       ` Marcelo Tosatti
2016-11-04 18:31         ` [Qemu-devel] " Marcelo Tosatti
2016-11-07 13:08       ` Dr. David Alan Gilbert
2016-11-07 13:08         ` [Qemu-devel] " Dr. David Alan Gilbert
2016-11-04 16:59 ` [QEMU PATCH v2] " Marcelo Tosatti
2016-11-04 16:59   ` [Qemu-devel] " Marcelo Tosatti
2016-11-04 18:57   ` Juan Quintela
2016-11-04 18:57     ` [Qemu-devel] " Juan Quintela
2016-11-07 15:46   ` Dr. David Alan Gilbert
2016-11-07 15:46     ` [Qemu-devel] " Dr. David Alan Gilbert
2016-11-07 19:41     ` Marcelo Tosatti
2016-11-07 19:41       ` [Qemu-devel] " Marcelo Tosatti
2016-11-07 20:03       ` Dr. David Alan Gilbert
2016-11-07 20:03         ` [Qemu-devel] " Dr. David Alan Gilbert
2016-11-08  0:06         ` Marcelo Tosatti
2016-11-08  0:06           ` [Qemu-devel] " Marcelo Tosatti
2016-11-08 10:22           ` Dr. David Alan Gilbert
2016-11-08 10:22             ` [Qemu-devel] " Dr. David Alan Gilbert
2016-11-08 13:32             ` Marcelo Tosatti
2016-11-08 13:32               ` [Qemu-devel] " Marcelo Tosatti
2016-11-09 19:32               ` Marcelo Tosatti
2016-11-09 19:32                 ` [Qemu-devel] " Marcelo Tosatti
2016-11-09 16:23             ` Paolo Bonzini
2016-11-09 16:23               ` [Qemu-devel] " Paolo Bonzini
2016-11-09 16:28               ` Dr. David Alan Gilbert
2016-11-09 16:28                 ` [Qemu-devel] " Dr. David Alan Gilbert
2016-11-09 16:33                 ` Paolo Bonzini
2016-11-09 16:33                   ` [Qemu-devel] " Paolo Bonzini
2016-11-10 11:48               ` Marcelo Tosatti
2016-11-10 11:48                 ` [Qemu-devel] " Marcelo Tosatti
2016-11-10 17:57                 ` Paolo Bonzini
2016-11-10 17:57                   ` [Qemu-devel] " Paolo Bonzini
2016-11-11 14:23                   ` Marcelo Tosatti
2016-11-11 14:23                     ` [Qemu-devel] " Marcelo Tosatti
2017-02-07 10:02       ` Wanpeng Li
2017-02-07 10:02         ` [Qemu-devel] " Wanpeng Li
2017-02-07 12:18         ` Marcelo Tosatti
2017-02-07 12:18           ` [Qemu-devel] " Marcelo Tosatti

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.