All of lore.kernel.org
 help / color / mirror / Atom feed
* [qemu patch V2 0/2] improve kvmclock difference on migration
@ 2016-11-17 13:24 ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 13:24 UTC (permalink / raw)
  To: kvm
  Cc: Eduardo Habkost, Juan Quintela, Radim Krcmar, qemu-devel,
	Dr. David Alan Gilbert, Paolo Bonzini

See individual patches for details.
This patchset depends on the not yet applied
"[PATCH] kvm: kvmclock: let KVM_GET_CLOCK return whether the master clock is in use"
from Paolo.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [qemu patch V2 0/2] improve kvmclock difference on migration
@ 2016-11-17 13:24 ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 13:24 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost

See individual patches for details.
This patchset depends on the not yet applied
"[PATCH] kvm: kvmclock: let KVM_GET_CLOCK return whether the master clock is in use"
from Paolo.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [qemu patch V2 1/2] kvm: sync linux headers
  2016-11-17 13:24 ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-17 13:24   ` Marcelo Tosatti
  -1 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 13:24 UTC (permalink / raw)
  To: kvm
  Cc: Marcelo Tosatti, Eduardo Habkost, Juan Quintela, Radim Krcmar,
	qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini

[-- Attachment #1: sync-linux-headers.patch --]
[-- Type: text/plain, Size: 5447 bytes --]

Import KVM_CLOCK_TSC_STABLE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h
index 7361a16..b472b85 100644
--- a/include/standard-headers/linux/input.h
+++ b/include/standard-headers/linux/input.h
@@ -245,6 +245,7 @@ struct input_mask {
 #define BUS_SPI			0x1C
 #define BUS_RMI			0x1D
 #define BUS_CEC			0x1E
+#define BUS_INTEL_ISHTP		0x1F
 
 /*
  * MT_TOOL types
diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
index 4040951..e5a2e68 100644
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@@ -612,6 +612,8 @@
  */
 #define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
 #define  PCI_EXP_DEVCAP2_ARI		0x00000020 /* Alternative Routing-ID */
+#define  PCI_EXP_DEVCAP2_ATOMIC_ROUTE	0x00000040 /* Atomic Op routing */
+#define PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* Atomic 64-bit compare */
 #define  PCI_EXP_DEVCAP2_LTR		0x00000800 /* Latency tolerance reporting */
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
@@ -619,6 +621,7 @@
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
 #define  PCI_EXP_DEVCTL2_ARI		0x0020	/* Alternative Routing-ID */
+#define PCI_EXP_DEVCTL2_ATOMIC_REQ	0x0040	/* Set Atomic requests */
 #define  PCI_EXP_DEVCTL2_IDO_REQ_EN	0x0100	/* Allow IDO for requests */
 #define  PCI_EXP_DEVCTL2_IDO_CMP_EN	0x0200	/* Allow IDO for completions */
 #define  PCI_EXP_DEVCTL2_LTR_EN		0x0400	/* Enable LTR mechanism */
@@ -671,7 +674,8 @@
 #define PCI_EXT_CAP_ID_PMUX	0x1A	/* Protocol Multiplexing */
 #define PCI_EXT_CAP_ID_PASID	0x1B	/* Process Address Space ID */
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DPC
+#define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PTM
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -964,4 +968,13 @@
 
 #define PCI_EXP_DPC_SOURCE_ID		10	/* DPC Source Identifier */
 
+/* Precision Time Measurement */
+#define PCI_PTM_CAP			0x04	    /* PTM Capability */
+#define  PCI_PTM_CAP_REQ		0x00000001  /* Requester capable */
+#define  PCI_PTM_CAP_ROOT		0x00000004  /* Root capable */
+#define  PCI_PTM_GRANULARITY_MASK	0x0000FF00  /* Clock granularity */
+#define PCI_PTM_CTRL			0x08	    /* PTM Control */
+#define  PCI_PTM_CTRL_ENABLE		0x00000001  /* PTM enable */
+#define  PCI_PTM_CTRL_ROOT		0x00000002  /* Root select */
+
 #endif /* LINUX_PCI_REGS_H */
diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h
index 541268c..2fb7859 100644
--- a/linux-headers/asm-arm/kvm.h
+++ b/linux-headers/asm-arm/kvm.h
@@ -84,6 +84,13 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE		0x1000
 #define KVM_VGIC_V2_CPU_SIZE		0x2000
 
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST	2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+
+#define KVM_VGIC_V3_DIST_SIZE		SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2		1 /* CPU uses PSCI v0.2 */
 
diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h
index abeaf40..d45ea28 100644
--- a/linux-headers/asm-x86/unistd_32.h
+++ b/linux-headers/asm-x86/unistd_32.h
@@ -377,5 +377,8 @@
 #define __NR_copy_file_range 377
 #define __NR_preadv2 378
 #define __NR_pwritev2 379
+#define __NR_pkey_mprotect 380
+#define __NR_pkey_alloc 381
+#define __NR_pkey_free 382
 
 #endif /* _ASM_X86_UNISTD_32_H */
diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h
index 73c3d1f..e22db91 100644
--- a/linux-headers/asm-x86/unistd_64.h
+++ b/linux-headers/asm-x86/unistd_64.h
@@ -330,5 +330,8 @@
 #define __NR_copy_file_range 326
 #define __NR_preadv2 327
 #define __NR_pwritev2 328
+#define __NR_pkey_mprotect 329
+#define __NR_pkey_alloc 330
+#define __NR_pkey_free 331
 
 #endif /* _ASM_X86_UNISTD_64_H */
diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h
index e5aea76..84e58b2 100644
--- a/linux-headers/asm-x86/unistd_x32.h
+++ b/linux-headers/asm-x86/unistd_x32.h
@@ -283,6 +283,9 @@
 #define __NR_membarrier (__X32_SYSCALL_BIT + 324)
 #define __NR_mlock2 (__X32_SYSCALL_BIT + 325)
 #define __NR_copy_file_range (__X32_SYSCALL_BIT + 326)
+#define __NR_pkey_mprotect (__X32_SYSCALL_BIT + 329)
+#define __NR_pkey_alloc (__X32_SYSCALL_BIT + 330)
+#define __NR_pkey_free (__X32_SYSCALL_BIT + 331)
 #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512)
 #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513)
 #define __NR_ioctl (__X32_SYSCALL_BIT + 514)
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 4806e06..bb0ed71 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -972,12 +972,19 @@ struct kvm_irqfd {
 	__u8  pad[16];
 };
 
+/* For KVM_CAP_ADJUST_CLOCK */
+
+/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags.  */
+#define KVM_CLOCK_TSC_STABLE		2
+
 struct kvm_clock_data {
 	__u64 clock;
 	__u32 flags;
 	__u32 pad[9];
 };
 
+/* For KVM_CAP_SW_TLB */
+
 #define KVM_MMU_FSL_BOOKE_NOHV		0
 #define KVM_MMU_FSL_BOOKE_HV		1
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [Qemu-devel] [qemu patch V2 1/2] kvm: sync linux headers
@ 2016-11-17 13:24   ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 13:24 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost, Marcelo Tosatti

[-- Attachment #1: sync-linux-headers.patch --]
[-- Type: text/plain, Size: 5447 bytes --]

Import KVM_CLOCK_TSC_STABLE.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

diff --git a/include/standard-headers/linux/input.h b/include/standard-headers/linux/input.h
index 7361a16..b472b85 100644
--- a/include/standard-headers/linux/input.h
+++ b/include/standard-headers/linux/input.h
@@ -245,6 +245,7 @@ struct input_mask {
 #define BUS_SPI			0x1C
 #define BUS_RMI			0x1D
 #define BUS_CEC			0x1E
+#define BUS_INTEL_ISHTP		0x1F
 
 /*
  * MT_TOOL types
diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h
index 4040951..e5a2e68 100644
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@@ -612,6 +612,8 @@
  */
 #define PCI_EXP_DEVCAP2		36	/* Device Capabilities 2 */
 #define  PCI_EXP_DEVCAP2_ARI		0x00000020 /* Alternative Routing-ID */
+#define  PCI_EXP_DEVCAP2_ATOMIC_ROUTE	0x00000040 /* Atomic Op routing */
+#define PCI_EXP_DEVCAP2_ATOMIC_COMP64	0x00000100 /* Atomic 64-bit compare */
 #define  PCI_EXP_DEVCAP2_LTR		0x00000800 /* Latency tolerance reporting */
 #define  PCI_EXP_DEVCAP2_OBFF_MASK	0x000c0000 /* OBFF support mechanism */
 #define  PCI_EXP_DEVCAP2_OBFF_MSG	0x00040000 /* New message signaling */
@@ -619,6 +621,7 @@
 #define PCI_EXP_DEVCTL2		40	/* Device Control 2 */
 #define  PCI_EXP_DEVCTL2_COMP_TIMEOUT	0x000f	/* Completion Timeout Value */
 #define  PCI_EXP_DEVCTL2_ARI		0x0020	/* Alternative Routing-ID */
+#define PCI_EXP_DEVCTL2_ATOMIC_REQ	0x0040	/* Set Atomic requests */
 #define  PCI_EXP_DEVCTL2_IDO_REQ_EN	0x0100	/* Allow IDO for requests */
 #define  PCI_EXP_DEVCTL2_IDO_CMP_EN	0x0200	/* Allow IDO for completions */
 #define  PCI_EXP_DEVCTL2_LTR_EN		0x0400	/* Enable LTR mechanism */
@@ -671,7 +674,8 @@
 #define PCI_EXT_CAP_ID_PMUX	0x1A	/* Protocol Multiplexing */
 #define PCI_EXT_CAP_ID_PASID	0x1B	/* Process Address Space ID */
 #define PCI_EXT_CAP_ID_DPC	0x1D	/* Downstream Port Containment */
-#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_DPC
+#define PCI_EXT_CAP_ID_PTM	0x1F	/* Precision Time Measurement */
+#define PCI_EXT_CAP_ID_MAX	PCI_EXT_CAP_ID_PTM
 
 #define PCI_EXT_CAP_DSN_SIZEOF	12
 #define PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF 40
@@ -964,4 +968,13 @@
 
 #define PCI_EXP_DPC_SOURCE_ID		10	/* DPC Source Identifier */
 
+/* Precision Time Measurement */
+#define PCI_PTM_CAP			0x04	    /* PTM Capability */
+#define  PCI_PTM_CAP_REQ		0x00000001  /* Requester capable */
+#define  PCI_PTM_CAP_ROOT		0x00000004  /* Root capable */
+#define  PCI_PTM_GRANULARITY_MASK	0x0000FF00  /* Clock granularity */
+#define PCI_PTM_CTRL			0x08	    /* PTM Control */
+#define  PCI_PTM_CTRL_ENABLE		0x00000001  /* PTM enable */
+#define  PCI_PTM_CTRL_ROOT		0x00000002  /* Root select */
+
 #endif /* LINUX_PCI_REGS_H */
diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h
index 541268c..2fb7859 100644
--- a/linux-headers/asm-arm/kvm.h
+++ b/linux-headers/asm-arm/kvm.h
@@ -84,6 +84,13 @@ struct kvm_regs {
 #define KVM_VGIC_V2_DIST_SIZE		0x1000
 #define KVM_VGIC_V2_CPU_SIZE		0x2000
 
+/* Supported VGICv3 address types  */
+#define KVM_VGIC_V3_ADDR_TYPE_DIST	2
+#define KVM_VGIC_V3_ADDR_TYPE_REDIST	3
+
+#define KVM_VGIC_V3_DIST_SIZE		SZ_64K
+#define KVM_VGIC_V3_REDIST_SIZE		(2 * SZ_64K)
+
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_PSCI_0_2		1 /* CPU uses PSCI v0.2 */
 
diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h
index abeaf40..d45ea28 100644
--- a/linux-headers/asm-x86/unistd_32.h
+++ b/linux-headers/asm-x86/unistd_32.h
@@ -377,5 +377,8 @@
 #define __NR_copy_file_range 377
 #define __NR_preadv2 378
 #define __NR_pwritev2 379
+#define __NR_pkey_mprotect 380
+#define __NR_pkey_alloc 381
+#define __NR_pkey_free 382
 
 #endif /* _ASM_X86_UNISTD_32_H */
diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h
index 73c3d1f..e22db91 100644
--- a/linux-headers/asm-x86/unistd_64.h
+++ b/linux-headers/asm-x86/unistd_64.h
@@ -330,5 +330,8 @@
 #define __NR_copy_file_range 326
 #define __NR_preadv2 327
 #define __NR_pwritev2 328
+#define __NR_pkey_mprotect 329
+#define __NR_pkey_alloc 330
+#define __NR_pkey_free 331
 
 #endif /* _ASM_X86_UNISTD_64_H */
diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h
index e5aea76..84e58b2 100644
--- a/linux-headers/asm-x86/unistd_x32.h
+++ b/linux-headers/asm-x86/unistd_x32.h
@@ -283,6 +283,9 @@
 #define __NR_membarrier (__X32_SYSCALL_BIT + 324)
 #define __NR_mlock2 (__X32_SYSCALL_BIT + 325)
 #define __NR_copy_file_range (__X32_SYSCALL_BIT + 326)
+#define __NR_pkey_mprotect (__X32_SYSCALL_BIT + 329)
+#define __NR_pkey_alloc (__X32_SYSCALL_BIT + 330)
+#define __NR_pkey_free (__X32_SYSCALL_BIT + 331)
 #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512)
 #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513)
 #define __NR_ioctl (__X32_SYSCALL_BIT + 514)
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 4806e06..bb0ed71 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -972,12 +972,19 @@ struct kvm_irqfd {
 	__u8  pad[16];
 };
 
+/* For KVM_CAP_ADJUST_CLOCK */
+
+/* Do not use 1, KVM_CHECK_EXTENSION returned it before we had flags.  */
+#define KVM_CLOCK_TSC_STABLE		2
+
 struct kvm_clock_data {
 	__u64 clock;
 	__u32 flags;
 	__u32 pad[9];
 };
 
+/* For KVM_CAP_SW_TLB */
+
 #define KVM_MMU_FSL_BOOKE_NOHV		0
 #define KVM_MMU_FSL_BOOKE_HV		1
 

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 13:24 ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-17 13:24   ` Marcelo Tosatti
  -1 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 13:24 UTC (permalink / raw)
  To: kvm
  Cc: Marcelo Tosatti, Eduardo Habkost, Juan Quintela, Radim Krcmar,
	qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini

[-- Attachment #1: kvmclock-advance-clock.patch --]
[-- Type: text/plain, Size: 7736 bytes --]

Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
indicates that KVM_GET_CLOCK returns a value as seen by the guest at
that moment.

For new machine types, use this value rather than reading 
from guest memory.

This reduces kvmclock difference on migration from 5s to 0.1s
(when max_downtime == 5s).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
 include/hw/i386/pc.h   |    5 ++
 target-i386/kvm.c      |    7 +++
 target-i386/kvm_i386.h |    1 
 4 files changed, 107 insertions(+), 14 deletions(-)

v2: 
- improve variable names (Juan)
- consolidate code on kvm_get_clock function (Paolo)
- return mach_use_reliable_get_clock from needed function (Paolo)

Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
@@ -36,6 +36,12 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine supports reliable KVM_GET_CLOCK */
+    bool mach_use_reliable_get_clock;
+
+    /* whether source host supported reliable KVM_GET_CLOCK */
+    bool src_use_reliable_get_clock;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +87,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +110,37 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /* local (running VM) restore */
+        if (s->clock_valid) {
+            /*
+             * if host does not support reliable KVM_GET_CLOCK,
+             * read kvmclock value from memory
+             */
+            if (!kvm_has_adjust_clock_stable()) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        /* migration/savevm/init restore */
+        } else {
+            /*
+             * use s->clock in case machine uses reliable
+             * get clock and source host supported
+             * reliable get clock
+             */
+            if (!(s->mach_use_reliable_get_clock &&
+                  s->src_use_reliable_get_clock)) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +161,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +168,7 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -152,22 +185,69 @@
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    /*
+     * On machine types that support reliable KVM_GET_CLOCK,
+     * if host kernel does provide reliable KVM_GET_CLOCK,
+     * set src_use_reliable_get_clock=true so that destination
+     * avoids reading kvmclock from memory.
+     */
+    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
+        s->src_use_reliable_get_clock = true;
+    }
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/src_use_reliable_get_clock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "mach_use_reliable_get_clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 13:24   ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 13:24 UTC (permalink / raw)
  To: kvm
  Cc: qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini, Juan Quintela,
	Radim Krcmar, Eduardo Habkost, Marcelo Tosatti

[-- Attachment #1: kvmclock-advance-clock.patch --]
[-- Type: text/plain, Size: 7736 bytes --]

Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
indicates that KVM_GET_CLOCK returns a value as seen by the guest at
that moment.

For new machine types, use this value rather than reading 
from guest memory.

This reduces kvmclock difference on migration from 5s to 0.1s
(when max_downtime == 5s).

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
 include/hw/i386/pc.h   |    5 ++
 target-i386/kvm.c      |    7 +++
 target-i386/kvm_i386.h |    1 
 4 files changed, 107 insertions(+), 14 deletions(-)

v2: 
- improve variable names (Juan)
- consolidate code on kvm_get_clock function (Paolo)
- return mach_use_reliable_get_clock from needed function (Paolo)

Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
@@ -36,6 +36,12 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine supports reliable KVM_GET_CLOCK */
+    bool mach_use_reliable_get_clock;
+
+    /* whether source host supported reliable KVM_GET_CLOCK */
+    bool src_use_reliable_get_clock;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +87,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +110,37 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /* local (running VM) restore */
+        if (s->clock_valid) {
+            /*
+             * if host does not support reliable KVM_GET_CLOCK,
+             * read kvmclock value from memory
+             */
+            if (!kvm_has_adjust_clock_stable()) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        /* migration/savevm/init restore */
+        } else {
+            /*
+             * use s->clock in case machine uses reliable
+             * get clock and source host supported
+             * reliable get clock
+             */
+            if (!(s->mach_use_reliable_get_clock &&
+                  s->src_use_reliable_get_clock)) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +161,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +168,7 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -152,22 +185,69 @@
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    /*
+     * On machine types that support reliable KVM_GET_CLOCK,
+     * if host kernel does provide reliable KVM_GET_CLOCK,
+     * set src_use_reliable_get_clock=true so that destination
+     * avoids reading kvmclock from memory.
+     */
+    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
+        s->src_use_reliable_get_clock = true;
+    }
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/src_use_reliable_get_clock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "mach_use_reliable_get_clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 13:24   ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-17 14:11     ` Eduardo Habkost
  -1 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 14:11 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, Juan Quintela, Radim Krcmar, qemu-devel,
	Dr. David Alan Gilbert, Paolo Bonzini

On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> that moment.
> 
> For new machine types, use this value rather than reading 
> from guest memory.
> 
> This reduces kvmclock difference on migration from 5s to 0.1s
> (when max_downtime == 5s).
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
>  include/hw/i386/pc.h   |    5 ++
>  target-i386/kvm.c      |    7 +++
>  target-i386/kvm_i386.h |    1 
>  4 files changed, 107 insertions(+), 14 deletions(-)
> 
> v2: 
> - improve variable names (Juan)
> - consolidate code on kvm_get_clock function (Paolo)
> - return mach_use_reliable_get_clock from needed function (Paolo)
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> @@ -36,6 +36,12 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine supports reliable KVM_GET_CLOCK */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether source host supported reliable KVM_GET_CLOCK */
> +    bool src_use_reliable_get_clock;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +87,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +110,37 @@
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /* local (running VM) restore */
> +        if (s->clock_valid) {
> +            /*
> +             * if host does not support reliable KVM_GET_CLOCK,
> +             * read kvmclock value from memory
> +             */
> +            if (!kvm_has_adjust_clock_stable()) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        /* migration/savevm/init restore */
> +        } else {
> +            /*
> +             * use s->clock in case machine uses reliable
> +             * get clock and source host supported
> +             * reliable get clock
> +             */
> +            if (!(s->mach_use_reliable_get_clock &&
> +                  s->src_use_reliable_get_clock)) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }

The s->mach_use_reliable_get_clock check seems redundant.
src_use_reliable_get_clock is set only if
mach_use_reliable_get_clock is true.

> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
>          }
>  
> +        s->clock_valid = false;
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +161,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +168,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -152,22 +185,69 @@
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /*
> +     * On machine types that support reliable KVM_GET_CLOCK,
> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> +     * set src_use_reliable_get_clock=true so that destination
> +     * avoids reading kvmclock from memory.
> +     */
> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> +        s->src_use_reliable_get_clock = true;
> +    }

It feels fragile to change device state inside the .needed
function. Better to initialize src_use_reliable_get_clock on
kvmclock_realize()?

What exactly ensures src_use_reliable_get_clock is correctly
initialized on the migration destination as well?

> +
> +    return s->mach_use_reliable_get_clock;

If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
simply skip the section?

It looks like mach_use_reliable_get_clock and
src_use_reliable_get_clock could become a single field:

* use_reliable_get_clock ("x-use-reliable-get-clock" property)
  set to true by default (set on DEFINE_PROP_BOOL parameter)
* "x-use-reliable-get-clock" set to false by default on older
  machine-types
* use_reliable_get_clock forced to false on kvmclock_realize() if
  !kvm_has_adjust_clock_stable()
* kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock


> +}
> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/src_use_reliable_get_clock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    s->clock = kvm_get_clock();
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),

QOM property coding style is to use "-" instead of "_". Also,
please prefix it with "x-" to indicate it is not part of QEMU
stable external interface.

> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "mach_use_reliable_get_clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
> 

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 14:11     ` Eduardo Habkost
  0 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 14:11 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> that moment.
> 
> For new machine types, use this value rather than reading 
> from guest memory.
> 
> This reduces kvmclock difference on migration from 5s to 0.1s
> (when max_downtime == 5s).
> 
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> ---
>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
>  include/hw/i386/pc.h   |    5 ++
>  target-i386/kvm.c      |    7 +++
>  target-i386/kvm_i386.h |    1 
>  4 files changed, 107 insertions(+), 14 deletions(-)
> 
> v2: 
> - improve variable names (Juan)
> - consolidate code on kvm_get_clock function (Paolo)
> - return mach_use_reliable_get_clock from needed function (Paolo)
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> @@ -36,6 +36,12 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine supports reliable KVM_GET_CLOCK */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether source host supported reliable KVM_GET_CLOCK */
> +    bool src_use_reliable_get_clock;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +87,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +110,37 @@
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /* local (running VM) restore */
> +        if (s->clock_valid) {
> +            /*
> +             * if host does not support reliable KVM_GET_CLOCK,
> +             * read kvmclock value from memory
> +             */
> +            if (!kvm_has_adjust_clock_stable()) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        /* migration/savevm/init restore */
> +        } else {
> +            /*
> +             * use s->clock in case machine uses reliable
> +             * get clock and source host supported
> +             * reliable get clock
> +             */
> +            if (!(s->mach_use_reliable_get_clock &&
> +                  s->src_use_reliable_get_clock)) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }

The s->mach_use_reliable_get_clock check seems redundant.
src_use_reliable_get_clock is set only if
mach_use_reliable_get_clock is true.

> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
>          }
>  
> +        s->clock_valid = false;
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +161,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +168,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -152,22 +185,69 @@
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    /*
> +     * On machine types that support reliable KVM_GET_CLOCK,
> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> +     * set src_use_reliable_get_clock=true so that destination
> +     * avoids reading kvmclock from memory.
> +     */
> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> +        s->src_use_reliable_get_clock = true;
> +    }

It feels fragile to change device state inside the .needed
function. Better to initialize src_use_reliable_get_clock on
kvmclock_realize()?

What exactly ensures src_use_reliable_get_clock is correctly
initialized on the migration destination as well?

> +
> +    return s->mach_use_reliable_get_clock;

If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
simply skip the section?

It looks like mach_use_reliable_get_clock and
src_use_reliable_get_clock could become a single field:

* use_reliable_get_clock ("x-use-reliable-get-clock" property)
  set to true by default (set on DEFINE_PROP_BOOL parameter)
* "x-use-reliable-get-clock" set to false by default on older
  machine-types
* use_reliable_get_clock forced to false on kvmclock_realize() if
  !kvm_has_adjust_clock_stable()
* kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock


> +}
> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/src_use_reliable_get_clock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    s->clock = kvm_get_clock();
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),

QOM property coding style is to use "-" instead of "_". Also,
please prefix it with "x-" to indicate it is not part of QEMU
stable external interface.

> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "mach_use_reliable_get_clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
> 

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 14:11     ` [Qemu-devel] " Eduardo Habkost
@ 2016-11-17 14:15       ` Paolo Bonzini
  -1 siblings, 0 replies; 22+ messages in thread
From: Paolo Bonzini @ 2016-11-17 14:15 UTC (permalink / raw)
  To: Eduardo Habkost, Marcelo Tosatti
  Cc: Juan Quintela, Radim Krcmar, qemu-devel, kvm, Dr. David Alan Gilbert



On 17/11/2016 15:11, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
>> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
>> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
>> that moment.
>>
>> For new machine types, use this value rather than reading 
>> from guest memory.
>>
>> This reduces kvmclock difference on migration from 5s to 0.1s
>> (when max_downtime == 5s).
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>
>> ---
>>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
>>  include/hw/i386/pc.h   |    5 ++
>>  target-i386/kvm.c      |    7 +++
>>  target-i386/kvm_i386.h |    1 
>>  4 files changed, 107 insertions(+), 14 deletions(-)
>>
>> v2: 
>> - improve variable names (Juan)
>> - consolidate code on kvm_get_clock function (Paolo)
>> - return mach_use_reliable_get_clock from needed function (Paolo)
>>
>> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
>> ===================================================================
>> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
>> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
>> @@ -36,6 +36,12 @@
>>  
>>      uint64_t clock;
>>      bool clock_valid;
>> +
>> +    /* whether machine supports reliable KVM_GET_CLOCK */
>> +    bool mach_use_reliable_get_clock;
>> +
>> +    /* whether source host supported reliable KVM_GET_CLOCK */
>> +    bool src_use_reliable_get_clock;
>>  } KVMClockState;
>>  
>>  struct pvclock_vcpu_time_info {
>> @@ -81,6 +87,19 @@
>>      return nsec + time.system_time;
>>  }
>>  
>> +static uint64_t kvm_get_clock(void)
>> +{
>> +    struct kvm_clock_data data;
>> +    int ret;
>> +
>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> +                abort();
>> +    }
>> +    return data.clock;
>> +}
>> +
>>  static void kvmclock_vm_state_change(void *opaque, int running,
>>                                       RunState state)
>>  {
>> @@ -91,15 +110,37 @@
>>  
>>      if (running) {
>>          struct kvm_clock_data data = {};
>> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
>> +        uint64_t pvclock_via_mem = 0;
>>  
>> -        s->clock_valid = false;
>> +        /* local (running VM) restore */
>> +        if (s->clock_valid) {
>> +            /*
>> +             * if host does not support reliable KVM_GET_CLOCK,
>> +             * read kvmclock value from memory
>> +             */
>> +            if (!kvm_has_adjust_clock_stable()) {
>> +                pvclock_via_mem = kvmclock_current_nsec(s);
>> +            }
>> +        /* migration/savevm/init restore */
>> +        } else {
>> +            /*
>> +             * use s->clock in case machine uses reliable
>> +             * get clock and source host supported
>> +             * reliable get clock
>> +             */
>> +            if (!(s->mach_use_reliable_get_clock &&
>> +                  s->src_use_reliable_get_clock)) {
>> +                pvclock_via_mem = kvmclock_current_nsec(s);
>> +            }
> 
> The s->mach_use_reliable_get_clock check seems redundant.
> src_use_reliable_get_clock is set only if
> mach_use_reliable_get_clock is true.
> 
>> +        }
>>  
>> -        /* We can't rely on the migrated clock value, just discard it */
>> -        if (time_at_migration) {
>> -            s->clock = time_at_migration;
>> +        /* We can't rely on the saved clock value, just discard it */
>> +        if (pvclock_via_mem) {
>> +            s->clock = pvclock_via_mem;
>>          }
>>  
>> +        s->clock_valid = false;
>> +
>>          data.clock = s->clock;
>>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>>          if (ret < 0) {
>> @@ -120,8 +161,6 @@
>>              }
>>          }
>>      } else {
>> -        struct kvm_clock_data data;
>> -        int ret;
>>  
>>          if (s->clock_valid) {
>>              return;
>> @@ -129,13 +168,7 @@
>>  
>>          kvm_synchronize_all_tsc();
>>  
>> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> -        if (ret < 0) {
>> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> -            abort();
>> -        }
>> -        s->clock = data.clock;
>> -
>> +        s->clock = kvm_get_clock();
>>          /*
>>           * If the VM is stopped, declare the clock state valid to
>>           * avoid re-reading it on next vmsave (which would return
>> @@ -152,22 +185,69 @@
>>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>>  }
>>  
>> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
>> +{
>> +    KVMClockState *s = opaque;
>> +
>> +    /*
>> +     * On machine types that support reliable KVM_GET_CLOCK,
>> +     * if host kernel does provide reliable KVM_GET_CLOCK,
>> +     * set src_use_reliable_get_clock=true so that destination
>> +     * avoids reading kvmclock from memory.
>> +     */
>> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
>> +        s->src_use_reliable_get_clock = true;
>> +    }
> 
> It feels fragile to change device state inside the .needed
> function. Better to initialize src_use_reliable_get_clock on
> kvmclock_realize()?
> 
> What exactly ensures src_use_reliable_get_clock is correctly
> initialized on the migration destination as well?
> 
>> +
>> +    return s->mach_use_reliable_get_clock;
> 
> If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> simply skip the section?

This is what I asked for. :)

However, I was proposing a different way to initialize
src_use_reliable_get_clock.  I still have to understand exactly how
Marcelo's algorithm works because (based on the kvmclock code) it's more
trick than it seems.

Paolo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 14:15       ` Paolo Bonzini
  0 siblings, 0 replies; 22+ messages in thread
From: Paolo Bonzini @ 2016-11-17 14:15 UTC (permalink / raw)
  To: Eduardo Habkost, Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Juan Quintela, Radim Krcmar



On 17/11/2016 15:11, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
>> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
>> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
>> that moment.
>>
>> For new machine types, use this value rather than reading 
>> from guest memory.
>>
>> This reduces kvmclock difference on migration from 5s to 0.1s
>> (when max_downtime == 5s).
>>
>> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>>
>> ---
>>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
>>  include/hw/i386/pc.h   |    5 ++
>>  target-i386/kvm.c      |    7 +++
>>  target-i386/kvm_i386.h |    1 
>>  4 files changed, 107 insertions(+), 14 deletions(-)
>>
>> v2: 
>> - improve variable names (Juan)
>> - consolidate code on kvm_get_clock function (Paolo)
>> - return mach_use_reliable_get_clock from needed function (Paolo)
>>
>> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
>> ===================================================================
>> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
>> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
>> @@ -36,6 +36,12 @@
>>  
>>      uint64_t clock;
>>      bool clock_valid;
>> +
>> +    /* whether machine supports reliable KVM_GET_CLOCK */
>> +    bool mach_use_reliable_get_clock;
>> +
>> +    /* whether source host supported reliable KVM_GET_CLOCK */
>> +    bool src_use_reliable_get_clock;
>>  } KVMClockState;
>>  
>>  struct pvclock_vcpu_time_info {
>> @@ -81,6 +87,19 @@
>>      return nsec + time.system_time;
>>  }
>>  
>> +static uint64_t kvm_get_clock(void)
>> +{
>> +    struct kvm_clock_data data;
>> +    int ret;
>> +
>> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> +                abort();
>> +    }
>> +    return data.clock;
>> +}
>> +
>>  static void kvmclock_vm_state_change(void *opaque, int running,
>>                                       RunState state)
>>  {
>> @@ -91,15 +110,37 @@
>>  
>>      if (running) {
>>          struct kvm_clock_data data = {};
>> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
>> +        uint64_t pvclock_via_mem = 0;
>>  
>> -        s->clock_valid = false;
>> +        /* local (running VM) restore */
>> +        if (s->clock_valid) {
>> +            /*
>> +             * if host does not support reliable KVM_GET_CLOCK,
>> +             * read kvmclock value from memory
>> +             */
>> +            if (!kvm_has_adjust_clock_stable()) {
>> +                pvclock_via_mem = kvmclock_current_nsec(s);
>> +            }
>> +        /* migration/savevm/init restore */
>> +        } else {
>> +            /*
>> +             * use s->clock in case machine uses reliable
>> +             * get clock and source host supported
>> +             * reliable get clock
>> +             */
>> +            if (!(s->mach_use_reliable_get_clock &&
>> +                  s->src_use_reliable_get_clock)) {
>> +                pvclock_via_mem = kvmclock_current_nsec(s);
>> +            }
> 
> The s->mach_use_reliable_get_clock check seems redundant.
> src_use_reliable_get_clock is set only if
> mach_use_reliable_get_clock is true.
> 
>> +        }
>>  
>> -        /* We can't rely on the migrated clock value, just discard it */
>> -        if (time_at_migration) {
>> -            s->clock = time_at_migration;
>> +        /* We can't rely on the saved clock value, just discard it */
>> +        if (pvclock_via_mem) {
>> +            s->clock = pvclock_via_mem;
>>          }
>>  
>> +        s->clock_valid = false;
>> +
>>          data.clock = s->clock;
>>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>>          if (ret < 0) {
>> @@ -120,8 +161,6 @@
>>              }
>>          }
>>      } else {
>> -        struct kvm_clock_data data;
>> -        int ret;
>>  
>>          if (s->clock_valid) {
>>              return;
>> @@ -129,13 +168,7 @@
>>  
>>          kvm_synchronize_all_tsc();
>>  
>> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
>> -        if (ret < 0) {
>> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
>> -            abort();
>> -        }
>> -        s->clock = data.clock;
>> -
>> +        s->clock = kvm_get_clock();
>>          /*
>>           * If the VM is stopped, declare the clock state valid to
>>           * avoid re-reading it on next vmsave (which would return
>> @@ -152,22 +185,69 @@
>>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>>  }
>>  
>> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
>> +{
>> +    KVMClockState *s = opaque;
>> +
>> +    /*
>> +     * On machine types that support reliable KVM_GET_CLOCK,
>> +     * if host kernel does provide reliable KVM_GET_CLOCK,
>> +     * set src_use_reliable_get_clock=true so that destination
>> +     * avoids reading kvmclock from memory.
>> +     */
>> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
>> +        s->src_use_reliable_get_clock = true;
>> +    }
> 
> It feels fragile to change device state inside the .needed
> function. Better to initialize src_use_reliable_get_clock on
> kvmclock_realize()?
> 
> What exactly ensures src_use_reliable_get_clock is correctly
> initialized on the migration destination as well?
> 
>> +
>> +    return s->mach_use_reliable_get_clock;
> 
> If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> simply skip the section?

This is what I asked for. :)

However, I was proposing a different way to initialize
src_use_reliable_get_clock.  I still have to understand exactly how
Marcelo's algorithm works because (based on the kvmclock code) it's more
trick than it seems.

Paolo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 14:15       ` [Qemu-devel] " Paolo Bonzini
@ 2016-11-17 16:30         ` Marcelo Tosatti
  -1 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 16:30 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Eduardo Habkost, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 03:15:03PM +0100, Paolo Bonzini wrote:
> 
> 
> On 17/11/2016 15:11, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> >> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> >> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> >> that moment.
> >>
> >> For new machine types, use this value rather than reading 
> >> from guest memory.
> >>
> >> This reduces kvmclock difference on migration from 5s to 0.1s
> >> (when max_downtime == 5s).
> >>
> >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >>
> >> ---
> >>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> >>  include/hw/i386/pc.h   |    5 ++
> >>  target-i386/kvm.c      |    7 +++
> >>  target-i386/kvm_i386.h |    1 
> >>  4 files changed, 107 insertions(+), 14 deletions(-)
> >>
> >> v2: 
> >> - improve variable names (Juan)
> >> - consolidate code on kvm_get_clock function (Paolo)
> >> - return mach_use_reliable_get_clock from needed function (Paolo)
> >>
> >> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> >> ===================================================================
> >> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> >> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> >> @@ -36,6 +36,12 @@
> >>  
> >>      uint64_t clock;
> >>      bool clock_valid;
> >> +
> >> +    /* whether machine supports reliable KVM_GET_CLOCK */
> >> +    bool mach_use_reliable_get_clock;
> >> +
> >> +    /* whether source host supported reliable KVM_GET_CLOCK */
> >> +    bool src_use_reliable_get_clock;
> >>  } KVMClockState;
> >>  
> >>  struct pvclock_vcpu_time_info {
> >> @@ -81,6 +87,19 @@
> >>      return nsec + time.system_time;
> >>  }
> >>  
> >> +static uint64_t kvm_get_clock(void)
> >> +{
> >> +    struct kvm_clock_data data;
> >> +    int ret;
> >> +
> >> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> +    if (ret < 0) {
> >> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> +                abort();
> >> +    }
> >> +    return data.clock;
> >> +}
> >> +
> >>  static void kvmclock_vm_state_change(void *opaque, int running,
> >>                                       RunState state)
> >>  {
> >> @@ -91,15 +110,37 @@
> >>  
> >>      if (running) {
> >>          struct kvm_clock_data data = {};
> >> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> >> +        uint64_t pvclock_via_mem = 0;
> >>  
> >> -        s->clock_valid = false;
> >> +        /* local (running VM) restore */
> >> +        if (s->clock_valid) {
> >> +            /*
> >> +             * if host does not support reliable KVM_GET_CLOCK,
> >> +             * read kvmclock value from memory
> >> +             */
> >> +            if (!kvm_has_adjust_clock_stable()) {
> >> +                pvclock_via_mem = kvmclock_current_nsec(s);
> >> +            }
> >> +        /* migration/savevm/init restore */
> >> +        } else {
> >> +            /*
> >> +             * use s->clock in case machine uses reliable
> >> +             * get clock and source host supported
> >> +             * reliable get clock
> >> +             */
> >> +            if (!(s->mach_use_reliable_get_clock &&
> >> +                  s->src_use_reliable_get_clock)) {
> >> +                pvclock_via_mem = kvmclock_current_nsec(s);
> >> +            }
> > 
> > The s->mach_use_reliable_get_clock check seems redundant.
> > src_use_reliable_get_clock is set only if
> > mach_use_reliable_get_clock is true.
> > 
> >> +        }
> >>  
> >> -        /* We can't rely on the migrated clock value, just discard it */
> >> -        if (time_at_migration) {
> >> -            s->clock = time_at_migration;
> >> +        /* We can't rely on the saved clock value, just discard it */
> >> +        if (pvclock_via_mem) {
> >> +            s->clock = pvclock_via_mem;
> >>          }
> >>  
> >> +        s->clock_valid = false;
> >> +
> >>          data.clock = s->clock;
> >>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >>          if (ret < 0) {
> >> @@ -120,8 +161,6 @@
> >>              }
> >>          }
> >>      } else {
> >> -        struct kvm_clock_data data;
> >> -        int ret;
> >>  
> >>          if (s->clock_valid) {
> >>              return;
> >> @@ -129,13 +168,7 @@
> >>  
> >>          kvm_synchronize_all_tsc();
> >>  
> >> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> -        if (ret < 0) {
> >> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> -            abort();
> >> -        }
> >> -        s->clock = data.clock;
> >> -
> >> +        s->clock = kvm_get_clock();
> >>          /*
> >>           * If the VM is stopped, declare the clock state valid to
> >>           * avoid re-reading it on next vmsave (which would return
> >> @@ -152,22 +185,69 @@
> >>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >>  }
> >>  
> >> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> >> +{
> >> +    KVMClockState *s = opaque;
> >> +
> >> +    /*
> >> +     * On machine types that support reliable KVM_GET_CLOCK,
> >> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> >> +     * set src_use_reliable_get_clock=true so that destination
> >> +     * avoids reading kvmclock from memory.
> >> +     */
> >> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> >> +        s->src_use_reliable_get_clock = true;
> >> +    }
> > 
> > It feels fragile to change device state inside the .needed
> > function. Better to initialize src_use_reliable_get_clock on
> > kvmclock_realize()?

Done. Added a new variable because the code was not complete
(stop/cont also runs locally).

> > What exactly ensures src_use_reliable_get_clock is correctly
> > initialized on the migration destination as well?
> > 
> >> +
> >> +    return s->mach_use_reliable_get_clock;
> > 
> > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > simply skip the section?
> 
> This is what I asked for. :)
> 
> However, I was proposing a different way to initialize
> src_use_reliable_get_clock.  I still have to understand exactly how
> Marcelo's algorithm works because (based on the kvmclock code) it's more
> trick than it seems.
> 
> Paolo

You asked me to return s->mach_use_reliable_get_clock:

>>> +     * avoids reading kvmclock from memory.
>>> +     */
>>> +    if (s->mach_use_reliable_get_clock &&
>>> kvm_has_adjust_clock_stable()) {
>>> +        s->src_use_reliable_get_clock = true;
>>> +    }
>>> +
>>> +    return s->src_use_reliable_get_clock;
>>> +}
>>
>> Here you can just return s->mach_use_reliable_get_clock.
>
> mach_use_reliable_get_clock can be true but host might not support it.

Yes, but the "needed" function is only required to avoid breaking
pc-i440fx-2.7 and earlier.  If you return true here, you can still
migrate a "false" value for src_use_reliable_get_clock.

===================

I don't see why avoid the subsection, since new machine type is 
incompatible anyway. So Eduardo on your suggestion to 
skip sending the subsection, what is the advantage?



^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 16:30         ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 16:30 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Eduardo Habkost, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 03:15:03PM +0100, Paolo Bonzini wrote:
> 
> 
> On 17/11/2016 15:11, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> >> Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> >> indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> >> that moment.
> >>
> >> For new machine types, use this value rather than reading 
> >> from guest memory.
> >>
> >> This reduces kvmclock difference on migration from 5s to 0.1s
> >> (when max_downtime == 5s).
> >>
> >> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> >>
> >> ---
> >>  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> >>  include/hw/i386/pc.h   |    5 ++
> >>  target-i386/kvm.c      |    7 +++
> >>  target-i386/kvm_i386.h |    1 
> >>  4 files changed, 107 insertions(+), 14 deletions(-)
> >>
> >> v2: 
> >> - improve variable names (Juan)
> >> - consolidate code on kvm_get_clock function (Paolo)
> >> - return mach_use_reliable_get_clock from needed function (Paolo)
> >>
> >> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> >> ===================================================================
> >> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> >> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> >> @@ -36,6 +36,12 @@
> >>  
> >>      uint64_t clock;
> >>      bool clock_valid;
> >> +
> >> +    /* whether machine supports reliable KVM_GET_CLOCK */
> >> +    bool mach_use_reliable_get_clock;
> >> +
> >> +    /* whether source host supported reliable KVM_GET_CLOCK */
> >> +    bool src_use_reliable_get_clock;
> >>  } KVMClockState;
> >>  
> >>  struct pvclock_vcpu_time_info {
> >> @@ -81,6 +87,19 @@
> >>      return nsec + time.system_time;
> >>  }
> >>  
> >> +static uint64_t kvm_get_clock(void)
> >> +{
> >> +    struct kvm_clock_data data;
> >> +    int ret;
> >> +
> >> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> +    if (ret < 0) {
> >> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> +                abort();
> >> +    }
> >> +    return data.clock;
> >> +}
> >> +
> >>  static void kvmclock_vm_state_change(void *opaque, int running,
> >>                                       RunState state)
> >>  {
> >> @@ -91,15 +110,37 @@
> >>  
> >>      if (running) {
> >>          struct kvm_clock_data data = {};
> >> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> >> +        uint64_t pvclock_via_mem = 0;
> >>  
> >> -        s->clock_valid = false;
> >> +        /* local (running VM) restore */
> >> +        if (s->clock_valid) {
> >> +            /*
> >> +             * if host does not support reliable KVM_GET_CLOCK,
> >> +             * read kvmclock value from memory
> >> +             */
> >> +            if (!kvm_has_adjust_clock_stable()) {
> >> +                pvclock_via_mem = kvmclock_current_nsec(s);
> >> +            }
> >> +        /* migration/savevm/init restore */
> >> +        } else {
> >> +            /*
> >> +             * use s->clock in case machine uses reliable
> >> +             * get clock and source host supported
> >> +             * reliable get clock
> >> +             */
> >> +            if (!(s->mach_use_reliable_get_clock &&
> >> +                  s->src_use_reliable_get_clock)) {
> >> +                pvclock_via_mem = kvmclock_current_nsec(s);
> >> +            }
> > 
> > The s->mach_use_reliable_get_clock check seems redundant.
> > src_use_reliable_get_clock is set only if
> > mach_use_reliable_get_clock is true.
> > 
> >> +        }
> >>  
> >> -        /* We can't rely on the migrated clock value, just discard it */
> >> -        if (time_at_migration) {
> >> -            s->clock = time_at_migration;
> >> +        /* We can't rely on the saved clock value, just discard it */
> >> +        if (pvclock_via_mem) {
> >> +            s->clock = pvclock_via_mem;
> >>          }
> >>  
> >> +        s->clock_valid = false;
> >> +
> >>          data.clock = s->clock;
> >>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >>          if (ret < 0) {
> >> @@ -120,8 +161,6 @@
> >>              }
> >>          }
> >>      } else {
> >> -        struct kvm_clock_data data;
> >> -        int ret;
> >>  
> >>          if (s->clock_valid) {
> >>              return;
> >> @@ -129,13 +168,7 @@
> >>  
> >>          kvm_synchronize_all_tsc();
> >>  
> >> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> >> -        if (ret < 0) {
> >> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> >> -            abort();
> >> -        }
> >> -        s->clock = data.clock;
> >> -
> >> +        s->clock = kvm_get_clock();
> >>          /*
> >>           * If the VM is stopped, declare the clock state valid to
> >>           * avoid re-reading it on next vmsave (which would return
> >> @@ -152,22 +185,69 @@
> >>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >>  }
> >>  
> >> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> >> +{
> >> +    KVMClockState *s = opaque;
> >> +
> >> +    /*
> >> +     * On machine types that support reliable KVM_GET_CLOCK,
> >> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> >> +     * set src_use_reliable_get_clock=true so that destination
> >> +     * avoids reading kvmclock from memory.
> >> +     */
> >> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> >> +        s->src_use_reliable_get_clock = true;
> >> +    }
> > 
> > It feels fragile to change device state inside the .needed
> > function. Better to initialize src_use_reliable_get_clock on
> > kvmclock_realize()?

Done. Added a new variable because the code was not complete
(stop/cont also runs locally).

> > What exactly ensures src_use_reliable_get_clock is correctly
> > initialized on the migration destination as well?
> > 
> >> +
> >> +    return s->mach_use_reliable_get_clock;
> > 
> > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > simply skip the section?
> 
> This is what I asked for. :)
> 
> However, I was proposing a different way to initialize
> src_use_reliable_get_clock.  I still have to understand exactly how
> Marcelo's algorithm works because (based on the kvmclock code) it's more
> trick than it seems.
> 
> Paolo

You asked me to return s->mach_use_reliable_get_clock:

>>> +     * avoids reading kvmclock from memory.
>>> +     */
>>> +    if (s->mach_use_reliable_get_clock &&
>>> kvm_has_adjust_clock_stable()) {
>>> +        s->src_use_reliable_get_clock = true;
>>> +    }
>>> +
>>> +    return s->src_use_reliable_get_clock;
>>> +}
>>
>> Here you can just return s->mach_use_reliable_get_clock.
>
> mach_use_reliable_get_clock can be true but host might not support it.

Yes, but the "needed" function is only required to avoid breaking
pc-i440fx-2.7 and earlier.  If you return true here, you can still
migrate a "false" value for src_use_reliable_get_clock.

===================

I don't see why avoid the subsection, since new machine type is 
incompatible anyway. So Eduardo on your suggestion to 
skip sending the subsection, what is the advantage?

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 14:11     ` [Qemu-devel] " Eduardo Habkost
@ 2016-11-17 17:15       ` Marcelo Tosatti
  -1 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 17:15 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > that moment.
> > 
> > For new machine types, use this value rather than reading 
> > from guest memory.
> > 
> > This reduces kvmclock difference on migration from 5s to 0.1s
> > (when max_downtime == 5s).
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> >  include/hw/i386/pc.h   |    5 ++
> >  target-i386/kvm.c      |    7 +++
> >  target-i386/kvm_i386.h |    1 
> >  4 files changed, 107 insertions(+), 14 deletions(-)
> > 
> > v2: 
> > - improve variable names (Juan)
> > - consolidate code on kvm_get_clock function (Paolo)
> > - return mach_use_reliable_get_clock from needed function (Paolo)
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > @@ -36,6 +36,12 @@
> >  
> >      uint64_t clock;
> >      bool clock_valid;
> > +
> > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > +    bool mach_use_reliable_get_clock;
> > +
> > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > +    bool src_use_reliable_get_clock;
> >  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -81,6 +87,19 @@
> >      return nsec + time.system_time;
> >  }
> >  
> > +static uint64_t kvm_get_clock(void)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +                abort();
> > +    }
> > +    return data.clock;
> > +}
> > +
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,15 +110,37 @@
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > +        uint64_t pvclock_via_mem = 0;
> >  
> > -        s->clock_valid = false;
> > +        /* local (running VM) restore */
> > +        if (s->clock_valid) {
> > +            /*
> > +             * if host does not support reliable KVM_GET_CLOCK,
> > +             * read kvmclock value from memory
> > +             */
> > +            if (!kvm_has_adjust_clock_stable()) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        /* migration/savevm/init restore */
> > +        } else {
> > +            /*
> > +             * use s->clock in case machine uses reliable
> > +             * get clock and source host supported
> > +             * reliable get clock
> > +             */
> > +            if (!(s->mach_use_reliable_get_clock &&
> > +                  s->src_use_reliable_get_clock)) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> 
> The s->mach_use_reliable_get_clock check seems redundant.
> src_use_reliable_get_clock is set only if
> mach_use_reliable_get_clock is true.

Done.

> > +        }
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* We can't rely on the saved clock value, just discard it */
> > +        if (pvclock_via_mem) {
> > +            s->clock = pvclock_via_mem;
> >          }
> >  
> > +        s->clock_valid = false;
> > +
> >          data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> > @@ -120,8 +161,6 @@
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> >          if (s->clock_valid) {
> >              return;
> > @@ -129,13 +168,7 @@
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        s->clock = kvm_get_clock();
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -152,22 +185,69 @@
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    /*
> > +     * On machine types that support reliable KVM_GET_CLOCK,
> > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > +     * set src_use_reliable_get_clock=true so that destination
> > +     * avoids reading kvmclock from memory.
> > +     */
> > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > +        s->src_use_reliable_get_clock = true;
> > +    }
> 
> It feels fragile to change device state inside the .needed
> function. Better to initialize src_use_reliable_get_clock on
> kvmclock_realize()?
> 
> What exactly ensures src_use_reliable_get_clock is correctly
> initialized on the migration destination as well?

Its initialized to false (because its part of device state).

> 
> > +
> > +    return s->mach_use_reliable_get_clock;
> 
> If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> simply skip the section?
> 
> It looks like mach_use_reliable_get_clock and
> src_use_reliable_get_clock could become a single field:
> 
> * use_reliable_get_clock ("x-use-reliable-get-clock" property)
>   set to true by default (set on DEFINE_PROP_BOOL parameter)
> * "x-use-reliable-get-clock" set to false by default on older
>   machine-types
> * use_reliable_get_clock forced to false on kvmclock_realize() if
>   !kvm_has_adjust_clock_stable()
> * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock

No because use_reliable_get_clock can be false because the destination host
does not support kvm_has_adjust_clock_stable(), but the source host
supports it (so we want to use the value in the first incoming
migration).

So the variable indicates whether the source host supported
migration, not whether the destination supports it.

> 
> > +}
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/src_use_reliable_get_clock",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    s->clock = kvm_get_clock();
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
> > +                      mach_use_reliable_get_clock, true),
> 
> QOM property coding style is to use "-" instead of "_". Also,
> please prefix it with "x-" to indicate it is not part of QEMU
> stable external interface.
> 
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
> > +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
> > @@ -370,6 +370,11 @@
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "mach_use_reliable_get_clock",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > Index: qemu-mig-advance-clock/target-i386/kvm.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
> > @@ -117,6 +117,13 @@
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> > 
> > 
> 
> -- 
> Eduardo

How about the following:

Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
@@ -36,6 +36,12 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine supports reliable KVM_GET_CLOCK */
+    bool mach_use_reliable_get_clock;
+
+    /* whether source host supported reliable KVM_GET_CLOCK */
+    bool src_use_reliable_get_clock;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +87,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +110,36 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /* local (running VM) restore */
+        if (s->clock_valid) {
+            /*
+             * if host does not support reliable KVM_GET_CLOCK,
+             * read kvmclock value from memory
+             */
+            if (!kvm_has_adjust_clock_stable()) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        /* migration/savevm/init restore */
+        } else {
+            /*
+             * use s->clock in case machine uses reliable
+             * get clock and source host supported
+             * reliable get clock
+             */
+            if (!s->src_use_reliable_get_clock) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +160,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +167,7 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -149,25 +181,72 @@
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
+    /*
+     * On machine types that support reliable KVM_GET_CLOCK,
+     * if host kernel does provide reliable KVM_GET_CLOCK,
+     * set src_use_reliable_get_clock=true so that destination
+     * avoids reading kvmclock from memory.
+     */
+    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
+        s->src_use_reliable_get_clock = true;
+    }
+
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/src_use_reliable_get_clock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "x-mach-use-reliable-get-clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 17:15       ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 17:15 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > that moment.
> > 
> > For new machine types, use this value rather than reading 
> > from guest memory.
> > 
> > This reduces kvmclock difference on migration from 5s to 0.1s
> > (when max_downtime == 5s).
> > 
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > 
> > ---
> >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> >  include/hw/i386/pc.h   |    5 ++
> >  target-i386/kvm.c      |    7 +++
> >  target-i386/kvm_i386.h |    1 
> >  4 files changed, 107 insertions(+), 14 deletions(-)
> > 
> > v2: 
> > - improve variable names (Juan)
> > - consolidate code on kvm_get_clock function (Paolo)
> > - return mach_use_reliable_get_clock from needed function (Paolo)
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > @@ -36,6 +36,12 @@
> >  
> >      uint64_t clock;
> >      bool clock_valid;
> > +
> > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > +    bool mach_use_reliable_get_clock;
> > +
> > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > +    bool src_use_reliable_get_clock;
> >  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -81,6 +87,19 @@
> >      return nsec + time.system_time;
> >  }
> >  
> > +static uint64_t kvm_get_clock(void)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +                abort();
> > +    }
> > +    return data.clock;
> > +}
> > +
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,15 +110,37 @@
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > +        uint64_t pvclock_via_mem = 0;
> >  
> > -        s->clock_valid = false;
> > +        /* local (running VM) restore */
> > +        if (s->clock_valid) {
> > +            /*
> > +             * if host does not support reliable KVM_GET_CLOCK,
> > +             * read kvmclock value from memory
> > +             */
> > +            if (!kvm_has_adjust_clock_stable()) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        /* migration/savevm/init restore */
> > +        } else {
> > +            /*
> > +             * use s->clock in case machine uses reliable
> > +             * get clock and source host supported
> > +             * reliable get clock
> > +             */
> > +            if (!(s->mach_use_reliable_get_clock &&
> > +                  s->src_use_reliable_get_clock)) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> 
> The s->mach_use_reliable_get_clock check seems redundant.
> src_use_reliable_get_clock is set only if
> mach_use_reliable_get_clock is true.

Done.

> > +        }
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* We can't rely on the saved clock value, just discard it */
> > +        if (pvclock_via_mem) {
> > +            s->clock = pvclock_via_mem;
> >          }
> >  
> > +        s->clock_valid = false;
> > +
> >          data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> > @@ -120,8 +161,6 @@
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> >          if (s->clock_valid) {
> >              return;
> > @@ -129,13 +168,7 @@
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        s->clock = kvm_get_clock();
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -152,22 +185,69 @@
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    /*
> > +     * On machine types that support reliable KVM_GET_CLOCK,
> > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > +     * set src_use_reliable_get_clock=true so that destination
> > +     * avoids reading kvmclock from memory.
> > +     */
> > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > +        s->src_use_reliable_get_clock = true;
> > +    }
> 
> It feels fragile to change device state inside the .needed
> function. Better to initialize src_use_reliable_get_clock on
> kvmclock_realize()?
> 
> What exactly ensures src_use_reliable_get_clock is correctly
> initialized on the migration destination as well?

Its initialized to false (because its part of device state).

> 
> > +
> > +    return s->mach_use_reliable_get_clock;
> 
> If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> simply skip the section?
> 
> It looks like mach_use_reliable_get_clock and
> src_use_reliable_get_clock could become a single field:
> 
> * use_reliable_get_clock ("x-use-reliable-get-clock" property)
>   set to true by default (set on DEFINE_PROP_BOOL parameter)
> * "x-use-reliable-get-clock" set to false by default on older
>   machine-types
> * use_reliable_get_clock forced to false on kvmclock_realize() if
>   !kvm_has_adjust_clock_stable()
> * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock

No because use_reliable_get_clock can be false because the destination host
does not support kvm_has_adjust_clock_stable(), but the source host
supports it (so we want to use the value in the first incoming
migration).

So the variable indicates whether the source host supported
migration, not whether the destination supports it.

> 
> > +}
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/src_use_reliable_get_clock",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    s->clock = kvm_get_clock();
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("mach_use_reliable_get_clock", KVMClockState,
> > +                      mach_use_reliable_get_clock, true),
> 
> QOM property coding style is to use "-" instead of "_". Also,
> please prefix it with "x-" to indicate it is not part of QEMU
> stable external interface.
> 
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-14 10:40:39.748116312 -0200
> > +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-14 10:40:48.059128649 -0200
> > @@ -370,6 +370,11 @@
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "mach_use_reliable_get_clock",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > Index: qemu-mig-advance-clock/target-i386/kvm.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-14 10:40:39.750116314 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-14 10:40:48.061128652 -0200
> > @@ -117,6 +117,13 @@
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-14 10:40:39.751116316 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-14 10:40:48.061128652 -0200
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> > 
> > 
> 
> -- 
> Eduardo

How about the following:

Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
===================================================================
--- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
@@ -36,6 +36,12 @@
 
     uint64_t clock;
     bool clock_valid;
+
+    /* whether machine supports reliable KVM_GET_CLOCK */
+    bool mach_use_reliable_get_clock;
+
+    /* whether source host supported reliable KVM_GET_CLOCK */
+    bool src_use_reliable_get_clock;
 } KVMClockState;
 
 struct pvclock_vcpu_time_info {
@@ -81,6 +87,19 @@
     return nsec + time.system_time;
 }
 
+static uint64_t kvm_get_clock(void)
+{
+    struct kvm_clock_data data;
+    int ret;
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
+    if (ret < 0) {
+        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
+                abort();
+    }
+    return data.clock;
+}
+
 static void kvmclock_vm_state_change(void *opaque, int running,
                                      RunState state)
 {
@@ -91,15 +110,36 @@
 
     if (running) {
         struct kvm_clock_data data = {};
-        uint64_t time_at_migration = kvmclock_current_nsec(s);
+        uint64_t pvclock_via_mem = 0;
 
-        s->clock_valid = false;
+        /* local (running VM) restore */
+        if (s->clock_valid) {
+            /*
+             * if host does not support reliable KVM_GET_CLOCK,
+             * read kvmclock value from memory
+             */
+            if (!kvm_has_adjust_clock_stable()) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        /* migration/savevm/init restore */
+        } else {
+            /*
+             * use s->clock in case machine uses reliable
+             * get clock and source host supported
+             * reliable get clock
+             */
+            if (!s->src_use_reliable_get_clock) {
+                pvclock_via_mem = kvmclock_current_nsec(s);
+            }
+        }
 
-        /* We can't rely on the migrated clock value, just discard it */
-        if (time_at_migration) {
-            s->clock = time_at_migration;
+        /* We can't rely on the saved clock value, just discard it */
+        if (pvclock_via_mem) {
+            s->clock = pvclock_via_mem;
         }
 
+        s->clock_valid = false;
+
         data.clock = s->clock;
         ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
         if (ret < 0) {
@@ -120,8 +160,6 @@
             }
         }
     } else {
-        struct kvm_clock_data data;
-        int ret;
 
         if (s->clock_valid) {
             return;
@@ -129,13 +167,7 @@
 
         kvm_synchronize_all_tsc();
 
-        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
-        if (ret < 0) {
-            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
-            abort();
-        }
-        s->clock = data.clock;
-
+        s->clock = kvm_get_clock();
         /*
          * If the VM is stopped, declare the clock state valid to
          * avoid re-reading it on next vmsave (which would return
@@ -149,25 +181,72 @@
 {
     KVMClockState *s = KVM_CLOCK(dev);
 
+    /*
+     * On machine types that support reliable KVM_GET_CLOCK,
+     * if host kernel does provide reliable KVM_GET_CLOCK,
+     * set src_use_reliable_get_clock=true so that destination
+     * avoids reading kvmclock from memory.
+     */
+    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
+        s->src_use_reliable_get_clock = true;
+    }
+
     qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
 }
 
+static bool kvmclock_src_use_reliable_get_clock(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    return s->mach_use_reliable_get_clock;
+}
+
+static const VMStateDescription kvmclock_reliable_get_clock = {
+    .name = "kvmclock/src_use_reliable_get_clock",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = kvmclock_src_use_reliable_get_clock,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void kvmclock_pre_save(void *opaque)
+{
+    KVMClockState *s = opaque;
+
+    s->clock = kvm_get_clock();
+}
+
 static const VMStateDescription kvmclock_vmsd = {
     .name = "kvmclock",
     .version_id = 1,
     .minimum_version_id = 1,
+    .pre_save = kvmclock_pre_save,
     .fields = (VMStateField[]) {
         VMSTATE_UINT64(clock, KVMClockState),
         VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &kvmclock_reliable_get_clock,
+        NULL
     }
 };
 
+static Property kvmclock_properties[] = {
+    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
+                      mach_use_reliable_get_clock, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
 static void kvmclock_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
 
     dc->realize = kvmclock_realize;
     dc->vmsd = &kvmclock_vmsd;
+    dc->props = kvmclock_properties;
 }
 
 static const TypeInfo kvmclock_info = {
Index: qemu-mig-advance-clock/include/hw/i386/pc.h
===================================================================
--- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
+++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
@@ -370,6 +370,11 @@
 #define PC_COMPAT_2_7 \
     HW_COMPAT_2_7 \
     {\
+        .driver   = "kvmclock",\
+        .property = "x-mach-use-reliable-get-clock",\
+        .value    = "off",\
+    },\
+    {\
         .driver   = TYPE_X86_CPU,\
         .property = "l3-cache",\
         .value    = "off",\
Index: qemu-mig-advance-clock/target-i386/kvm.c
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
+++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
@@ -117,6 +117,13 @@
     return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
 }
 
+bool kvm_has_adjust_clock_stable(void)
+{
+    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
+
+    return (ret == KVM_CLOCK_TSC_STABLE);
+}
+
 bool kvm_allows_irq0_override(void)
 {
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
===================================================================
--- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
+++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
@@ -17,6 +17,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+bool kvm_has_adjust_clock_stable(void);
 void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 16:30         ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-17 17:41           ` Eduardo Habkost
  -1 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 17:41 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 02:30:52PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 03:15:03PM +0100, Paolo Bonzini wrote:
> > On 17/11/2016 15:11, Eduardo Habkost wrote:
> > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
[...]
> > > What exactly ensures src_use_reliable_get_clock is correctly
> > > initialized on the migration destination as well?
> > > 
> > >> +
> > >> +    return s->mach_use_reliable_get_clock;
> > > 
> > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > simply skip the section?
> > 
> > This is what I asked for. :)
> > 
> > However, I was proposing a different way to initialize
> > src_use_reliable_get_clock.  I still have to understand exactly how
> > Marcelo's algorithm works because (based on the kvmclock code) it's more
> > trick than it seems.
> > 
> > Paolo
> 
> You asked me to return s->mach_use_reliable_get_clock:
> 
> >>> +     * avoids reading kvmclock from memory.
> >>> +     */
> >>> +    if (s->mach_use_reliable_get_clock &&
> >>> kvm_has_adjust_clock_stable()) {
> >>> +        s->src_use_reliable_get_clock = true;
> >>> +    }
> >>> +
> >>> +    return s->src_use_reliable_get_clock;
> >>> +}
> >>
> >> Here you can just return s->mach_use_reliable_get_clock.
> >
> > mach_use_reliable_get_clock can be true but host might not support it.
> 
> Yes, but the "needed" function is only required to avoid breaking
> pc-i440fx-2.7 and earlier.  If you return true here, you can still
> migrate a "false" value for src_use_reliable_get_clock.
> 
> ===================
> 
> I don't see why avoid the subsection, since new machine type is 
> incompatible anyway. So Eduardo on your suggestion to 
> skip sending the subsection, what is the advantage?

I believe the main (only?) advantage is that it can make the code
simpler: if you simply skip the section if the field is false,
you don't even need two separate fields.

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 17:41           ` Eduardo Habkost
  0 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 17:41 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Paolo Bonzini, kvm, qemu-devel, Dr. David Alan Gilbert,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 02:30:52PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 03:15:03PM +0100, Paolo Bonzini wrote:
> > On 17/11/2016 15:11, Eduardo Habkost wrote:
> > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
[...]
> > > What exactly ensures src_use_reliable_get_clock is correctly
> > > initialized on the migration destination as well?
> > > 
> > >> +
> > >> +    return s->mach_use_reliable_get_clock;
> > > 
> > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > simply skip the section?
> > 
> > This is what I asked for. :)
> > 
> > However, I was proposing a different way to initialize
> > src_use_reliable_get_clock.  I still have to understand exactly how
> > Marcelo's algorithm works because (based on the kvmclock code) it's more
> > trick than it seems.
> > 
> > Paolo
> 
> You asked me to return s->mach_use_reliable_get_clock:
> 
> >>> +     * avoids reading kvmclock from memory.
> >>> +     */
> >>> +    if (s->mach_use_reliable_get_clock &&
> >>> kvm_has_adjust_clock_stable()) {
> >>> +        s->src_use_reliable_get_clock = true;
> >>> +    }
> >>> +
> >>> +    return s->src_use_reliable_get_clock;
> >>> +}
> >>
> >> Here you can just return s->mach_use_reliable_get_clock.
> >
> > mach_use_reliable_get_clock can be true but host might not support it.
> 
> Yes, but the "needed" function is only required to avoid breaking
> pc-i440fx-2.7 and earlier.  If you return true here, you can still
> migrate a "false" value for src_use_reliable_get_clock.
> 
> ===================
> 
> I don't see why avoid the subsection, since new machine type is 
> incompatible anyway. So Eduardo on your suggestion to 
> skip sending the subsection, what is the advantage?

I believe the main (only?) advantage is that it can make the code
simpler: if you simply skip the section if the field is false,
you don't even need two separate fields.

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 17:15       ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-17 18:16         ` Eduardo Habkost
  -1 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 18:16 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, Juan Quintela, Radim Krcmar, qemu-devel,
	Dr. David Alan Gilbert, Paolo Bonzini

On Thu, Nov 17, 2016 at 03:15:35PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > that moment.
> > > 
> > > For new machine types, use this value rather than reading 
> > > from guest memory.
> > > 
> > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > (when max_downtime == 5s).
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > ---
> > >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> > >  include/hw/i386/pc.h   |    5 ++
> > >  target-i386/kvm.c      |    7 +++
> > >  target-i386/kvm_i386.h |    1 
> > >  4 files changed, 107 insertions(+), 14 deletions(-)
> > > 
> > > v2: 
> > > - improve variable names (Juan)
> > > - consolidate code on kvm_get_clock function (Paolo)
> > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > 
> > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > ===================================================================
> > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > > @@ -36,6 +36,12 @@
> > >  
> > >      uint64_t clock;
> > >      bool clock_valid;
> > > +
> > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > +    bool mach_use_reliable_get_clock;
> > > +
> > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > +    bool src_use_reliable_get_clock;
> > >  } KVMClockState;
> > >  
> > >  struct pvclock_vcpu_time_info {
> > > @@ -81,6 +87,19 @@
> > >      return nsec + time.system_time;
> > >  }
> > >  
> > > +static uint64_t kvm_get_clock(void)
> > > +{
> > > +    struct kvm_clock_data data;
> > > +    int ret;
> > > +
> > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > +    if (ret < 0) {
> > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > +                abort();
> > > +    }
> > > +    return data.clock;
> > > +}
> > > +
> > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > >                                       RunState state)
> > >  {
> > > @@ -91,15 +110,37 @@
> > >  
> > >      if (running) {
> > >          struct kvm_clock_data data = {};
> > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > +        uint64_t pvclock_via_mem = 0;
> > >  
> > > -        s->clock_valid = false;
> > > +        /* local (running VM) restore */
> > > +        if (s->clock_valid) {
> > > +            /*
> > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > +             * read kvmclock value from memory
> > > +             */
> > > +            if (!kvm_has_adjust_clock_stable()) {
> > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > +            }
> > > +        /* migration/savevm/init restore */
> > > +        } else {
> > > +            /*
> > > +             * use s->clock in case machine uses reliable
> > > +             * get clock and source host supported
> > > +             * reliable get clock
> > > +             */
> > > +            if (!(s->mach_use_reliable_get_clock &&
> > > +                  s->src_use_reliable_get_clock)) {
> > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > +            }
> > 
> > The s->mach_use_reliable_get_clock check seems redundant.
> > src_use_reliable_get_clock is set only if
> > mach_use_reliable_get_clock is true.
> 
> Done.
> 
> > > +        }
> > >  
> > > -        /* We can't rely on the migrated clock value, just discard it */
> > > -        if (time_at_migration) {
> > > -            s->clock = time_at_migration;
> > > +        /* We can't rely on the saved clock value, just discard it */
> > > +        if (pvclock_via_mem) {
> > > +            s->clock = pvclock_via_mem;
> > >          }
> > >  
> > > +        s->clock_valid = false;
> > > +
> > >          data.clock = s->clock;
> > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > >          if (ret < 0) {
> > > @@ -120,8 +161,6 @@
> > >              }
> > >          }
> > >      } else {
> > > -        struct kvm_clock_data data;
> > > -        int ret;
> > >  
> > >          if (s->clock_valid) {
> > >              return;
> > > @@ -129,13 +168,7 @@
> > >  
> > >          kvm_synchronize_all_tsc();
> > >  
> > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > -        if (ret < 0) {
> > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > -            abort();
> > > -        }
> > > -        s->clock = data.clock;
> > > -
> > > +        s->clock = kvm_get_clock();
> > >          /*
> > >           * If the VM is stopped, declare the clock state valid to
> > >           * avoid re-reading it on next vmsave (which would return
> > > @@ -152,22 +185,69 @@
> > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > >  }
> > >  
> > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    /*
> > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > +     * set src_use_reliable_get_clock=true so that destination
> > > +     * avoids reading kvmclock from memory.
> > > +     */
> > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > +        s->src_use_reliable_get_clock = true;
> > > +    }
> > 
> > It feels fragile to change device state inside the .needed
> > function. Better to initialize src_use_reliable_get_clock on
> > kvmclock_realize()?
> > 
> > What exactly ensures src_use_reliable_get_clock is correctly
> > initialized on the migration destination as well?
> 
> Its initialized to false (because its part of device state).

Do you mean it is always going to be false on the destination
host? What if .needed is called by other code? .needed shouldn't
have any side-effects.

Also, the difference between the variables on the destination and
the source is very confusing, see other comments below:

> 
> > 
> > > +
> > > +    return s->mach_use_reliable_get_clock;
> > 
> > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > simply skip the section?
> > 
> > It looks like mach_use_reliable_get_clock and
> > src_use_reliable_get_clock could become a single field:
> > 
> > * use_reliable_get_clock ("x-use-reliable-get-clock" property)
> >   set to true by default (set on DEFINE_PROP_BOOL parameter)
> > * "x-use-reliable-get-clock" set to false by default on older
> >   machine-types
> > * use_reliable_get_clock forced to false on kvmclock_realize() if
> >   !kvm_has_adjust_clock_stable()
> > * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock
> 
> No because use_reliable_get_clock can be false because the destination host
> does not support kvm_has_adjust_clock_stable(), but the source host
> supports it (so we want to use the value in the first incoming
> migration).
> 

Are you talking about the value of the fields on the source host,
or on the destination host? In this case, you mean that
use_reliable_get_clock should be false on the destination, right?

> So the variable indicates whether the source host supported
> migration, not whether the destination supports it.

Good, that's the whole point of the field, isn't it? I will try
to summarize all cases below. Please correct me if something is
incorrect.

--------+------------+---------------+-----------+------------+
        | kvm_has_adj_clock_stable() | use_reliable_get_clock |
machine | src        | dst           | src       | dst        |
--------+------------+---------------+-----------+------------+
pc-2.8  | false      | false         | false     | false      |
pc-2.8  | false      | true          | false     | false (*)  |
pc-2.8  | true       | false         | true      | true (**)  |
pc-2.8  | true       | true          | true      | true       |
--------+------------+---------------+-----------+------------+
pc-2.7  | (any)      | (any)         | false     | false      |
--------+------------+---------------+-----------+------------+

(*) I guess this is where things would break if we skip the
    subsection (and use a single field). If the subsection is
    skipped, use_reliable_get_clock would be set to true on the
    destination. (See my comment below on the new patch you sent)

(**) Destination can still do the right thing here, right?



> 
[...]
> 
> How about the following:
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> @@ -36,6 +36,12 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine supports reliable KVM_GET_CLOCK */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether source host supported reliable KVM_GET_CLOCK */
> +    bool src_use_reliable_get_clock;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +87,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +110,36 @@
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /* local (running VM) restore */
> +        if (s->clock_valid) {
> +            /*
> +             * if host does not support reliable KVM_GET_CLOCK,
> +             * read kvmclock value from memory
> +             */
> +            if (!kvm_has_adjust_clock_stable()) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        /* migration/savevm/init restore */
> +        } else {
> +            /*
> +             * use s->clock in case machine uses reliable
> +             * get clock and source host supported
> +             * reliable get clock
> +             */
> +            if (!s->src_use_reliable_get_clock) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
>          }
>  
> +        s->clock_valid = false;
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +160,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +167,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -149,25 +181,72 @@
>  {
>      KVMClockState *s = KVM_CLOCK(dev);
>  
> +    /*
> +     * On machine types that support reliable KVM_GET_CLOCK,
> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> +     * set src_use_reliable_get_clock=true so that destination
> +     * avoids reading kvmclock from memory.
> +     */
> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> +        s->src_use_reliable_get_clock = true;
> +    }
> +
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->mach_use_reliable_get_clock;
> +}

I didn't review kvmclock_vm_state_change() and
kvm_has_adjust_clock_stable(), but the rules to migrate
src_use_reliable_get_clock look good now, and seem to match the
table above.

> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/src_use_reliable_get_clock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    s->clock = kvm_get_clock();
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "x-mach-use-reliable-get-clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 18:16         ` Eduardo Habkost
  0 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 18:16 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 03:15:35PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > that moment.
> > > 
> > > For new machine types, use this value rather than reading 
> > > from guest memory.
> > > 
> > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > (when max_downtime == 5s).
> > > 
> > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > 
> > > ---
> > >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> > >  include/hw/i386/pc.h   |    5 ++
> > >  target-i386/kvm.c      |    7 +++
> > >  target-i386/kvm_i386.h |    1 
> > >  4 files changed, 107 insertions(+), 14 deletions(-)
> > > 
> > > v2: 
> > > - improve variable names (Juan)
> > > - consolidate code on kvm_get_clock function (Paolo)
> > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > 
> > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > ===================================================================
> > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > > @@ -36,6 +36,12 @@
> > >  
> > >      uint64_t clock;
> > >      bool clock_valid;
> > > +
> > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > +    bool mach_use_reliable_get_clock;
> > > +
> > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > +    bool src_use_reliable_get_clock;
> > >  } KVMClockState;
> > >  
> > >  struct pvclock_vcpu_time_info {
> > > @@ -81,6 +87,19 @@
> > >      return nsec + time.system_time;
> > >  }
> > >  
> > > +static uint64_t kvm_get_clock(void)
> > > +{
> > > +    struct kvm_clock_data data;
> > > +    int ret;
> > > +
> > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > +    if (ret < 0) {
> > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > +                abort();
> > > +    }
> > > +    return data.clock;
> > > +}
> > > +
> > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > >                                       RunState state)
> > >  {
> > > @@ -91,15 +110,37 @@
> > >  
> > >      if (running) {
> > >          struct kvm_clock_data data = {};
> > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > +        uint64_t pvclock_via_mem = 0;
> > >  
> > > -        s->clock_valid = false;
> > > +        /* local (running VM) restore */
> > > +        if (s->clock_valid) {
> > > +            /*
> > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > +             * read kvmclock value from memory
> > > +             */
> > > +            if (!kvm_has_adjust_clock_stable()) {
> > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > +            }
> > > +        /* migration/savevm/init restore */
> > > +        } else {
> > > +            /*
> > > +             * use s->clock in case machine uses reliable
> > > +             * get clock and source host supported
> > > +             * reliable get clock
> > > +             */
> > > +            if (!(s->mach_use_reliable_get_clock &&
> > > +                  s->src_use_reliable_get_clock)) {
> > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > +            }
> > 
> > The s->mach_use_reliable_get_clock check seems redundant.
> > src_use_reliable_get_clock is set only if
> > mach_use_reliable_get_clock is true.
> 
> Done.
> 
> > > +        }
> > >  
> > > -        /* We can't rely on the migrated clock value, just discard it */
> > > -        if (time_at_migration) {
> > > -            s->clock = time_at_migration;
> > > +        /* We can't rely on the saved clock value, just discard it */
> > > +        if (pvclock_via_mem) {
> > > +            s->clock = pvclock_via_mem;
> > >          }
> > >  
> > > +        s->clock_valid = false;
> > > +
> > >          data.clock = s->clock;
> > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > >          if (ret < 0) {
> > > @@ -120,8 +161,6 @@
> > >              }
> > >          }
> > >      } else {
> > > -        struct kvm_clock_data data;
> > > -        int ret;
> > >  
> > >          if (s->clock_valid) {
> > >              return;
> > > @@ -129,13 +168,7 @@
> > >  
> > >          kvm_synchronize_all_tsc();
> > >  
> > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > -        if (ret < 0) {
> > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > -            abort();
> > > -        }
> > > -        s->clock = data.clock;
> > > -
> > > +        s->clock = kvm_get_clock();
> > >          /*
> > >           * If the VM is stopped, declare the clock state valid to
> > >           * avoid re-reading it on next vmsave (which would return
> > > @@ -152,22 +185,69 @@
> > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > >  }
> > >  
> > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > +{
> > > +    KVMClockState *s = opaque;
> > > +
> > > +    /*
> > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > +     * set src_use_reliable_get_clock=true so that destination
> > > +     * avoids reading kvmclock from memory.
> > > +     */
> > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > +        s->src_use_reliable_get_clock = true;
> > > +    }
> > 
> > It feels fragile to change device state inside the .needed
> > function. Better to initialize src_use_reliable_get_clock on
> > kvmclock_realize()?
> > 
> > What exactly ensures src_use_reliable_get_clock is correctly
> > initialized on the migration destination as well?
> 
> Its initialized to false (because its part of device state).

Do you mean it is always going to be false on the destination
host? What if .needed is called by other code? .needed shouldn't
have any side-effects.

Also, the difference between the variables on the destination and
the source is very confusing, see other comments below:

> 
> > 
> > > +
> > > +    return s->mach_use_reliable_get_clock;
> > 
> > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > simply skip the section?
> > 
> > It looks like mach_use_reliable_get_clock and
> > src_use_reliable_get_clock could become a single field:
> > 
> > * use_reliable_get_clock ("x-use-reliable-get-clock" property)
> >   set to true by default (set on DEFINE_PROP_BOOL parameter)
> > * "x-use-reliable-get-clock" set to false by default on older
> >   machine-types
> > * use_reliable_get_clock forced to false on kvmclock_realize() if
> >   !kvm_has_adjust_clock_stable()
> > * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock
> 
> No because use_reliable_get_clock can be false because the destination host
> does not support kvm_has_adjust_clock_stable(), but the source host
> supports it (so we want to use the value in the first incoming
> migration).
> 

Are you talking about the value of the fields on the source host,
or on the destination host? In this case, you mean that
use_reliable_get_clock should be false on the destination, right?

> So the variable indicates whether the source host supported
> migration, not whether the destination supports it.

Good, that's the whole point of the field, isn't it? I will try
to summarize all cases below. Please correct me if something is
incorrect.

--------+------------+---------------+-----------+------------+
        | kvm_has_adj_clock_stable() | use_reliable_get_clock |
machine | src        | dst           | src       | dst        |
--------+------------+---------------+-----------+------------+
pc-2.8  | false      | false         | false     | false      |
pc-2.8  | false      | true          | false     | false (*)  |
pc-2.8  | true       | false         | true      | true (**)  |
pc-2.8  | true       | true          | true      | true       |
--------+------------+---------------+-----------+------------+
pc-2.7  | (any)      | (any)         | false     | false      |
--------+------------+---------------+-----------+------------+

(*) I guess this is where things would break if we skip the
    subsection (and use a single field). If the subsection is
    skipped, use_reliable_get_clock would be set to true on the
    destination. (See my comment below on the new patch you sent)

(**) Destination can still do the right thing here, right?



> 
[...]
> 
> How about the following:
> 
> Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> @@ -36,6 +36,12 @@
>  
>      uint64_t clock;
>      bool clock_valid;
> +
> +    /* whether machine supports reliable KVM_GET_CLOCK */
> +    bool mach_use_reliable_get_clock;
> +
> +    /* whether source host supported reliable KVM_GET_CLOCK */
> +    bool src_use_reliable_get_clock;
>  } KVMClockState;
>  
>  struct pvclock_vcpu_time_info {
> @@ -81,6 +87,19 @@
>      return nsec + time.system_time;
>  }
>  
> +static uint64_t kvm_get_clock(void)
> +{
> +    struct kvm_clock_data data;
> +    int ret;
> +
> +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> +    if (ret < 0) {
> +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> +                abort();
> +    }
> +    return data.clock;
> +}
> +
>  static void kvmclock_vm_state_change(void *opaque, int running,
>                                       RunState state)
>  {
> @@ -91,15 +110,36 @@
>  
>      if (running) {
>          struct kvm_clock_data data = {};
> -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> +        uint64_t pvclock_via_mem = 0;
>  
> -        s->clock_valid = false;
> +        /* local (running VM) restore */
> +        if (s->clock_valid) {
> +            /*
> +             * if host does not support reliable KVM_GET_CLOCK,
> +             * read kvmclock value from memory
> +             */
> +            if (!kvm_has_adjust_clock_stable()) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        /* migration/savevm/init restore */
> +        } else {
> +            /*
> +             * use s->clock in case machine uses reliable
> +             * get clock and source host supported
> +             * reliable get clock
> +             */
> +            if (!s->src_use_reliable_get_clock) {
> +                pvclock_via_mem = kvmclock_current_nsec(s);
> +            }
> +        }
>  
> -        /* We can't rely on the migrated clock value, just discard it */
> -        if (time_at_migration) {
> -            s->clock = time_at_migration;
> +        /* We can't rely on the saved clock value, just discard it */
> +        if (pvclock_via_mem) {
> +            s->clock = pvclock_via_mem;
>          }
>  
> +        s->clock_valid = false;
> +
>          data.clock = s->clock;
>          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
>          if (ret < 0) {
> @@ -120,8 +160,6 @@
>              }
>          }
>      } else {
> -        struct kvm_clock_data data;
> -        int ret;
>  
>          if (s->clock_valid) {
>              return;
> @@ -129,13 +167,7 @@
>  
>          kvm_synchronize_all_tsc();
>  
> -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> -        if (ret < 0) {
> -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> -            abort();
> -        }
> -        s->clock = data.clock;
> -
> +        s->clock = kvm_get_clock();
>          /*
>           * If the VM is stopped, declare the clock state valid to
>           * avoid re-reading it on next vmsave (which would return
> @@ -149,25 +181,72 @@
>  {
>      KVMClockState *s = KVM_CLOCK(dev);
>  
> +    /*
> +     * On machine types that support reliable KVM_GET_CLOCK,
> +     * if host kernel does provide reliable KVM_GET_CLOCK,
> +     * set src_use_reliable_get_clock=true so that destination
> +     * avoids reading kvmclock from memory.
> +     */
> +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> +        s->src_use_reliable_get_clock = true;
> +    }
> +
>      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
>  }
>  
> +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    return s->mach_use_reliable_get_clock;
> +}

I didn't review kvmclock_vm_state_change() and
kvm_has_adjust_clock_stable(), but the rules to migrate
src_use_reliable_get_clock look good now, and seem to match the
table above.

> +
> +static const VMStateDescription kvmclock_reliable_get_clock = {
> +    .name = "kvmclock/src_use_reliable_get_clock",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = kvmclock_src_use_reliable_get_clock,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
> +static void kvmclock_pre_save(void *opaque)
> +{
> +    KVMClockState *s = opaque;
> +
> +    s->clock = kvm_get_clock();
> +}
> +
>  static const VMStateDescription kvmclock_vmsd = {
>      .name = "kvmclock",
>      .version_id = 1,
>      .minimum_version_id = 1,
> +    .pre_save = kvmclock_pre_save,
>      .fields = (VMStateField[]) {
>          VMSTATE_UINT64(clock, KVMClockState),
>          VMSTATE_END_OF_LIST()
> +    },
> +    .subsections = (const VMStateDescription * []) {
> +        &kvmclock_reliable_get_clock,
> +        NULL
>      }
>  };
>  
> +static Property kvmclock_properties[] = {
> +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> +                      mach_use_reliable_get_clock, true),
> +    DEFINE_PROP_END_OF_LIST(),
> +};
> +
>  static void kvmclock_class_init(ObjectClass *klass, void *data)
>  {
>      DeviceClass *dc = DEVICE_CLASS(klass);
>  
>      dc->realize = kvmclock_realize;
>      dc->vmsd = &kvmclock_vmsd;
> +    dc->props = kvmclock_properties;
>  }
>  
>  static const TypeInfo kvmclock_info = {
> Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> @@ -370,6 +370,11 @@
>  #define PC_COMPAT_2_7 \
>      HW_COMPAT_2_7 \
>      {\
> +        .driver   = "kvmclock",\
> +        .property = "x-mach-use-reliable-get-clock",\
> +        .value    = "off",\
> +    },\
> +    {\
>          .driver   = TYPE_X86_CPU,\
>          .property = "l3-cache",\
>          .value    = "off",\
> Index: qemu-mig-advance-clock/target-i386/kvm.c
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> @@ -117,6 +117,13 @@
>      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
>  }
>  
> +bool kvm_has_adjust_clock_stable(void)
> +{
> +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> +
> +    return (ret == KVM_CLOCK_TSC_STABLE);
> +}
> +
>  bool kvm_allows_irq0_override(void)
>  {
>      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> ===================================================================
> --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> @@ -17,6 +17,7 @@
>  
>  bool kvm_allows_irq0_override(void);
>  bool kvm_has_smm(void);
> +bool kvm_has_adjust_clock_stable(void);
>  void kvm_synchronize_all_tsc(void);
>  void kvm_arch_reset_vcpu(X86CPU *cs);
>  void kvm_arch_do_init_vcpu(X86CPU *cs);

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 18:16         ` [Qemu-devel] " Eduardo Habkost
@ 2016-11-17 19:15           ` Marcelo Tosatti
  -1 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 19:15 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 04:16:09PM -0200, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 03:15:35PM -0200, Marcelo Tosatti wrote:
> > On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > that moment.
> > > > 
> > > > For new machine types, use this value rather than reading 
> > > > from guest memory.
> > > > 
> > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > (when max_downtime == 5s).
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > 
> > > > ---
> > > >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> > > >  include/hw/i386/pc.h   |    5 ++
> > > >  target-i386/kvm.c      |    7 +++
> > > >  target-i386/kvm_i386.h |    1 
> > > >  4 files changed, 107 insertions(+), 14 deletions(-)
> > > > 
> > > > v2: 
> > > > - improve variable names (Juan)
> > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > 
> > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > ===================================================================
> > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > > > @@ -36,6 +36,12 @@
> > > >  
> > > >      uint64_t clock;
> > > >      bool clock_valid;
> > > > +
> > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > +    bool mach_use_reliable_get_clock;
> > > > +
> > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > +    bool src_use_reliable_get_clock;
> > > >  } KVMClockState;
> > > >  
> > > >  struct pvclock_vcpu_time_info {
> > > > @@ -81,6 +87,19 @@
> > > >      return nsec + time.system_time;
> > > >  }
> > > >  
> > > > +static uint64_t kvm_get_clock(void)
> > > > +{
> > > > +    struct kvm_clock_data data;
> > > > +    int ret;
> > > > +
> > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > +    if (ret < 0) {
> > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > +                abort();
> > > > +    }
> > > > +    return data.clock;
> > > > +}
> > > > +
> > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > >                                       RunState state)
> > > >  {
> > > > @@ -91,15 +110,37 @@
> > > >  
> > > >      if (running) {
> > > >          struct kvm_clock_data data = {};
> > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > +        uint64_t pvclock_via_mem = 0;
> > > >  
> > > > -        s->clock_valid = false;
> > > > +        /* local (running VM) restore */
> > > > +        if (s->clock_valid) {
> > > > +            /*
> > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > +             * read kvmclock value from memory
> > > > +             */
> > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > +            }
> > > > +        /* migration/savevm/init restore */
> > > > +        } else {
> > > > +            /*
> > > > +             * use s->clock in case machine uses reliable
> > > > +             * get clock and source host supported
> > > > +             * reliable get clock
> > > > +             */
> > > > +            if (!(s->mach_use_reliable_get_clock &&
> > > > +                  s->src_use_reliable_get_clock)) {
> > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > +            }
> > > 
> > > The s->mach_use_reliable_get_clock check seems redundant.
> > > src_use_reliable_get_clock is set only if
> > > mach_use_reliable_get_clock is true.
> > 
> > Done.
> > 
> > > > +        }
> > > >  
> > > > -        /* We can't rely on the migrated clock value, just discard it */
> > > > -        if (time_at_migration) {
> > > > -            s->clock = time_at_migration;
> > > > +        /* We can't rely on the saved clock value, just discard it */
> > > > +        if (pvclock_via_mem) {
> > > > +            s->clock = pvclock_via_mem;
> > > >          }
> > > >  
> > > > +        s->clock_valid = false;
> > > > +
> > > >          data.clock = s->clock;
> > > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > > >          if (ret < 0) {
> > > > @@ -120,8 +161,6 @@
> > > >              }
> > > >          }
> > > >      } else {
> > > > -        struct kvm_clock_data data;
> > > > -        int ret;
> > > >  
> > > >          if (s->clock_valid) {
> > > >              return;
> > > > @@ -129,13 +168,7 @@
> > > >  
> > > >          kvm_synchronize_all_tsc();
> > > >  
> > > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > -        if (ret < 0) {
> > > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > -            abort();
> > > > -        }
> > > > -        s->clock = data.clock;
> > > > -
> > > > +        s->clock = kvm_get_clock();
> > > >          /*
> > > >           * If the VM is stopped, declare the clock state valid to
> > > >           * avoid re-reading it on next vmsave (which would return
> > > > @@ -152,22 +185,69 @@
> > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > >  }
> > > >  
> > > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > +{
> > > > +    KVMClockState *s = opaque;
> > > > +
> > > > +    /*
> > > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > +     * set src_use_reliable_get_clock=true so that destination
> > > > +     * avoids reading kvmclock from memory.
> > > > +     */
> > > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > +        s->src_use_reliable_get_clock = true;
> > > > +    }
> > > 
> > > It feels fragile to change device state inside the .needed
> > > function. Better to initialize src_use_reliable_get_clock on
> > > kvmclock_realize()?
> > > 
> > > What exactly ensures src_use_reliable_get_clock is correctly
> > > initialized on the migration destination as well?
> > 
> > Its initialized to false (because its part of device state).
> 
> Do you mean it is always going to be false on the destination
> host? What if .needed is called by other code? .needed shouldn't
> have any side-effects.

I don't think needed is called by any other code.

> Also, the difference between the variables on the destination and
> the source is very confusing, see other comments below:

Yes it is: its because you don't know when is what. But if you consider
separate events:

    initialization, incoming migration, vmstop, vmcontinue.

And the values of the variables for each case, then it becomes 
clear.

If anyone has a suggestion to improve clarify, its welcome.

> > > 
> > > > +
> > > > +    return s->mach_use_reliable_get_clock;
> > > 
> > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > simply skip the section?
> > > 
> > > It looks like mach_use_reliable_get_clock and
> > > src_use_reliable_get_clock could become a single field:
> > > 
> > > * use_reliable_get_clock ("x-use-reliable-get-clock" property)
> > >   set to true by default (set on DEFINE_PROP_BOOL parameter)
> > > * "x-use-reliable-get-clock" set to false by default on older
> > >   machine-types
> > > * use_reliable_get_clock forced to false on kvmclock_realize() if
> > >   !kvm_has_adjust_clock_stable()
> > > * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock
> > 
> > No because use_reliable_get_clock can be false because the destination host
> > does not support kvm_has_adjust_clock_stable(), but the source host
> > supports it (so we want to use the value in the first incoming
> > migration).
> > 
> 
> Are you talking about the value of the fields on the source host,
> or on the destination host? In this case, you mean that
> use_reliable_get_clock should be false on the destination, right?

Yes, destination.

> > So the variable indicates whether the source host supported
> > migration, not whether the destination supports it.
> 
> Good, that's the whole point of the field, isn't it? I will try
> to summarize all cases below. Please correct me if something is
> incorrect.

Yes. So in this case, the cleanup/simplification to use a single
variable is not possible (or maybe it is, but it just complicates
things).

> --------+------------+---------------+-----------+------------+
>         | kvm_has_adj_clock_stable() | use_reliable_get_clock |
> machine | src        | dst           | src       | dst        |
> --------+------------+---------------+-----------+------------+
> pc-2.8  | false      | false         | false     | false      |
> pc-2.8  | false      | true          | false     | false (*)  |
> pc-2.8  | true       | false         | true      | true (**)  |
> pc-2.8  | true       | true          | true      | true       |
> --------+------------+---------------+-----------+------------+
> pc-2.7  | (any)      | (any)         | false     | false      |
> --------+------------+---------------+-----------+------------+
> 
> (*) I guess this is where things would break if we skip the
>     subsection (and use a single field). If the subsection is
>     skipped, use_reliable_get_clock would be set to true on the
>     destination. (See my comment below on the new patch you sent)

Well, perhaps you can massage things so that it works 
without sending subsection (that is):

    * Send subsection only in case value is true.
    * Do not send subsection in case value is false (then treat
      non presence of subsection and mach_reliable_clock=true 
      to mean source did not support kvm_has_reliable_clock).

Once you agree on "send a subsection for both cases", then you 
handle it.

The code is there, its simple, readable, so...

(what you can't do is to unify src_use_get_reliable_clock (remote 
information), and local information). Because you need 
to know both things to be able to handle the 4 cases properly on 
incoming migration.


> (**) Destination can still do the right thing here, right?
> 
> 
> 
> > 
> [...]
> > 
> > How about the following:
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > @@ -36,6 +36,12 @@
> >  
> >      uint64_t clock;
> >      bool clock_valid;
> > +
> > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > +    bool mach_use_reliable_get_clock;
> > +
> > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > +    bool src_use_reliable_get_clock;
> >  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -81,6 +87,19 @@
> >      return nsec + time.system_time;
> >  }
> >  
> > +static uint64_t kvm_get_clock(void)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +                abort();
> > +    }
> > +    return data.clock;
> > +}
> > +
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,15 +110,36 @@
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > +        uint64_t pvclock_via_mem = 0;
> >  
> > -        s->clock_valid = false;
> > +        /* local (running VM) restore */
> > +        if (s->clock_valid) {
> > +            /*
> > +             * if host does not support reliable KVM_GET_CLOCK,
> > +             * read kvmclock value from memory
> > +             */
> > +            if (!kvm_has_adjust_clock_stable()) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        /* migration/savevm/init restore */
> > +        } else {
> > +            /*
> > +             * use s->clock in case machine uses reliable
> > +             * get clock and source host supported
> > +             * reliable get clock
> > +             */
> > +            if (!s->src_use_reliable_get_clock) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        }
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* We can't rely on the saved clock value, just discard it */
> > +        if (pvclock_via_mem) {
> > +            s->clock = pvclock_via_mem;
> >          }
> >  
> > +        s->clock_valid = false;
> > +
> >          data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> > @@ -120,8 +160,6 @@
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> >          if (s->clock_valid) {
> >              return;
> > @@ -129,13 +167,7 @@
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        s->clock = kvm_get_clock();
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -149,25 +181,72 @@
> >  {
> >      KVMClockState *s = KVM_CLOCK(dev);
> >  
> > +    /*
> > +     * On machine types that support reliable KVM_GET_CLOCK,
> > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > +     * set src_use_reliable_get_clock=true so that destination
> > +     * avoids reading kvmclock from memory.
> > +     */
> > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > +        s->src_use_reliable_get_clock = true;
> > +    }
> > +
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    return s->mach_use_reliable_get_clock;
> > +}
> 
> I didn't review kvmclock_vm_state_change() and
> kvm_has_adjust_clock_stable(), but the rules to migrate
> src_use_reliable_get_clock look good now, and seem to match the
> table above.

Ok... Paolo, Radim?

Thanks Eduardo.

> 
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/src_use_reliable_get_clock",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    s->clock = kvm_get_clock();
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> > +                      mach_use_reliable_get_clock, true),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> > @@ -370,6 +370,11 @@
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "x-mach-use-reliable-get-clock",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > Index: qemu-mig-advance-clock/target-i386/kvm.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> > @@ -117,6 +117,13 @@
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 19:15           ` Marcelo Tosatti
  0 siblings, 0 replies; 22+ messages in thread
From: Marcelo Tosatti @ 2016-11-17 19:15 UTC (permalink / raw)
  To: Eduardo Habkost
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 04:16:09PM -0200, Eduardo Habkost wrote:
> On Thu, Nov 17, 2016 at 03:15:35PM -0200, Marcelo Tosatti wrote:
> > On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > that moment.
> > > > 
> > > > For new machine types, use this value rather than reading 
> > > > from guest memory.
> > > > 
> > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > (when max_downtime == 5s).
> > > > 
> > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > 
> > > > ---
> > > >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> > > >  include/hw/i386/pc.h   |    5 ++
> > > >  target-i386/kvm.c      |    7 +++
> > > >  target-i386/kvm_i386.h |    1 
> > > >  4 files changed, 107 insertions(+), 14 deletions(-)
> > > > 
> > > > v2: 
> > > > - improve variable names (Juan)
> > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > 
> > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > ===================================================================
> > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > > > @@ -36,6 +36,12 @@
> > > >  
> > > >      uint64_t clock;
> > > >      bool clock_valid;
> > > > +
> > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > +    bool mach_use_reliable_get_clock;
> > > > +
> > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > +    bool src_use_reliable_get_clock;
> > > >  } KVMClockState;
> > > >  
> > > >  struct pvclock_vcpu_time_info {
> > > > @@ -81,6 +87,19 @@
> > > >      return nsec + time.system_time;
> > > >  }
> > > >  
> > > > +static uint64_t kvm_get_clock(void)
> > > > +{
> > > > +    struct kvm_clock_data data;
> > > > +    int ret;
> > > > +
> > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > +    if (ret < 0) {
> > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > +                abort();
> > > > +    }
> > > > +    return data.clock;
> > > > +}
> > > > +
> > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > >                                       RunState state)
> > > >  {
> > > > @@ -91,15 +110,37 @@
> > > >  
> > > >      if (running) {
> > > >          struct kvm_clock_data data = {};
> > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > +        uint64_t pvclock_via_mem = 0;
> > > >  
> > > > -        s->clock_valid = false;
> > > > +        /* local (running VM) restore */
> > > > +        if (s->clock_valid) {
> > > > +            /*
> > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > +             * read kvmclock value from memory
> > > > +             */
> > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > +            }
> > > > +        /* migration/savevm/init restore */
> > > > +        } else {
> > > > +            /*
> > > > +             * use s->clock in case machine uses reliable
> > > > +             * get clock and source host supported
> > > > +             * reliable get clock
> > > > +             */
> > > > +            if (!(s->mach_use_reliable_get_clock &&
> > > > +                  s->src_use_reliable_get_clock)) {
> > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > +            }
> > > 
> > > The s->mach_use_reliable_get_clock check seems redundant.
> > > src_use_reliable_get_clock is set only if
> > > mach_use_reliable_get_clock is true.
> > 
> > Done.
> > 
> > > > +        }
> > > >  
> > > > -        /* We can't rely on the migrated clock value, just discard it */
> > > > -        if (time_at_migration) {
> > > > -            s->clock = time_at_migration;
> > > > +        /* We can't rely on the saved clock value, just discard it */
> > > > +        if (pvclock_via_mem) {
> > > > +            s->clock = pvclock_via_mem;
> > > >          }
> > > >  
> > > > +        s->clock_valid = false;
> > > > +
> > > >          data.clock = s->clock;
> > > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > > >          if (ret < 0) {
> > > > @@ -120,8 +161,6 @@
> > > >              }
> > > >          }
> > > >      } else {
> > > > -        struct kvm_clock_data data;
> > > > -        int ret;
> > > >  
> > > >          if (s->clock_valid) {
> > > >              return;
> > > > @@ -129,13 +168,7 @@
> > > >  
> > > >          kvm_synchronize_all_tsc();
> > > >  
> > > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > -        if (ret < 0) {
> > > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > -            abort();
> > > > -        }
> > > > -        s->clock = data.clock;
> > > > -
> > > > +        s->clock = kvm_get_clock();
> > > >          /*
> > > >           * If the VM is stopped, declare the clock state valid to
> > > >           * avoid re-reading it on next vmsave (which would return
> > > > @@ -152,22 +185,69 @@
> > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > >  }
> > > >  
> > > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > +{
> > > > +    KVMClockState *s = opaque;
> > > > +
> > > > +    /*
> > > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > +     * set src_use_reliable_get_clock=true so that destination
> > > > +     * avoids reading kvmclock from memory.
> > > > +     */
> > > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > +        s->src_use_reliable_get_clock = true;
> > > > +    }
> > > 
> > > It feels fragile to change device state inside the .needed
> > > function. Better to initialize src_use_reliable_get_clock on
> > > kvmclock_realize()?
> > > 
> > > What exactly ensures src_use_reliable_get_clock is correctly
> > > initialized on the migration destination as well?
> > 
> > Its initialized to false (because its part of device state).
> 
> Do you mean it is always going to be false on the destination
> host? What if .needed is called by other code? .needed shouldn't
> have any side-effects.

I don't think needed is called by any other code.

> Also, the difference between the variables on the destination and
> the source is very confusing, see other comments below:

Yes it is: its because you don't know when is what. But if you consider
separate events:

    initialization, incoming migration, vmstop, vmcontinue.

And the values of the variables for each case, then it becomes 
clear.

If anyone has a suggestion to improve clarify, its welcome.

> > > 
> > > > +
> > > > +    return s->mach_use_reliable_get_clock;
> > > 
> > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > simply skip the section?
> > > 
> > > It looks like mach_use_reliable_get_clock and
> > > src_use_reliable_get_clock could become a single field:
> > > 
> > > * use_reliable_get_clock ("x-use-reliable-get-clock" property)
> > >   set to true by default (set on DEFINE_PROP_BOOL parameter)
> > > * "x-use-reliable-get-clock" set to false by default on older
> > >   machine-types
> > > * use_reliable_get_clock forced to false on kvmclock_realize() if
> > >   !kvm_has_adjust_clock_stable()
> > > * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock
> > 
> > No because use_reliable_get_clock can be false because the destination host
> > does not support kvm_has_adjust_clock_stable(), but the source host
> > supports it (so we want to use the value in the first incoming
> > migration).
> > 
> 
> Are you talking about the value of the fields on the source host,
> or on the destination host? In this case, you mean that
> use_reliable_get_clock should be false on the destination, right?

Yes, destination.

> > So the variable indicates whether the source host supported
> > migration, not whether the destination supports it.
> 
> Good, that's the whole point of the field, isn't it? I will try
> to summarize all cases below. Please correct me if something is
> incorrect.

Yes. So in this case, the cleanup/simplification to use a single
variable is not possible (or maybe it is, but it just complicates
things).

> --------+------------+---------------+-----------+------------+
>         | kvm_has_adj_clock_stable() | use_reliable_get_clock |
> machine | src        | dst           | src       | dst        |
> --------+------------+---------------+-----------+------------+
> pc-2.8  | false      | false         | false     | false      |
> pc-2.8  | false      | true          | false     | false (*)  |
> pc-2.8  | true       | false         | true      | true (**)  |
> pc-2.8  | true       | true          | true      | true       |
> --------+------------+---------------+-----------+------------+
> pc-2.7  | (any)      | (any)         | false     | false      |
> --------+------------+---------------+-----------+------------+
> 
> (*) I guess this is where things would break if we skip the
>     subsection (and use a single field). If the subsection is
>     skipped, use_reliable_get_clock would be set to true on the
>     destination. (See my comment below on the new patch you sent)

Well, perhaps you can massage things so that it works 
without sending subsection (that is):

    * Send subsection only in case value is true.
    * Do not send subsection in case value is false (then treat
      non presence of subsection and mach_reliable_clock=true 
      to mean source did not support kvm_has_reliable_clock).

Once you agree on "send a subsection for both cases", then you 
handle it.

The code is there, its simple, readable, so...

(what you can't do is to unify src_use_get_reliable_clock (remote 
information), and local information). Because you need 
to know both things to be able to handle the 4 cases properly on 
incoming migration.


> (**) Destination can still do the right thing here, right?
> 
> 
> 
> > 
> [...]
> > 
> > How about the following:
> > 
> > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-17 15:11:51.372048640 -0200
> > @@ -36,6 +36,12 @@
> >  
> >      uint64_t clock;
> >      bool clock_valid;
> > +
> > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > +    bool mach_use_reliable_get_clock;
> > +
> > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > +    bool src_use_reliable_get_clock;
> >  } KVMClockState;
> >  
> >  struct pvclock_vcpu_time_info {
> > @@ -81,6 +87,19 @@
> >      return nsec + time.system_time;
> >  }
> >  
> > +static uint64_t kvm_get_clock(void)
> > +{
> > +    struct kvm_clock_data data;
> > +    int ret;
> > +
> > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > +                abort();
> > +    }
> > +    return data.clock;
> > +}
> > +
> >  static void kvmclock_vm_state_change(void *opaque, int running,
> >                                       RunState state)
> >  {
> > @@ -91,15 +110,36 @@
> >  
> >      if (running) {
> >          struct kvm_clock_data data = {};
> > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > +        uint64_t pvclock_via_mem = 0;
> >  
> > -        s->clock_valid = false;
> > +        /* local (running VM) restore */
> > +        if (s->clock_valid) {
> > +            /*
> > +             * if host does not support reliable KVM_GET_CLOCK,
> > +             * read kvmclock value from memory
> > +             */
> > +            if (!kvm_has_adjust_clock_stable()) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        /* migration/savevm/init restore */
> > +        } else {
> > +            /*
> > +             * use s->clock in case machine uses reliable
> > +             * get clock and source host supported
> > +             * reliable get clock
> > +             */
> > +            if (!s->src_use_reliable_get_clock) {
> > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > +            }
> > +        }
> >  
> > -        /* We can't rely on the migrated clock value, just discard it */
> > -        if (time_at_migration) {
> > -            s->clock = time_at_migration;
> > +        /* We can't rely on the saved clock value, just discard it */
> > +        if (pvclock_via_mem) {
> > +            s->clock = pvclock_via_mem;
> >          }
> >  
> > +        s->clock_valid = false;
> > +
> >          data.clock = s->clock;
> >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> >          if (ret < 0) {
> > @@ -120,8 +160,6 @@
> >              }
> >          }
> >      } else {
> > -        struct kvm_clock_data data;
> > -        int ret;
> >  
> >          if (s->clock_valid) {
> >              return;
> > @@ -129,13 +167,7 @@
> >  
> >          kvm_synchronize_all_tsc();
> >  
> > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > -        if (ret < 0) {
> > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > -            abort();
> > -        }
> > -        s->clock = data.clock;
> > -
> > +        s->clock = kvm_get_clock();
> >          /*
> >           * If the VM is stopped, declare the clock state valid to
> >           * avoid re-reading it on next vmsave (which would return
> > @@ -149,25 +181,72 @@
> >  {
> >      KVMClockState *s = KVM_CLOCK(dev);
> >  
> > +    /*
> > +     * On machine types that support reliable KVM_GET_CLOCK,
> > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > +     * set src_use_reliable_get_clock=true so that destination
> > +     * avoids reading kvmclock from memory.
> > +     */
> > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > +        s->src_use_reliable_get_clock = true;
> > +    }
> > +
> >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> >  }
> >  
> > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    return s->mach_use_reliable_get_clock;
> > +}
> 
> I didn't review kvmclock_vm_state_change() and
> kvm_has_adjust_clock_stable(), but the rules to migrate
> src_use_reliable_get_clock look good now, and seem to match the
> table above.

Ok... Paolo, Radim?

Thanks Eduardo.

> 
> > +
> > +static const VMStateDescription kvmclock_reliable_get_clock = {
> > +    .name = "kvmclock/src_use_reliable_get_clock",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = kvmclock_src_use_reliable_get_clock,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_BOOL(src_use_reliable_get_clock, KVMClockState),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> > +static void kvmclock_pre_save(void *opaque)
> > +{
> > +    KVMClockState *s = opaque;
> > +
> > +    s->clock = kvm_get_clock();
> > +}
> > +
> >  static const VMStateDescription kvmclock_vmsd = {
> >      .name = "kvmclock",
> >      .version_id = 1,
> >      .minimum_version_id = 1,
> > +    .pre_save = kvmclock_pre_save,
> >      .fields = (VMStateField[]) {
> >          VMSTATE_UINT64(clock, KVMClockState),
> >          VMSTATE_END_OF_LIST()
> > +    },
> > +    .subsections = (const VMStateDescription * []) {
> > +        &kvmclock_reliable_get_clock,
> > +        NULL
> >      }
> >  };
> >  
> > +static Property kvmclock_properties[] = {
> > +    DEFINE_PROP_BOOL("x-mach-use-reliable-get-clock", KVMClockState,
> > +                      mach_use_reliable_get_clock, true),
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> >  static void kvmclock_class_init(ObjectClass *klass, void *data)
> >  {
> >      DeviceClass *dc = DEVICE_CLASS(klass);
> >  
> >      dc->realize = kvmclock_realize;
> >      dc->vmsd = &kvmclock_vmsd;
> > +    dc->props = kvmclock_properties;
> >  }
> >  
> >  static const TypeInfo kvmclock_info = {
> > Index: qemu-mig-advance-clock/include/hw/i386/pc.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/include/hw/i386/pc.h	2016-11-17 15:07:11.220632761 -0200
> > +++ qemu-mig-advance-clock/include/hw/i386/pc.h	2016-11-17 15:08:58.096791416 -0200
> > @@ -370,6 +370,11 @@
> >  #define PC_COMPAT_2_7 \
> >      HW_COMPAT_2_7 \
> >      {\
> > +        .driver   = "kvmclock",\
> > +        .property = "x-mach-use-reliable-get-clock",\
> > +        .value    = "off",\
> > +    },\
> > +    {\
> >          .driver   = TYPE_X86_CPU,\
> >          .property = "l3-cache",\
> >          .value    = "off",\
> > Index: qemu-mig-advance-clock/target-i386/kvm.c
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm.c	2016-11-17 15:07:11.221632762 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm.c	2016-11-17 15:07:15.867639659 -0200
> > @@ -117,6 +117,13 @@
> >      return kvm_check_extension(kvm_state, KVM_CAP_X86_SMM);
> >  }
> >  
> > +bool kvm_has_adjust_clock_stable(void)
> > +{
> > +    int ret = kvm_check_extension(kvm_state, KVM_CAP_ADJUST_CLOCK);
> > +
> > +    return (ret == KVM_CLOCK_TSC_STABLE);
> > +}
> > +
> >  bool kvm_allows_irq0_override(void)
> >  {
> >      return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
> > Index: qemu-mig-advance-clock/target-i386/kvm_i386.h
> > ===================================================================
> > --- qemu-mig-advance-clock.orig/target-i386/kvm_i386.h	2016-11-17 15:07:11.222632764 -0200
> > +++ qemu-mig-advance-clock/target-i386/kvm_i386.h	2016-11-17 15:07:15.867639659 -0200
> > @@ -17,6 +17,7 @@
> >  
> >  bool kvm_allows_irq0_override(void);
> >  bool kvm_has_smm(void);
> > +bool kvm_has_adjust_clock_stable(void);
> >  void kvm_synchronize_all_tsc(void);
> >  void kvm_arch_reset_vcpu(X86CPU *cs);
> >  void kvm_arch_do_init_vcpu(X86CPU *cs);
> 
> -- 
> Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
  2016-11-17 19:15           ` [Qemu-devel] " Marcelo Tosatti
@ 2016-11-17 19:59             ` Eduardo Habkost
  -1 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 19:59 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 05:15:13PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 04:16:09PM -0200, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 03:15:35PM -0200, Marcelo Tosatti wrote:
> > > On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> > > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > that moment.
> > > > > 
> > > > > For new machine types, use this value rather than reading 
> > > > > from guest memory.
> > > > > 
> > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > (when max_downtime == 5s).
> > > > > 
> > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > 
> > > > > ---
> > > > >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > >  include/hw/i386/pc.h   |    5 ++
> > > > >  target-i386/kvm.c      |    7 +++
> > > > >  target-i386/kvm_i386.h |    1 
> > > > >  4 files changed, 107 insertions(+), 14 deletions(-)
> > > > > 
> > > > > v2: 
> > > > > - improve variable names (Juan)
> > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > 
> > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > ===================================================================
> > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > > > > @@ -36,6 +36,12 @@
> > > > >  
> > > > >      uint64_t clock;
> > > > >      bool clock_valid;
> > > > > +
> > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > +    bool mach_use_reliable_get_clock;
> > > > > +
> > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > +    bool src_use_reliable_get_clock;
> > > > >  } KVMClockState;
> > > > >  
> > > > >  struct pvclock_vcpu_time_info {
> > > > > @@ -81,6 +87,19 @@
> > > > >      return nsec + time.system_time;
> > > > >  }
> > > > >  
> > > > > +static uint64_t kvm_get_clock(void)
> > > > > +{
> > > > > +    struct kvm_clock_data data;
> > > > > +    int ret;
> > > > > +
> > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > +    if (ret < 0) {
> > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > +                abort();
> > > > > +    }
> > > > > +    return data.clock;
> > > > > +}
> > > > > +
> > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > >                                       RunState state)
> > > > >  {
> > > > > @@ -91,15 +110,37 @@
> > > > >  
> > > > >      if (running) {
> > > > >          struct kvm_clock_data data = {};
> > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > +        uint64_t pvclock_via_mem = 0;
> > > > >  
> > > > > -        s->clock_valid = false;
> > > > > +        /* local (running VM) restore */
> > > > > +        if (s->clock_valid) {
> > > > > +            /*
> > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > +             * read kvmclock value from memory
> > > > > +             */
> > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > +            }
> > > > > +        /* migration/savevm/init restore */
> > > > > +        } else {
> > > > > +            /*
> > > > > +             * use s->clock in case machine uses reliable
> > > > > +             * get clock and source host supported
> > > > > +             * reliable get clock
> > > > > +             */
> > > > > +            if (!(s->mach_use_reliable_get_clock &&
> > > > > +                  s->src_use_reliable_get_clock)) {
> > > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > +            }
> > > > 
> > > > The s->mach_use_reliable_get_clock check seems redundant.
> > > > src_use_reliable_get_clock is set only if
> > > > mach_use_reliable_get_clock is true.
> > > 
> > > Done.
> > > 
> > > > > +        }
> > > > >  
> > > > > -        /* We can't rely on the migrated clock value, just discard it */
> > > > > -        if (time_at_migration) {
> > > > > -            s->clock = time_at_migration;
> > > > > +        /* We can't rely on the saved clock value, just discard it */
> > > > > +        if (pvclock_via_mem) {
> > > > > +            s->clock = pvclock_via_mem;
> > > > >          }
> > > > >  
> > > > > +        s->clock_valid = false;
> > > > > +
> > > > >          data.clock = s->clock;
> > > > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > > > >          if (ret < 0) {
> > > > > @@ -120,8 +161,6 @@
> > > > >              }
> > > > >          }
> > > > >      } else {
> > > > > -        struct kvm_clock_data data;
> > > > > -        int ret;
> > > > >  
> > > > >          if (s->clock_valid) {
> > > > >              return;
> > > > > @@ -129,13 +168,7 @@
> > > > >  
> > > > >          kvm_synchronize_all_tsc();
> > > > >  
> > > > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > -        if (ret < 0) {
> > > > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > -            abort();
> > > > > -        }
> > > > > -        s->clock = data.clock;
> > > > > -
> > > > > +        s->clock = kvm_get_clock();
> > > > >          /*
> > > > >           * If the VM is stopped, declare the clock state valid to
> > > > >           * avoid re-reading it on next vmsave (which would return
> > > > > @@ -152,22 +185,69 @@
> > > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > > >  }
> > > > >  
> > > > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > > +{
> > > > > +    KVMClockState *s = opaque;
> > > > > +
> > > > > +    /*
> > > > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > > +     * set src_use_reliable_get_clock=true so that destination
> > > > > +     * avoids reading kvmclock from memory.
> > > > > +     */
> > > > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > > +        s->src_use_reliable_get_clock = true;
> > > > > +    }
> > > > 
> > > > It feels fragile to change device state inside the .needed
> > > > function. Better to initialize src_use_reliable_get_clock on
> > > > kvmclock_realize()?
> > > > 
> > > > What exactly ensures src_use_reliable_get_clock is correctly
> > > > initialized on the migration destination as well?
> > > 
> > > Its initialized to false (because its part of device state).
> > 
> > Do you mean it is always going to be false on the destination
> > host? What if .needed is called by other code? .needed shouldn't
> > have any side-effects.
> 
> I don't think needed is called by any other code.

I don't know if it is called. But it may be called by other code,
so it can't have side-effects.

> 
> > Also, the difference between the variables on the destination and
> > the source is very confusing, see other comments below:
> 
> Yes it is: its because you don't know when is what. But if you consider
> separate events:
> 
>     initialization, incoming migration, vmstop, vmcontinue.
> 
> And the values of the variables for each case, then it becomes 
> clear.
> 
> If anyone has a suggestion to improve clarify, its welcome.

I think it was confusing when src_use_reliable_get_clock was
initialized on .needed, but now it makes sense to me because
mach_use_reliable_get_clock and src_use_reliable_get_clock have
the same value on source and destination.

> 
> > > > 
> > > > > +
> > > > > +    return s->mach_use_reliable_get_clock;
> > > > 
> > > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > > simply skip the section?
> > > > 
> > > > It looks like mach_use_reliable_get_clock and
> > > > src_use_reliable_get_clock could become a single field:
> > > > 
> > > > * use_reliable_get_clock ("x-use-reliable-get-clock" property)
> > > >   set to true by default (set on DEFINE_PROP_BOOL parameter)
> > > > * "x-use-reliable-get-clock" set to false by default on older
> > > >   machine-types
> > > > * use_reliable_get_clock forced to false on kvmclock_realize() if
> > > >   !kvm_has_adjust_clock_stable()
> > > > * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock
> > > 
> > > No because use_reliable_get_clock can be false because the destination host
> > > does not support kvm_has_adjust_clock_stable(), but the source host
> > > supports it (so we want to use the value in the first incoming
> > > migration).
> > > 
> > 
> > Are you talking about the value of the fields on the source host,
> > or on the destination host? In this case, you mean that
> > use_reliable_get_clock should be false on the destination, right?
> 
> Yes, destination.
> 
> > > So the variable indicates whether the source host supported
> > > migration, not whether the destination supports it.
> > 
> > Good, that's the whole point of the field, isn't it? I will try
> > to summarize all cases below. Please correct me if something is
> > incorrect.
> 
> Yes. So in this case, the cleanup/simplification to use a single
> variable is not possible (or maybe it is, but it just complicates
> things).
> 
> > --------+------------+---------------+-----------+------------+
> >         | kvm_has_adj_clock_stable() | use_reliable_get_clock |
> > machine | src        | dst           | src       | dst        |
> > --------+------------+---------------+-----------+------------+
> > pc-2.8  | false      | false         | false     | false      |
> > pc-2.8  | false      | true          | false     | false (*)  |
> > pc-2.8  | true       | false         | true      | true (**)  |
> > pc-2.8  | true       | true          | true      | true       |
> > --------+------------+---------------+-----------+------------+
> > pc-2.7  | (any)      | (any)         | false     | false      |
> > --------+------------+---------------+-----------+------------+
> > 
> > (*) I guess this is where things would break if we skip the
> >     subsection (and use a single field). If the subsection is
> >     skipped, use_reliable_get_clock would be set to true on the
> >     destination. (See my comment below on the new patch you sent)
> 
> Well, perhaps you can massage things so that it works 
> without sending subsection (that is):
> 
>     * Send subsection only in case value is true.
>     * Do not send subsection in case value is false (then treat
>       non presence of subsection and mach_reliable_clock=true 
>       to mean source did not support kvm_has_reliable_clock).

Yes, it would be more complex. Always sending it if
mach_use_reliable_get_clock=true (like you proposed) is simpler.

> 
> Once you agree on "send a subsection for both cases", then you 
> handle it.
> 
> The code is there, its simple, readable, so...
> 
> (what you can't do is to unify src_use_get_reliable_clock (remote 
> information), and local information). Because you need 
> to know both things to be able to handle the 4 cases properly on 
> incoming migration.

OK.

> [...]

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Qemu-devel] [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration
@ 2016-11-17 19:59             ` Eduardo Habkost
  0 siblings, 0 replies; 22+ messages in thread
From: Eduardo Habkost @ 2016-11-17 19:59 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm, qemu-devel, Dr. David Alan Gilbert, Paolo Bonzini,
	Juan Quintela, Radim Krcmar

On Thu, Nov 17, 2016 at 05:15:13PM -0200, Marcelo Tosatti wrote:
> On Thu, Nov 17, 2016 at 04:16:09PM -0200, Eduardo Habkost wrote:
> > On Thu, Nov 17, 2016 at 03:15:35PM -0200, Marcelo Tosatti wrote:
> > > On Thu, Nov 17, 2016 at 12:11:26PM -0200, Eduardo Habkost wrote:
> > > > On Thu, Nov 17, 2016 at 11:24:13AM -0200, Marcelo Tosatti wrote:
> > > > > Check for KVM_CAP_ADJUST_CLOCK capability KVM_CLOCK_TSC_STABLE, which
> > > > > indicates that KVM_GET_CLOCK returns a value as seen by the guest at
> > > > > that moment.
> > > > > 
> > > > > For new machine types, use this value rather than reading 
> > > > > from guest memory.
> > > > > 
> > > > > This reduces kvmclock difference on migration from 5s to 0.1s
> > > > > (when max_downtime == 5s).
> > > > > 
> > > > > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> > > > > 
> > > > > ---
> > > > >  hw/i386/kvm/clock.c    |  108 ++++++++++++++++++++++++++++++++++++++++++-------
> > > > >  include/hw/i386/pc.h   |    5 ++
> > > > >  target-i386/kvm.c      |    7 +++
> > > > >  target-i386/kvm_i386.h |    1 
> > > > >  4 files changed, 107 insertions(+), 14 deletions(-)
> > > > > 
> > > > > v2: 
> > > > > - improve variable names (Juan)
> > > > > - consolidate code on kvm_get_clock function (Paolo)
> > > > > - return mach_use_reliable_get_clock from needed function (Paolo)
> > > > > 
> > > > > Index: qemu-mig-advance-clock/hw/i386/kvm/clock.c
> > > > > ===================================================================
> > > > > --- qemu-mig-advance-clock.orig/hw/i386/kvm/clock.c	2016-11-14 10:40:39.748116312 -0200
> > > > > +++ qemu-mig-advance-clock/hw/i386/kvm/clock.c	2016-11-14 13:38:29.299955042 -0200
> > > > > @@ -36,6 +36,12 @@
> > > > >  
> > > > >      uint64_t clock;
> > > > >      bool clock_valid;
> > > > > +
> > > > > +    /* whether machine supports reliable KVM_GET_CLOCK */
> > > > > +    bool mach_use_reliable_get_clock;
> > > > > +
> > > > > +    /* whether source host supported reliable KVM_GET_CLOCK */
> > > > > +    bool src_use_reliable_get_clock;
> > > > >  } KVMClockState;
> > > > >  
> > > > >  struct pvclock_vcpu_time_info {
> > > > > @@ -81,6 +87,19 @@
> > > > >      return nsec + time.system_time;
> > > > >  }
> > > > >  
> > > > > +static uint64_t kvm_get_clock(void)
> > > > > +{
> > > > > +    struct kvm_clock_data data;
> > > > > +    int ret;
> > > > > +
> > > > > +    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > +    if (ret < 0) {
> > > > > +        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > +                abort();
> > > > > +    }
> > > > > +    return data.clock;
> > > > > +}
> > > > > +
> > > > >  static void kvmclock_vm_state_change(void *opaque, int running,
> > > > >                                       RunState state)
> > > > >  {
> > > > > @@ -91,15 +110,37 @@
> > > > >  
> > > > >      if (running) {
> > > > >          struct kvm_clock_data data = {};
> > > > > -        uint64_t time_at_migration = kvmclock_current_nsec(s);
> > > > > +        uint64_t pvclock_via_mem = 0;
> > > > >  
> > > > > -        s->clock_valid = false;
> > > > > +        /* local (running VM) restore */
> > > > > +        if (s->clock_valid) {
> > > > > +            /*
> > > > > +             * if host does not support reliable KVM_GET_CLOCK,
> > > > > +             * read kvmclock value from memory
> > > > > +             */
> > > > > +            if (!kvm_has_adjust_clock_stable()) {
> > > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > +            }
> > > > > +        /* migration/savevm/init restore */
> > > > > +        } else {
> > > > > +            /*
> > > > > +             * use s->clock in case machine uses reliable
> > > > > +             * get clock and source host supported
> > > > > +             * reliable get clock
> > > > > +             */
> > > > > +            if (!(s->mach_use_reliable_get_clock &&
> > > > > +                  s->src_use_reliable_get_clock)) {
> > > > > +                pvclock_via_mem = kvmclock_current_nsec(s);
> > > > > +            }
> > > > 
> > > > The s->mach_use_reliable_get_clock check seems redundant.
> > > > src_use_reliable_get_clock is set only if
> > > > mach_use_reliable_get_clock is true.
> > > 
> > > Done.
> > > 
> > > > > +        }
> > > > >  
> > > > > -        /* We can't rely on the migrated clock value, just discard it */
> > > > > -        if (time_at_migration) {
> > > > > -            s->clock = time_at_migration;
> > > > > +        /* We can't rely on the saved clock value, just discard it */
> > > > > +        if (pvclock_via_mem) {
> > > > > +            s->clock = pvclock_via_mem;
> > > > >          }
> > > > >  
> > > > > +        s->clock_valid = false;
> > > > > +
> > > > >          data.clock = s->clock;
> > > > >          ret = kvm_vm_ioctl(kvm_state, KVM_SET_CLOCK, &data);
> > > > >          if (ret < 0) {
> > > > > @@ -120,8 +161,6 @@
> > > > >              }
> > > > >          }
> > > > >      } else {
> > > > > -        struct kvm_clock_data data;
> > > > > -        int ret;
> > > > >  
> > > > >          if (s->clock_valid) {
> > > > >              return;
> > > > > @@ -129,13 +168,7 @@
> > > > >  
> > > > >          kvm_synchronize_all_tsc();
> > > > >  
> > > > > -        ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
> > > > > -        if (ret < 0) {
> > > > > -            fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
> > > > > -            abort();
> > > > > -        }
> > > > > -        s->clock = data.clock;
> > > > > -
> > > > > +        s->clock = kvm_get_clock();
> > > > >          /*
> > > > >           * If the VM is stopped, declare the clock state valid to
> > > > >           * avoid re-reading it on next vmsave (which would return
> > > > > @@ -152,22 +185,69 @@
> > > > >      qemu_add_vm_change_state_handler(kvmclock_vm_state_change, s);
> > > > >  }
> > > > >  
> > > > > +static bool kvmclock_src_use_reliable_get_clock(void *opaque)
> > > > > +{
> > > > > +    KVMClockState *s = opaque;
> > > > > +
> > > > > +    /*
> > > > > +     * On machine types that support reliable KVM_GET_CLOCK,
> > > > > +     * if host kernel does provide reliable KVM_GET_CLOCK,
> > > > > +     * set src_use_reliable_get_clock=true so that destination
> > > > > +     * avoids reading kvmclock from memory.
> > > > > +     */
> > > > > +    if (s->mach_use_reliable_get_clock && kvm_has_adjust_clock_stable()) {
> > > > > +        s->src_use_reliable_get_clock = true;
> > > > > +    }
> > > > 
> > > > It feels fragile to change device state inside the .needed
> > > > function. Better to initialize src_use_reliable_get_clock on
> > > > kvmclock_realize()?
> > > > 
> > > > What exactly ensures src_use_reliable_get_clock is correctly
> > > > initialized on the migration destination as well?
> > > 
> > > Its initialized to false (because its part of device state).
> > 
> > Do you mean it is always going to be false on the destination
> > host? What if .needed is called by other code? .needed shouldn't
> > have any side-effects.
> 
> I don't think needed is called by any other code.

I don't know if it is called. But it may be called by other code,
so it can't have side-effects.

> 
> > Also, the difference between the variables on the destination and
> > the source is very confusing, see other comments below:
> 
> Yes it is: its because you don't know when is what. But if you consider
> separate events:
> 
>     initialization, incoming migration, vmstop, vmcontinue.
> 
> And the values of the variables for each case, then it becomes 
> clear.
> 
> If anyone has a suggestion to improve clarify, its welcome.

I think it was confusing when src_use_reliable_get_clock was
initialized on .needed, but now it makes sense to me because
mach_use_reliable_get_clock and src_use_reliable_get_clock have
the same value on source and destination.

> 
> > > > 
> > > > > +
> > > > > +    return s->mach_use_reliable_get_clock;
> > > > 
> > > > If if kvm_has_adjust_clock_stable() is false, isn't it simpler to
> > > > simply skip the section?
> > > > 
> > > > It looks like mach_use_reliable_get_clock and
> > > > src_use_reliable_get_clock could become a single field:
> > > > 
> > > > * use_reliable_get_clock ("x-use-reliable-get-clock" property)
> > > >   set to true by default (set on DEFINE_PROP_BOOL parameter)
> > > > * "x-use-reliable-get-clock" set to false by default on older
> > > >   machine-types
> > > > * use_reliable_get_clock forced to false on kvmclock_realize() if
> > > >   !kvm_has_adjust_clock_stable()
> > > > * kvmclock_reliable_get_clock.needed return s->use_reliable_get_clock
> > > 
> > > No because use_reliable_get_clock can be false because the destination host
> > > does not support kvm_has_adjust_clock_stable(), but the source host
> > > supports it (so we want to use the value in the first incoming
> > > migration).
> > > 
> > 
> > Are you talking about the value of the fields on the source host,
> > or on the destination host? In this case, you mean that
> > use_reliable_get_clock should be false on the destination, right?
> 
> Yes, destination.
> 
> > > So the variable indicates whether the source host supported
> > > migration, not whether the destination supports it.
> > 
> > Good, that's the whole point of the field, isn't it? I will try
> > to summarize all cases below. Please correct me if something is
> > incorrect.
> 
> Yes. So in this case, the cleanup/simplification to use a single
> variable is not possible (or maybe it is, but it just complicates
> things).
> 
> > --------+------------+---------------+-----------+------------+
> >         | kvm_has_adj_clock_stable() | use_reliable_get_clock |
> > machine | src        | dst           | src       | dst        |
> > --------+------------+---------------+-----------+------------+
> > pc-2.8  | false      | false         | false     | false      |
> > pc-2.8  | false      | true          | false     | false (*)  |
> > pc-2.8  | true       | false         | true      | true (**)  |
> > pc-2.8  | true       | true          | true      | true       |
> > --------+------------+---------------+-----------+------------+
> > pc-2.7  | (any)      | (any)         | false     | false      |
> > --------+------------+---------------+-----------+------------+
> > 
> > (*) I guess this is where things would break if we skip the
> >     subsection (and use a single field). If the subsection is
> >     skipped, use_reliable_get_clock would be set to true on the
> >     destination. (See my comment below on the new patch you sent)
> 
> Well, perhaps you can massage things so that it works 
> without sending subsection (that is):
> 
>     * Send subsection only in case value is true.
>     * Do not send subsection in case value is false (then treat
>       non presence of subsection and mach_reliable_clock=true 
>       to mean source did not support kvm_has_reliable_clock).

Yes, it would be more complex. Always sending it if
mach_use_reliable_get_clock=true (like you proposed) is simpler.

> 
> Once you agree on "send a subsection for both cases", then you 
> handle it.
> 
> The code is there, its simple, readable, so...
> 
> (what you can't do is to unify src_use_get_reliable_clock (remote 
> information), and local information). Because you need 
> to know both things to be able to handle the 4 cases properly on 
> incoming migration.

OK.

> [...]

-- 
Eduardo

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2016-11-17 19:59 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-17 13:24 [qemu patch V2 0/2] improve kvmclock difference on migration Marcelo Tosatti
2016-11-17 13:24 ` [Qemu-devel] " Marcelo Tosatti
2016-11-17 13:24 ` [qemu patch V2 1/2] kvm: sync linux headers Marcelo Tosatti
2016-11-17 13:24   ` [Qemu-devel] " Marcelo Tosatti
2016-11-17 13:24 ` [qemu patch V2 2/2] kvmclock: reduce kvmclock difference on migration Marcelo Tosatti
2016-11-17 13:24   ` [Qemu-devel] " Marcelo Tosatti
2016-11-17 14:11   ` Eduardo Habkost
2016-11-17 14:11     ` [Qemu-devel] " Eduardo Habkost
2016-11-17 14:15     ` Paolo Bonzini
2016-11-17 14:15       ` [Qemu-devel] " Paolo Bonzini
2016-11-17 16:30       ` Marcelo Tosatti
2016-11-17 16:30         ` [Qemu-devel] " Marcelo Tosatti
2016-11-17 17:41         ` Eduardo Habkost
2016-11-17 17:41           ` [Qemu-devel] " Eduardo Habkost
2016-11-17 17:15     ` Marcelo Tosatti
2016-11-17 17:15       ` [Qemu-devel] " Marcelo Tosatti
2016-11-17 18:16       ` Eduardo Habkost
2016-11-17 18:16         ` [Qemu-devel] " Eduardo Habkost
2016-11-17 19:15         ` Marcelo Tosatti
2016-11-17 19:15           ` [Qemu-devel] " Marcelo Tosatti
2016-11-17 19:59           ` Eduardo Habkost
2016-11-17 19:59             ` [Qemu-devel] " Eduardo Habkost

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.