All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
@ 2009-06-18  2:56 Zhang, Xiantao
  2009-06-18  7:37 ` [PATCH] TSC scaling for live migration betweenplatforms " Jan Beulich
                   ` (4 more replies)
  0 siblings, 5 replies; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-18  2:56 UTC (permalink / raw)
  To: Keir Fraser; +Cc: xen-devel

[-- Attachment #1: Type: text/plain, Size: 790 bytes --]

Hi, Keir

    This patchset targets enabling TSC scaling in software for live migration between platforms with different TSC frequencies.  Once the target host's TSC frequency is found to differ from the source host's, the hypervisor will trap and emulate all of the guest's rdtsc instructions at the guest's expected frequency.  
    If the hardware's TSC frequency is different from the guest's expected frequency, the guest may behave abnormally, e.g. incorrect wallclock, soft lockup, or even a hang in some cases.  Therefore, this patchset is necessary to avoid such issues. 

PATCH 0001-- Save guest's preferred TSC in image for save/restore and migration
PATCH 0002-- Move muldiv64 out as a library function. 
PATCH 0003-- Scaling host TSC frequency patch. 

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Xiantao

[-- Attachment #2: 0001-save_tsc_frequency.patch --]
[-- Type: application/octet-stream, Size: 3964 bytes --]

# HG changeset patch
# User root@localhost.localdomain
# Date 1245206466 14400
# Node ID 4015e09394c1857a6e68e6d6909d11cf5ecba241
# Parent  f8187a343ad2bdbfe3166d7ee7e3d55a9f157fdc
save/restore : Save guest's preferred TSC frequency in image

For save/restore or live migration between two platforms with different frequencies,
the guest's preferred TSC frequency is required to calculate the guest's TSC after restore, so save it in the image header.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>

diff -r f8187a343ad2 -r 4015e09394c1 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/arch/x86/hvm/i8254.c	Tue Jun 16 22:41:06 2009 -0400
@@ -481,8 +481,6 @@ void pit_init(struct vcpu *v, unsigned l
     register_portio_handler(v->domain, PIT_BASE, 4, handle_pit_io);
     register_portio_handler(v->domain, 0x61, 1, handle_speaker_io);
 
-    ticks_per_sec(v) = cpu_khz * (int64_t)1000;
-
     pit_reset(v->domain);
 }
 
diff -r f8187a343ad2 -r 4015e09394c1 xen/arch/x86/hvm/save.c
--- a/xen/arch/x86/hvm/save.c	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/arch/x86/hvm/save.c	Tue Jun 16 22:41:06 2009 -0400
@@ -32,7 +32,8 @@ void arch_hvm_save(struct domain *d, str
     cpuid(1, &eax, &ebx, &ecx, &edx);
     hdr->cpuid = eax;
 
-    hdr->pad0 = 0;
+    /* Save guest's preferred TSC. */
+    hdr->gtsc_khz = d->arch.hvm_domain.gtsc_khz;
 }
 
 int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr)
@@ -59,6 +60,9 @@ int arch_hvm_load(struct domain *d, stru
         gdprintk(XENLOG_WARNING, "HVM restore: saved CPUID (%#"PRIx32") "
                "does not match host (%#"PRIx32").\n", hdr->cpuid, eax);
 
+    /* Restore guest's preferred TSC frequency. */
+    d->arch.hvm_domain.gtsc_khz = hdr->gtsc_khz;
+
     /* VGA state is not saved/restored, so we nobble the cache. */
     d->arch.hvm_domain.stdvga.cache = 0;
 
diff -r f8187a343ad2 -r 4015e09394c1 xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/arch/x86/hvm/vpt.c	Tue Jun 16 22:41:06 2009 -0400
@@ -32,6 +32,8 @@ void hvm_init_guest_time(struct domain *
     spin_lock_init(&pl->pl_time_lock);
     pl->stime_offset = -(u64)get_s_time();
     pl->last_guest_time = 0;
+
+    d->arch.hvm_domain.gtsc_khz = cpu_khz;
 }
 
 u64 hvm_get_guest_time(struct vcpu *v)
diff -r f8187a343ad2 -r 4015e09394c1 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/include/asm-x86/hvm/domain.h	Tue Jun 16 22:41:06 2009 -0400
@@ -44,7 +44,8 @@ struct hvm_domain {
     struct hvm_ioreq_page  ioreq;
     struct hvm_ioreq_page  buf_ioreq;
 
-    s64                    tsc_frequency;
+    uint32_t               gtsc_khz; /* kHz */
+    uint32_t               pad0;
     struct pl_time         pl_time;
 
     struct hvm_io_handler  io_handler;
diff -r f8187a343ad2 -r 4015e09394c1 xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/include/asm-x86/hvm/vpt.h	Tue Jun 16 22:41:06 2009 -0400
@@ -136,8 +136,6 @@ struct pl_time {    /* platform time */
     spinlock_t pl_time_lock;
 };
 
-#define ticks_per_sec(v) (v->domain->arch.hvm_domain.tsc_frequency)
-
 void pt_save_timer(struct vcpu *v);
 void pt_restore_timer(struct vcpu *v);
 void pt_update_irq(struct vcpu *v);
diff -r f8187a343ad2 -r 4015e09394c1 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/include/public/arch-x86/hvm/save.h	Tue Jun 16 22:41:06 2009 -0400
@@ -38,7 +38,7 @@ struct hvm_save_header {
     uint32_t version;           /* File format version */
     uint64_t changeset;         /* Version of Xen that saved this file */
     uint32_t cpuid;             /* CPUID[0x01][%eax] on the saving machine */
-    uint32_t pad0;
+    uint32_t gtsc_khz;        /* Guest's TSC frequency in kHz */
 };
 
 DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);

[-- Attachment #3: 0002-move_multidiv64_out.patch --]
[-- Type: application/octet-stream, Size: 3360 bytes --]

# HG changeset patch
# User root@localhost.localdomain
# Date 1245207897 14400
# Node ID 09f3065668f20ebc1d6424d3c051c352baeba618
# Parent  4015e09394c1857a6e68e6d6909d11cf5ecba241
Move muldiv64 out and make it a public function.

muldiv64 is used to calculate u64*u32/u32, and we
will use it for TSC scaling.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>

diff -r 4015e09394c1 -r 09f3065668f2 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile	Tue Jun 16 22:41:06 2009 -0400
+++ b/xen/arch/x86/Makefile	Tue Jun 16 23:04:57 2009 -0400
@@ -53,6 +53,7 @@ obj-y += crash.o
 obj-y += crash.o
 obj-y += tboot.o
 obj-y += hpet.o
+obj-y += lib.o
 obj-y += bzimage.o
 
 obj-$(crash_debug) += gdbstub.o
diff -r 4015e09394c1 -r 09f3065668f2 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c	Tue Jun 16 22:41:06 2009 -0400
+++ b/xen/arch/x86/hvm/i8254.c	Tue Jun 16 23:04:57 2009 -0400
@@ -56,30 +56,6 @@ static int handle_speaker_io(
 
 #define get_guest_time(v) \
    (is_hvm_vcpu(v) ? hvm_get_guest_time(v) : (u64)get_s_time())
-
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
-{
-    union {
-        uint64_t ll;
-        struct {
-#ifdef WORDS_BIGENDIAN
-            uint32_t high, low;
-#else
-            uint32_t low, high;
-#endif            
-        } l;
-    } u, res;
-    uint64_t rl, rh;
-
-    u.ll = a;
-    rl = (uint64_t)u.l.low * (uint64_t)b;
-    rh = (uint64_t)u.l.high * (uint64_t)b;
-    rh += (rl >> 32);
-    res.l.high = rh / c;
-    res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c;
-    return res.ll;
-}
 
 static int pit_get_count(PITState *pit, int channel)
 {
diff -r 4015e09394c1 -r 09f3065668f2 xen/arch/x86/lib.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/lib.c	Tue Jun 16 23:04:57 2009 -0400
@@ -0,0 +1,44 @@
+#include <xen/ctype.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <asm/byteorder.h>
+
+/* Compute with 96 bit intermediate result: (a*b)/c */
+
+#ifdef __x86_64__
+ uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+ {
+     __asm__ __volatile__ ("mul %%rdx;"
+                            "div %%rcx;"
+                            : "=a"(a)
+                            : "0"(a), "d"(b), "c"(c)
+                            );
+    return a;
+}
+
+#else
+
+uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+{
+    union {
+        uint64_t ll;
+        struct {
+#ifdef WORDS_BIGENDIAN
+            uint32_t high, low;
+#else
+            uint32_t low, high;
+#endif            
+        } l;
+    } u, res;
+    uint64_t rl, rh;
+
+    u.ll = a;
+    rl = (uint64_t)u.l.low * (uint64_t)b;
+    rh = (uint64_t)u.l.high * (uint64_t)b;
+    rh += (rl >> 32);
+    res.l.high = rh / c;
+    res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c;
+    return res.ll;
+}
+
+#endif
diff -r 4015e09394c1 -r 09f3065668f2 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h	Tue Jun 16 22:41:06 2009 -0400
+++ b/xen/include/xen/lib.h	Tue Jun 16 23:04:57 2009 -0400
@@ -91,6 +91,8 @@ unsigned long long simple_strtoull(
 
 unsigned long long parse_size_and_unit(const char *s, const char **ps);
 
+uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c);
+
 #define TAINT_UNSAFE_SMP                (1<<0)
 #define TAINT_MACHINE_CHECK             (1<<1)
 #define TAINT_BAD_PAGE                  (1<<2)

[-- Attachment #4: 0003-scaling_guest_tsc.patch --]
[-- Type: application/octet-stream, Size: 5774 bytes --]

# HG changeset patch
# User root@localhost.localdomain
# Date 1245206943 14400
# Node ID 824c4af117d8c39511db49cbd8aef091e512f52c
# Parent  e3d6e4bdb6341fa5b86a3b6c28e2acb7e0d31e9a
Scale the guest's TSC when the target machine's frequency differs from the guest's requirement.

Use trap & emulate for each of the guest's rdtsc instructions first; maybe it can be optimized later.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>

diff -r 09f3065668f2 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/hvm.c	Wed Jun 17 21:58:11 2009 -0400
@@ -139,26 +139,62 @@ uint8_t hvm_combine_hw_exceptions(uint8_
     return TRAP_double_fault;
 }
 
+void hvm_enable_rdtsc_exiting(struct domain *d)
+{
+    struct vcpu *v;
+
+    for_each_vcpu ( d, v ) {
+        if ( hvm_funcs.enable_rdtsc_exiting )
+            hvm_funcs.enable_rdtsc_exiting(v);
+    }
+}
+
+int hvm_gtsc_need_scale(struct domain *d)
+{
+    uint32_t gtsc_khz;
+
+    gtsc_khz = d->arch.hvm_domain.gtsc_khz / 1000;
+
+    if ( gtsc_khz && gtsc_khz != (uint32_t)cpu_khz / 1000 ) {
+        d->arch.hvm_domain.tsc_scaled = 1;
+        return 1;
+    }
+
+    d->arch.hvm_domain.tsc_scaled = 0;
+    return 0;
+}
+
+static u64 hvm_h2g_scale_tsc(struct vcpu *v, u64 host_tsc)
+{
+    u32 gtsc_khz, scaled_htsc = host_tsc;
+
+    if ( v->domain->arch.hvm_domain.tsc_scaled ) {
+        gtsc_khz = v->domain->arch.hvm_domain.gtsc_khz;
+        scaled_htsc = muldiv64(host_tsc, gtsc_khz, cpu_khz);
+    }
+
+    return scaled_htsc;
+}
+
 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
 {
-    u64 host_tsc;
+    u64 host_tsc, scaled_htsc;
 
     rdtscll(host_tsc);
-
-    v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
+    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
+
+    v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - scaled_htsc;
     hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
 }
 
 u64 hvm_get_guest_tsc(struct vcpu *v)
 {
-    u64 host_tsc;
-
-    if ( opt_softtsc )
-        host_tsc = hvm_get_guest_time(v);
-    else
-        rdtscll(host_tsc);
-
-    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
+    u64 host_tsc, scaled_htsc;
+
+    rdtscll(host_tsc);
+    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
+
+    return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset;
 }
 
 void hvm_migrate_timers(struct vcpu *v)
diff -r 09f3065668f2 xen/arch/x86/hvm/save.c
--- a/xen/arch/x86/hvm/save.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/save.c	Wed Jun 17 21:56:19 2009 -0400
@@ -63,6 +63,14 @@ int arch_hvm_load(struct domain *d, stru
     /* Restore guest's preferred TSC frequency. */
     d->arch.hvm_domain.gtsc_khz = hdr->gtsc_khz;
 
+    if ( hdr->gtsc_khz && hvm_gtsc_need_scale(d) ) {
+        hvm_enable_rdtsc_exiting(d);
+
+        printk("Migrate to a platform with different freq:%ldMhz, "
+            "expected freq:%dMhz, enable rdtsc exiting!\n",
+                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
+    }
+
     /* VGA state is not saved/restored, so we nobble the cache. */
     d->arch.hvm_domain.stdvga.cache = 0;
 
diff -r 09f3065668f2 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/vmx/vmx.c	Wed Jun 17 21:56:19 2009 -0400
@@ -946,6 +946,14 @@ static void vmx_set_tsc_offset(struct vc
     vmx_vmcs_exit(v);
 }
 
+static void vmx_enable_rdtsc_exiting(struct vcpu *v)
+{
+    vmx_vmcs_enter(v);
+    v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+     vmx_vmcs_exit(v);
+ }
+
 void do_nmi(struct cpu_user_regs *);
 
 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
@@ -1371,7 +1379,8 @@ static struct hvm_function_table vmx_fun
     .msr_write_intercept  = vmx_msr_write_intercept,
     .invlpg_intercept     = vmx_invlpg_intercept,
     .set_uc_mode          = vmx_set_uc_mode,
-    .set_info_guest       = vmx_set_info_guest
+    .set_info_guest       = vmx_set_info_guest,
+    .enable_rdtsc_exiting = vmx_enable_rdtsc_exiting
 };
 
 static unsigned long *vpid_bitmap;
diff -r 09f3065668f2 xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/vpt.c	Wed Jun 17 21:56:19 2009 -0400
@@ -34,6 +34,7 @@ void hvm_init_guest_time(struct domain *
     pl->last_guest_time = 0;
 
     d->arch.hvm_domain.gtsc_khz = cpu_khz;
+    d->arch.hvm_domain.tsc_scaled = 0;
 }
 
 u64 hvm_get_guest_time(struct vcpu *v)
diff -r 09f3065668f2 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/include/asm-x86/hvm/domain.h	Wed Jun 17 21:56:19 2009 -0400
@@ -45,7 +45,7 @@ struct hvm_domain {
     struct hvm_ioreq_page  buf_ioreq;
 
     uint32_t               gtsc_khz; /* kHz */
-    uint32_t               pad0;
+    uint32_t               tsc_scaled;
     struct pl_time         pl_time;
 
     struct hvm_io_handler  io_handler;
diff -r 09f3065668f2 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/include/asm-x86/hvm/hvm.h	Wed Jun 17 21:56:19 2009 -0400
@@ -129,6 +129,7 @@ struct hvm_function_table {
     void (*invlpg_intercept)(unsigned long vaddr);
     void (*set_uc_mode)(struct vcpu *v);
     void (*set_info_guest)(struct vcpu *v);
+    void (*enable_rdtsc_exiting)(struct vcpu *v);
 };
 
 extern struct hvm_function_table hvm_funcs;
@@ -282,6 +283,9 @@ int hvm_event_needs_reinjection(uint8_t 
 
 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2);
 
+void hvm_enable_rdtsc_exiting(struct domain *d);
+int hvm_gtsc_need_scale(struct domain *d);
+
 static inline int hvm_cpu_up(void)
 {
     if ( hvm_funcs.cpu_up )

[-- Attachment #5: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration betweenplatforms with different TSC frequecies
  2009-06-18  2:56 [PATCH] TSC scaling for live migration between platforms with different TSC frequecies Zhang, Xiantao
@ 2009-06-18  7:37 ` Jan Beulich
  2009-06-18  8:52   ` Zhang, Xiantao
  2009-06-18  9:02 ` [PATCH] TSC scaling for live migration between platforms " Tim Deegan
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Jan Beulich @ 2009-06-18  7:37 UTC (permalink / raw)
  To: Xiantao Zhang; +Cc: xen-devel, Keir Fraser

>>> "Zhang, Xiantao" <xiantao.zhang@intel.com> 18.06.09 04:56 >>>
>PATCH 0003-- Scaling host TSC freqeuncy patch. 

>+int hvm_gtsc_need_scale(struct domain *d)
>+{
>+    uint32_t gtsc_khz;
>+
>+    gtsc_khz = d->arch.hvm_domain.gtsc_khz / 1000;

Can the variable please be renamed to what it contains (i.e. gtsc_mhz)?

> u64 hvm_get_guest_tsc(struct vcpu *v)
> {
>-    u64 host_tsc;
>-
>-    if ( opt_softtsc )
>-        host_tsc = hvm_get_guest_time(v);
>-    else
>-        rdtscll(host_tsc);
>-
>-    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
>+    u64 host_tsc, scaled_htsc;
>+
>+    rdtscll(host_tsc);
>+    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
>+
>+    return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset;
> }
> 
> void hvm_migrate_timers(struct vcpu *v)

I'm getting the impression that the opt_softtsc functionality got lost here.

>+        printk("Migrate to a platform with different freq:%ldMhz, "
>+            "expected freq:%dMhz, enable rdtsc exiting!\n",
>+                    cpu_khz / 1000, hdr->gtsc_khz / 1000);

gdprintk()? At least, I think, any guest related printk-s should identify which
guest they're about.

Jan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration betweenplatforms with different TSC frequecies
  2009-06-18  7:37 ` [PATCH] TSC scaling for live migration betweenplatforms " Jan Beulich
@ 2009-06-18  8:52   ` Zhang, Xiantao
  2009-06-18 15:40     ` Dan Magenheimer
  0 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-18  8:52 UTC (permalink / raw)
  To: Jan Beulich, dan.magenheimer; +Cc: xen-devel, Keir Fraser

Jan Beulich wrote:
>>>> "Zhang, Xiantao" <xiantao.zhang@intel.com> 18.06.09 04:56 >>>
>> PATCH 0003-- Scaling host TSC freqeuncy patch.
> 
>> +int hvm_gtsc_need_scale(struct domain *d)
>> +{
>> +    uint32_t gtsc_khz;
>> +
>> +    gtsc_khz = d->arch.hvm_domain.gtsc_khz / 1000;
> 
> Can the variable please be renamed to what it contains (i.e.
> gtsc_mhz)? 
> 
>> u64 hvm_get_guest_tsc(struct vcpu *v)
>> {
>> -    u64 host_tsc;
>> -
>> -    if ( opt_softtsc )
>> -        host_tsc = hvm_get_guest_time(v);
>> -    else
>> -        rdtscll(host_tsc);
>> -
>> -    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
>> +    u64 host_tsc, scaled_htsc;
>> +
>> +    rdtscll(host_tsc);
>> +    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
>> +
>> +    return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset; }
>> 
>> void hvm_migrate_timers(struct vcpu *v)
> 
> I'm getting the impression that the opt_softtsc functionality got
> lost here.

I am still confused by the opt_softtsc check here. If we want to use the platform timer to emulate the guest's TSC, hvm_set_guest_tsc should also perform this check to get a correct cache_tsc_offset, but I didn't see it.   A bug ? 
If we use the host's TSC to emulate the guest's TSC, the check is useless, so I removed it in my patch.  Maybe we need Dan's explanation about the check here to determine whether to keep it or not.

> 
>> +        printk("Migrate to a platform with different freq:%ldMhz, "
>> +            "expected freq:%dMhz, enable rdtsc exiting!\n",
>> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
> 
> gdprintk()? At least, I think, any guest related printk-s should
> identify which guest they're about. 

Added in the attached patch. Thanks!
Xiantao

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  2:56 [PATCH] TSC scaling for live migration between platforms with different TSC frequecies Zhang, Xiantao
  2009-06-18  7:37 ` [PATCH] TSC scaling for live migration betweenplatforms " Jan Beulich
@ 2009-06-18  9:02 ` Tim Deegan
  2009-06-18  9:46   ` Zhang, Xiantao
  2009-06-18  9:10 ` Patrick Colp
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 35+ messages in thread
From: Tim Deegan @ 2009-06-18  9:02 UTC (permalink / raw)
  To: Zhang, Xiantao; +Cc: xen-devel, Keir Fraser

At 03:56 +0100 on 18 Jun (1245297406), Zhang, Xiantao wrote:
> --- a/xen/include/public/arch-x86/hvm/save.h	Fri Feb 20 17:02:36 2009 +0000
> +++ b/xen/include/public/arch-x86/hvm/save.h	Tue Jun 16 22:41:06 2009 -0400
> @@ -38,7 +38,7 @@ struct hvm_save_header {
>      uint32_t version;           /* File format version */
>      uint64_t changeset;         /* Version of Xen that saved this file */
>      uint32_t cpuid;             /* CPUID[0x01][%eax] on the saving machine */
> -    uint32_t pad0;
> +    uint32_t gtsc_khz;        /* Guest's TSC frequency in kHz */
>  };
>  
>  DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);

I'm not sure this is the best place for this field -- it's a property of
the guest CPU rather than of the host that saved the record.  I think it
would be better to give it its own save record type.

Cheers,

Tim.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  2:56 [PATCH] TSC scaling for live migration between platforms with different TSC frequecies Zhang, Xiantao
  2009-06-18  7:37 ` [PATCH] TSC scaling for live migration betweenplatforms " Jan Beulich
  2009-06-18  9:02 ` [PATCH] TSC scaling for live migration between platforms " Tim Deegan
@ 2009-06-18  9:10 ` Patrick Colp
  2009-06-18  9:27   ` Tim Deegan
  2009-06-18 10:56 ` Ian Pratt
  2009-06-22  5:14 ` Zhang, Xiantao
  4 siblings, 1 reply; 35+ messages in thread
From: Patrick Colp @ 2009-06-18  9:10 UTC (permalink / raw)
  To: Xiantao Zhang; +Cc: xen-devel, Keir Fraser

>+        printk("Migrate to a platform with different freq:%ldMhz, "
>+            "expected freq:%dMhz, enable rdtsc exiting!\n",
>+                    cpu_khz / 1000, hdr->gtsc_khz / 1000);

Being pedantic, this should probably be:

     printk("Migrate to a platform with different freq: %ldMHz, "
            "expected freq: %dMHz, enable rdtsc exiting!\n",
             cpu_khz / 1000, hdr->gtsc_khz / 1000);


Patrick

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  9:10 ` Patrick Colp
@ 2009-06-18  9:27   ` Tim Deegan
  2009-06-18  9:47     ` Zhang, Xiantao
  0 siblings, 1 reply; 35+ messages in thread
From: Tim Deegan @ 2009-06-18  9:27 UTC (permalink / raw)
  To: Patrick Colp; +Cc: xen-devel, Keir Fraser, Xiantao Zhang

At 10:10 +0100 on 18 Jun (1245319857), Patrick Colp wrote:
> >+        printk("Migrate to a platform with different freq:%ldMhz, "
> >+            "expected freq:%dMhz, enable rdtsc exiting!\n",
> >+                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
> 
> Being pedantic, this should probably be:
> 
>      printk("Migrate to a platform with different freq: %ldMHz, "
>             "expected freq: %dMHz, enable rdtsc exiting!\n",
>              cpu_khz / 1000, hdr->gtsc_khz / 1000);

Being _pedantic_, it should be 

      gdprintk(XENLOG_INFO, "Loaded VM expects a %"PRIu32"MHz TSC "
               "but CPU is %ldMHz; enabling RDTSC exiting.\n", 
               hdr->gtsc_khz / 1000, cpu_khz / 1000);

:) 

Tim.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  9:02 ` [PATCH] TSC scaling for live migration between platforms " Tim Deegan
@ 2009-06-18  9:46   ` Zhang, Xiantao
  2009-06-18  9:56     ` Tim Deegan
  0 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-18  9:46 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel, Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 1237 bytes --]

Tim Deegan wrote:
> At 03:56 +0100 on 18 Jun (1245297406), Zhang, Xiantao wrote:
>> --- a/xen/include/public/arch-x86/hvm/save.h	Fri Feb 20 17:02:36
>> 2009 +0000 +++ b/xen/include/public/arch-x86/hvm/save.h	Tue Jun 16
>> 22:41:06 2009 -0400 @@ -38,7 +38,7 @@ struct hvm_save_header {
>>      uint32_t version;           /* File format version */
>>      uint64_t changeset;         /* Version of Xen that saved this
>>      file */ uint32_t cpuid;             /* CPUID[0x01][%eax] on the
>> saving machine */ -    uint32_t pad0; +    uint32_t gtsc_khz;       
>> /* Guest's TSC frequency in kHz */  }; 
>> 
>>  DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);
> 
> I'm not sure this is the best place for this field -- it's a property
> of the guest CPU rather than of the host that saved the record.  I
> think it would be better to give it its own save record type. 
Hi, Tim
    The guest's preferred TSC frequency should be a per-domain concept instead of being constrained to vcpus.  If we introduce a separate save record type for this single field, it may be too heavyweight.
In addition, a TSC frequency field was included in the original proposal for the image format, but I don't know why it was dropped.  See the attached pdf doc. 
Xiantao


[-- Attachment #2: pdfghoqtwxKby.pdf --]
[-- Type: application/pdf, Size: 112329 bytes --]

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  9:27   ` Tim Deegan
@ 2009-06-18  9:47     ` Zhang, Xiantao
  2009-06-18 15:45       ` Dan Magenheimer
  0 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-18  9:47 UTC (permalink / raw)
  To: Tim Deegan, Patrick Colp; +Cc: Keir, xen-devel, Fraser

Tim Deegan wrote:
> At 10:10 +0100 on 18 Jun (1245319857), Patrick Colp wrote:
>>> +        printk("Migrate to a platform with different freq:%ldMhz, "
>>> +            "expected freq:%dMhz, enable rdtsc exiting!\n",
>>> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
>> 
>> Being pedantic, this should probably be:
>> 
>>      printk("Migrate to a platform with different freq: %ldMHz, "
>>             "expected freq: %dMHz, enable rdtsc exiting!\n",
>>              cpu_khz / 1000, hdr->gtsc_khz / 1000);
> 
> Being _pedantic_, it should be
> 
>       gdprintk(XENLOG_INFO, "Loaded VM expects a %"PRIu32"MHz TSC "
>                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
>                hdr->gtsc_khz / 1000, cpu_khz / 1000); 

It may be worthwhile to add VM info (e.g. the domain id) as Jan says in another mail. Thanks for your suggestions! I will change it in the final version!  :-)
Xiantao

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  9:46   ` Zhang, Xiantao
@ 2009-06-18  9:56     ` Tim Deegan
  0 siblings, 0 replies; 35+ messages in thread
From: Tim Deegan @ 2009-06-18  9:56 UTC (permalink / raw)
  To: Zhang, Xiantao; +Cc: xen-devel, Keir Fraser

At 10:46 +0100 on 18 Jun (1245322016), Zhang, Xiantao wrote:
> Hi, Tim
>     Guest's preferred TSC frequency should be per-domain concept
> instead of constraining it to vcpus.  If we introduce a separate save
> type for this only field, maybe too heavy.

Sorry, what I meant was: this is a property of the _guest_ so should
probably have a save record like other guest properties.  The cpuid info
in the header is a property of the _host_ that saved the record and is
there just for sanity-checking the compatibility of the record (like the
version number).

Cheers,

Tim.

> In addition, TSC frequency field is saved by original proposal about
> image format, but don't know why it is dropped.  See attached pdf doc.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  2:56 [PATCH] TSC scaling for live migration between platforms with different TSC frequecies Zhang, Xiantao
                   ` (2 preceding siblings ...)
  2009-06-18  9:10 ` Patrick Colp
@ 2009-06-18 10:56 ` Ian Pratt
  2009-06-18 15:58   ` Dan Magenheimer
  2009-06-19  1:34   ` Zhang, Xiantao
  2009-06-22  5:14 ` Zhang, Xiantao
  4 siblings, 2 replies; 35+ messages in thread
From: Ian Pratt @ 2009-06-18 10:56 UTC (permalink / raw)
  To: Zhang, Xiantao, Keir Fraser; +Cc: Ian Pratt, xen-devel

>     This patchset targets for enabling TSC scaling in software for live
> migration between platforms with different TSC frequecies.  Once found the
> target host's frequency is different with source host's, hypervisor will
> trap and emulate guest's all rdtsc instructions with its expected
> frequency.
>     If hardware's TSC frequency is difffernt with guest's exepcted freq,
> guest may behave abnormally, eg. incorrect wallclock, soft lockup, even
> hang in some cases.  Therefore, this patchset is necessary to avoid such
> issues.
> 
> PATCH 0001-- Save guest's preferred TSC in image for save/restore and
> migration PATCH 0002-- Move multidiv64 as a library function.
> PATCH 0003-- Scaling host TSC freqeuncy patch.

I think this needs to be a feature which is enabled/disabled on a per VM basis (in the config file).

I'm not sure what the default should be. Windows VMs and applications don't seem to much care about the TSC which is an argument for leaving the default as it is at the moment. However, one could argue that things that don't care about the TSC aren't going to be reading it much, so the overhead of making the default to scale the TSC shouldn't be too high.

Ian

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration betweenplatforms with different TSC frequecies
  2009-06-18  8:52   ` Zhang, Xiantao
@ 2009-06-18 15:40     ` Dan Magenheimer
  2009-06-19  1:48       ` Zhang, Xiantao
  0 siblings, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 15:40 UTC (permalink / raw)
  To: Zhang, Xiantao, Jan Beulich; +Cc: xen-devel, Keir Fraser

> I am still confused by opt_softtsc check here. If want to use 
> platform timer to emulate guest's tsc, hvm_set_guest_tsc 
> should also need  perform this check to get correct 
> cache_tsc_offset, but I didn't see it.   A bug ? 
> If use host's tsc to emulate guest's tsc, the check is 
> useless, so I removed it in my patch.  Maybe we need Dan's 
> explanation about the check here to determin whether keep it or not.

Please do keep the opt_softtsc check.  I agree that there
is a bug, that hvm_set_guest_tsc should check as well.
IIRC, my guest never set the TSC.

The softtsc option is for handling skew problems
not scaling/migration problems but should probably
be updated to handle your TSC scaling as well.

http://lists.xensource.com/archives/html/xen-devel/2008-07/msg00495.html

Thanks,
Dan

> -----Original Message-----
> From: Zhang, Xiantao [mailto:xiantao.zhang@intel.com]
> Sent: Thursday, June 18, 2009 2:52 AM
> To: Jan Beulich; Dan Magenheimer
> Cc: Keir Fraser; xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel] [PATCH] TSC scaling for live migration
> betweenplatforms with different TSC frequecies
> 
> 
> Jan Beulich wrote:
> >>>> "Zhang, Xiantao" <xiantao.zhang@intel.com> 18.06.09 04:56 >>>
> >> PATCH 0003-- Scaling host TSC freqeuncy patch.
> > 
> >> +int hvm_gtsc_need_scale(struct domain *d)
> >> +{
> >> +    uint32_t gtsc_khz;
> >> +
> >> +    gtsc_khz = d->arch.hvm_domain.gtsc_khz / 1000;
> > 
> > Can the variable please be renamed to what it contains (i.e.
> > gtsc_mhz)? 
> > 
> >> u64 hvm_get_guest_tsc(struct vcpu *v)
> >> {
> >> -    u64 host_tsc;
> >> -
> >> -    if ( opt_softtsc )
> >> -        host_tsc = hvm_get_guest_time(v);
> >> -    else
> >> -        rdtscll(host_tsc);
> >> -
> >> -    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
> >> +    u64 host_tsc, scaled_htsc;
> >> +
> >> +    rdtscll(host_tsc);
> >> +    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
> >> +
> >> +    return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset; }
> >> 
> >> void hvm_migrate_timers(struct vcpu *v)
> > 
> > I'm getting the impression that the opt_softtsc functionality got
> > lost here.
> 
> I am still confused by opt_softtsc check here. If want to use 
> platform timer to emulate guest's tsc, hvm_set_guest_tsc 
> should also need  perform this check to get correct 
> cache_tsc_offset, but I didn't see it.   A bug ? 
> If use host's tsc to emulate guest's tsc, the check is 
> useless, so I removed it in my patch.  Maybe we need Dan's 
> explanation about the check here to determin whether keep it or not.
> 
> > 
> >> +        printk("Migrate to a platform with different 
> freq:%ldMhz, "
> >> +            "expected freq:%dMhz, enable rdtsc exiting!\n",
> >> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
> > 
> > gdprintk()? At least, I think, any guest related printk-s should
> > identify which guest they're about. 
> 
> Added in the attached patch. Thanks!
> Xiantao
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  9:47     ` Zhang, Xiantao
@ 2009-06-18 15:45       ` Dan Magenheimer
  2009-06-18 16:04         ` Tim Deegan
  0 siblings, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 15:45 UTC (permalink / raw)
  To: Zhang, Xiantao, Tim Deegan, Patrick Colp; +Cc: Keir, xen-devel, Fraser

>       gdprintk(XENLOG_INFO, "Loaded VM expects a %"PRIu32"MHz TSC "
>                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
>                hdr->gtsc_khz / 1000, cpu_khz / 1000); 

RDTSC "exiting"?  Do you mean RDTSC emulation?  Also,
frankly, given the potential performance ramifications, perhaps
this should be higher than XENLOG_INFO?

> -----Original Message-----
> From: Zhang, Xiantao [mailto:xiantao.zhang@intel.com]
> Sent: Thursday, June 18, 2009 3:47 AM
> To: Tim Deegan; Patrick Colp
> Cc: Keir@acsinet12.oracle.com; xen-devel@lists.xensource.com; Fraser
> Subject: RE: [Xen-devel] [PATCH] TSC scaling for live 
> migration between
> platforms with different TSC frequecies
> 
> 
> Tim Deegan wrote:
> > At 10:10 +0100 on 18 Jun (1245319857), Patrick Colp wrote:
> >>> +        printk("Migrate to a platform with different 
> freq:%ldMhz, "
> >>> +            "expected freq:%dMhz, enable rdtsc exiting!\n",
> >>> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
> >> 
> >> Being pedantic, this should probably be:
> >> 
> >>      printk("Migrate to a platform with different freq: %ldMHz, "
> >>             "expected freq: %dMHz, enable rdtsc exiting!\n",
> >>              cpu_khz / 1000, hdr->gtsc_khz / 1000);
> > 
> > Being _pedantic_, it should be
> > 
> >       gdprintk(XENLOG_INFO, "Loaded VM expects a %"PRIu32"MHz TSC "
> >                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
> >                hdr->gtsc_khz / 1000, cpu_khz / 1000); 
> 
> It maybe wonderful to add VM info (eg. Domain id) as Jan says 
> in another mail. Thanks for your suggestions! I will change 
> it in the final version!  :-)
> Xiantao
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 10:56 ` Ian Pratt
@ 2009-06-18 15:58   ` Dan Magenheimer
  2009-06-18 16:45     ` John Levon
  2009-06-19  1:34   ` Zhang, Xiantao
  1 sibling, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 15:58 UTC (permalink / raw)
  To: Ian Pratt, Zhang, Xiantao, Keir Fraser; +Cc: xen-devel

This is a real virtualization problem.  Some apps
may use TSC as a monotonically increasing "sequence
number" to mark transactions.  For these apps,
TSC scaling is irrelevant (as long as the transition
doesn't cause the TSC to stop or go backwards).
Other apps (and/or the OS kernel) may use TSC to
approximate the passage of time, and for these apps
(and gettimeofday in the Linux kernel), this TSC scaling
patch is a must.  Unfortunately, both kinds of apps could
be running simultaneously on the same guest.  And
in either case, RDTSC frequency may be quite high.

I think a key missing fact is the overhead measurement
for trapping RDTSC.  I think someone at Intel measured
this once and it was quite bad.  Perhaps a better
question is:  If it is important to ALWAYS emulate RDTSC,
can the Xen code be written to handle RDTSC emulation
much faster?  If it could be made fast enough, the
proper default might best be always emulate (even
for PV guests... which also points out the fact that the
proposed patch only fixes HVM guests).

> -----Original Message-----
> From: Ian Pratt [mailto:Ian.Pratt@eu.citrix.com]
> Sent: Thursday, June 18, 2009 4:56 AM
> To: Zhang, Xiantao; Keir Fraser
> Cc: Ian Pratt; xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel] [PATCH] TSC scaling for live 
> migration between
> platforms with different TSC frequecies
> 
> 
> >     This patchset targets for enabling TSC scaling in 
> software for live
> > migration between platforms with different TSC frequecies.  
> Once found the
> > target host's frequency is different with source host's, 
> hypervisor will
> > trap and emulate guest's all rdtsc instructions with its expected
> > frequency.
> >     If hardware's TSC frequency is difffernt with guest's 
> exepcted freq,
> > guest may behave abnormally, eg. incorrect wallclock, soft 
> lockup, even
> > hang in some cases.  Therefore, this patchset is necessary 
> to avoid such
> > issues.
> > 
> > PATCH 0001-- Save guest's preferred TSC in image for 
> save/restore and
> > migration PATCH 0002-- Move multidiv64 as a library function.
> > PATCH 0003-- Scaling host TSC freqeuncy patch.
> 
> I think this needs to be a feature which is enabled/disabled 
> on a per VM basis (in the config file).
> 
> I'm not sure what the default should be. Windows VMs and 
> applications don't seem to much care about the TSC which is 
> an argument for leaving the default as it is at the moment. 
> However, one could argue that things that don't care about 
> the TSC aren't going to be reading it much, so the overhead 
> of making the default to scale the TSC shouldn't be too high.
> 
> Ian
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 15:45       ` Dan Magenheimer
@ 2009-06-18 16:04         ` Tim Deegan
  2009-06-18 20:07           ` Dan Magenheimer
  0 siblings, 1 reply; 35+ messages in thread
From: Tim Deegan @ 2009-06-18 16:04 UTC (permalink / raw)
  To: Dan Magenheimer
  Cc: Patrick Colp, Keir, xen-devel, Zhang, Xiantao, Keir Fraser

At 16:45 +0100 on 18 Jun (1245343507), Dan Magenheimer wrote:
> >       gdprintk(XENLOG_INFO, "Loaded VM expects a %"PRIu32"MHz TSC "
> >                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
> >                hdr->gtsc_khz / 1000, cpu_khz / 1000); 
> 
> RDTSC "exiting"?  Do you mean RDTSC emulation?

Meh.  "RDTSC exiting" is Intel's term for it in the SDMs.

> Also, frankly, given the potential performance ramifications, perhaps
> this should be higher than XENLOG_INFO?

Not if it's going to print that out every time I migrate a VM.  This
whole feature is something I'd turn off anyway.

Cheers,

Tim.

> > -----Original Message-----
> > From: Zhang, Xiantao [mailto:xiantao.zhang@intel.com]
> > Sent: Thursday, June 18, 2009 3:47 AM
> > To: Tim Deegan; Patrick Colp
> > Cc: Keir@acsinet12.oracle.com; xen-devel@lists.xensource.com; Fraser
> > Subject: RE: [Xen-devel] [PATCH] TSC scaling for live 
> > migration between
> > platforms with different TSC frequecies
> > 
> > 
> > Tim Deegan wrote:
> > > At 10:10 +0100 on 18 Jun (1245319857), Patrick Colp wrote:
> > >>> +        printk("Migrate to a platform with different 
> > freq:%ldMhz, "
> > >>> +            "expected freq:%dMhz, enable rdtsc exiting!\n",
> > >>> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
> > >> 
> > >> Being pedantic, this should probably be:
> > >> 
> > >>      printk("Migrate to a platform with different freq: %ldMHz, "
> > >>             "expected freq: %dMHz, enable rdtsc exiting!\n",
> > >>              cpu_khz / 1000, hdr->gtsc_khz / 1000);
> > > 
> > > Being _pedantic_, it should be
> > > 
> > >       gdprintk(XENLOG_INFO, "Loaded VM expects a %"PRIu32"MHz TSC "
> > >                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
> > >                hdr->gtsc_khz / 1000, cpu_khz / 1000); 
> > 
> > It maybe wonderful to add VM info (eg. Domain id) as Jan says 
> > in another mail. Thanks for your suggestions! I will change 
> > it in the final version!  :-)
> > Xiantao
> > 
> > 
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@lists.xensource.com
> > http://lists.xensource.com/xen-devel
> >

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Citrix Systems (R&D) Ltd.
[Company #02300071, SL9 0DZ, UK.]

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 15:58   ` Dan Magenheimer
@ 2009-06-18 16:45     ` John Levon
  2009-06-18 20:27       ` Dan Magenheimer
                         ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: John Levon @ 2009-06-18 16:45 UTC (permalink / raw)
  To: Dan Magenheimer; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

On Thu, Jun 18, 2009 at 08:58:49AM -0700, Dan Magenheimer wrote:

> Other apps (and/or the OS kernel) may use TSC to
> approximate the passage of time, and for these apps
> (and gettimeofday in the Linux kernel), this TSC scaling
> patch is a must.  Unfortunately, both kinds of apps could
> be running simultaneously on the same guest.  And
> in either case, RDTSC frequency may be quite high.

Certainly Solaris relies on the TSC for time-keeping, and uses it very
heavily. To the extent that I doubt it's even feasible to migrate to a
machine where scaling needs to be done, and such a migration should be
refused, since it would essentially kill the guest.

> question is:  If it is important to ALWAYS emulate RDTSC,
> can the Xen code be written to handle RDTSC emulation
> much faster?  If it could be made fast enough, the

I'd be amazed if this were possible.

regards
john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 16:04         ` Tim Deegan
@ 2009-06-18 20:07           ` Dan Magenheimer
  0 siblings, 0 replies; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 20:07 UTC (permalink / raw)
  To: Tim Deegan; +Cc: Patrick Colp, Keir, xen-devel, Zhang, Xiantao, Keir Fraser

> Meh.  "RDTSC exiting" is Intel's term for it in the SDMs.

Well Intel's choice of term in some obscure corner of
the SDM doesn't seem like a good choice when it is
misleading to mortals.  "Exiting" means "leaving" or
"quitting", so mortals probably would read this as
that RDTSC ("whatever that is" sez the mere mortal)
is shutting down.  Not to mention the fact that
"enabling XXX exiting" sounds like an oxymoron. :-)

> Not if it's going to print that out every time I
> migrate a VM.  This whole feature is something I'd
> turn off anyway.

You'd turn it off at your peril unless you are
certain all the potential migration target machines
in your data center have identical TSC rates
and/or all of your guests don't have any reliance
on TSC for keeping time.

The message should only print if you ARE migrating
between machines with different TSC rates, not for
every migration.  Since the consequence of turning
it on is a potentially large loss in performance,
a clear log message seems a small price to pay.

Dan

> -----Original Message-----
> From: Tim Deegan [mailto:Tim.Deegan@citrix.com]
> Sent: Thursday, June 18, 2009 10:04 AM
> To: Dan Magenheimer
> Cc: Patrick Colp; Keir@acsinet12.oracle.com;
> xen-devel@lists.xensource.com; Zhang, Xiantao; Keir Fraser
> Subject: Re: [Xen-devel] [PATCH] TSC scaling for live 
> migration between
> platforms with different TSC frequecies
> 
> 
> At 16:45 +0100 on 18 Jun (1245343507), Dan Magenheimer wrote:
> > >       gdprintk(XENLOG_INFO, "Loaded VM expects a 
> %"PRIu32"MHz TSC "
> > >                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
> > >                hdr->gtsc_khz / 1000, cpu_khz / 1000); 
> > 
> > RDTSC "exiting"?  Do you mean RDTSC emulation?
> 
> Meh.  "RDTSC exiting" is Intel's term for it in the SDMs.
> 
> > Also, frankly, given the potential performance 
> ramifications, perhaps
> > this should be higher than XENLOG_INFO?
> 
> Not if it's going to print that out every time I migrate a VM.  This
> whole feature is something I'd turn off anyway.
> 
> Cheers,
> 
> Tim.
> 
> > > -----Original Message-----
> > > From: Zhang, Xiantao [mailto:xiantao.zhang@intel.com]
> > > Sent: Thursday, June 18, 2009 3:47 AM
> > > To: Tim Deegan; Patrick Colp
> > > Cc: Keir@acsinet12.oracle.com; 
> xen-devel@lists.xensource.com; Fraser
> > > Subject: RE: [Xen-devel] [PATCH] TSC scaling for live 
> > > migration between
> > > platforms with different TSC frequecies
> > > 
> > > 
> > > Tim Deegan wrote:
> > > > At 10:10 +0100 on 18 Jun (1245319857), Patrick Colp wrote:
> > > >>> +        printk("Migrate to a platform with different 
> > > freq:%ldMhz, "
> > > >>> +            "expected freq:%dMhz, enable rdtsc exiting!\n",
> > > >>> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
> > > >> 
> > > >> Being pedantic, this should probably be:
> > > >> 
> > > >>      printk("Migrate to a platform with different 
> freq: %ldMHz, "
> > > >>             "expected freq: %dMHz, enable rdtsc exiting!\n",
> > > >>              cpu_khz / 1000, hdr->gtsc_khz / 1000);
> > > > 
> > > > Being _pedantic_, it should be
> > > > 
> > > >       gdprintk(XENLOG_INFO, "Loaded VM expects a 
> %"PRIu32"MHz TSC "
> > > >                "but CPU is %ldMHz; enabling RDTSC exiting.\n",
> > > >                hdr->gtsc_khz / 1000, cpu_khz / 1000); 
> > > 
> > > It maybe wonderful to add VM info (eg. Domain id) as Jan says 
> > > in another mail. Thanks for your suggestions! I will change 
> > > it in the final version!  :-)
> > > Xiantao
> > > 
> > > 
> > > _______________________________________________
> > > Xen-devel mailing list
> > > Xen-devel@lists.xensource.com
> > > http://lists.xensource.com/xen-devel
> > >
> 
> -- 
> Tim Deegan <Tim.Deegan@citrix.com>
> Principal Software Engineer, Citrix Systems (R&D) Ltd.
> [Company #02300071, SL9 0DZ, UK.]
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 16:45     ` John Levon
@ 2009-06-18 20:27       ` Dan Magenheimer
  2009-06-18 20:45         ` John Levon
  2009-06-18 23:49       ` Dong, Eddie
  2009-06-19  2:25       ` Zhang, Xiantao
  2 siblings, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 20:27 UTC (permalink / raw)
  To: John Levon; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

> > Other apps (and/or the OS kernel) may use TSC to
> > approximate the passage of time, and for these apps
> > (and gettimeofday in the Linux kernel), this TSC scaling
> > patch is a must.  Unfortunately, both kinds of apps could
> > be running simultaneously on the same guest.  And
> > in either case, RDTSC frequency may be quite high.
> 
> Certainly Solaris relies on the TSC for time-keeping, and uses it very
> heavily. To the extent that I doubt it's even feasible to migrate to a
> machine where scaling needs to be done, and such a migration should be
> refused, since it would essentially kill the guest.

Hmmm... any numbers?  Certainly Solaris isn't reading TSC much
more than a thousand times per second, is it?  Are you suggesting
that data centers running Solaris guests must segregate sets of
their machines by clock rate and disallow migrations
between the sets?
 
> > question is:  If it is important to ALWAYS emulate RDTSC,
> > can the Xen code be written to handle RDTSC emulation
> > much faster?  If it could be made fast enough, the
> 
> I'd be amazed if this were possible.

If it were PA-RISC or Itanium, I'd take on the challenge,
but I just don't know x86 well enough.  Are traps really
THAT expensive on x86?  (If max(TSC/sec/processor)~=1000 and
cycles/emulation~=5000, total degradation would be
less than 1%.  (Sounds high, but if the alternative is
clocks going haywire, seems a small price to pay. And
I expect the frequency and cost estimates (1000 and 5000)
are probably too high.)

Also, might turning RDTSC emulation on be much faster
on newer processors than old?

Dan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 20:27       ` Dan Magenheimer
@ 2009-06-18 20:45         ` John Levon
  2009-06-18 20:57           ` Dan Magenheimer
  0 siblings, 1 reply; 35+ messages in thread
From: John Levon @ 2009-06-18 20:45 UTC (permalink / raw)
  To: Dan Magenheimer; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

On Thu, Jun 18, 2009 at 01:27:23PM -0700, Dan Magenheimer wrote:

> > Certainly Solaris relies on the TSC for time-keeping, and uses it very
> > heavily. To the extent that I doubt it's even feasible to migrate to a
> > machine where scaling needs to be done, and such a migration should be
> > refused, since it would essentially kill the guest.
> 
> Hmmm... any numbers?  Certainly Solaris isn't reading TSC much

# dtrace -n 'fbt::tsc_gethrtime:entry /cpu == 0/ { @ = sum(1); }' -c "sleep 10"
dtrace: description 'fbt::tsc_gethrtime:entry ' matched 1 probe
dtrace: pid 29708 has exited

            27798

This is on a basically idle 8-way system. (The other CPUs are less busy.)

http://blogs.sun.com/eschrock/entry/microstate_accounting_in_solaris_10

> that data centers running Solaris guests must segregate sets of
> their machines by clock rate and disallow migrations
> between the sets?

Certainly for Solaris HVM that has to be the case until we make it use PV time
(presuming that is safe, which I'm not sure offhand).

> THAT expensive on x86?  (If max(TSC/sec/processor)~=1000 and
> cycles/emulation~=5000, total degradation would be
> less than 1%.  (Sounds high, but if the alternative is

At the end of the day, though, only testing will tell us for sure.

regards
john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 20:45         ` John Levon
@ 2009-06-18 20:57           ` Dan Magenheimer
  2009-06-18 21:00             ` John Levon
  0 siblings, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 20:57 UTC (permalink / raw)
  To: John Levon; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

> > Hmmm... any numbers?  Certainly Solaris isn't reading TSC much
> 
> # dtrace -n 'fbt::tsc_gethrtime:entry /cpu == 0/ { @ = 
> sum(1); }' -c "sleep 10"
> dtrace: description 'fbt::tsc_gethrtime:entry ' matched 1 probe
> dtrace: pid 29708 has exited
> 
>             27798
> 
> This is on a basically idle 8-way system. (The other CPUs are 
> less busy.)

Just checking... this is in 10 seconds and each processor is
"ticking" (and possibly a system-wide timer tick as well),
so this is ~350 rdtsc/sec/processor, correct?

>> that data centers running Solaris guests must segregate sets of
>> their machines by clock rate and disallow migrations
>> between the sets?

> Certainly for Solaris HVM that has to be the case until we make it use PV time
> (presuming that is safe, which I'm not sure offhand).

I believe PV is no more safe (and the proposed patch doesn't
work for PV).

>> THAT expensive on x86?  (If max(TSC/sec/processor)~=1000 and
>> cycles/emulation~=5000, total degradation would be
>> less than 1%.  (Sounds high, but if the alternative is

> At the end of the day, though, only testing will tell us for sure.

Yes indeed.  It would be nice if some x86 wizard could
spin up a best case for this and if someone with good
hardware measurement tools could compare current vs best
(as well as measure true instruction frequency as apps
might be rdtsc'ing directly).

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 20:57           ` Dan Magenheimer
@ 2009-06-18 21:00             ` John Levon
  2009-06-18 22:27               ` Dan Magenheimer
  2009-06-19  1:21               ` Zhang, Xiantao
  0 siblings, 2 replies; 35+ messages in thread
From: John Levon @ 2009-06-18 21:00 UTC (permalink / raw)
  To: Dan Magenheimer; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

On Thu, Jun 18, 2009 at 01:57:21PM -0700, Dan Magenheimer wrote:

> > # dtrace -n 'fbt::tsc_gethrtime:entry /cpu == 0/ { @ = 
> > sum(1); }' -c "sleep 10"
> > dtrace: description 'fbt::tsc_gethrtime:entry ' matched 1 probe
> > dtrace: pid 29708 has exited
> > 
> >             27798
> > 
> > This is on a basically idle 8-way system. (The other CPUs are 
> > less busy.)
> 
> Just checking... this is in 10 seconds and each processor is
> "ticking" (and possibly a system-wide timer tick as well),
> so this is ~350 rdtsc/sec/processor, correct?

No. That's CPU0 only ('cpu == 0'). Solaris only has one system-wide
timer tick. This is mstate accounting: every kernel/user boundary, every
interrupt, etc. incurs at least one TSC read. (And of course the machine
is idle.)

regards
john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 21:00             ` John Levon
@ 2009-06-18 22:27               ` Dan Magenheimer
  2009-06-19 13:36                 ` John Levon
  2009-06-19  1:21               ` Zhang, Xiantao
  1 sibling, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-18 22:27 UTC (permalink / raw)
  To: John Levon; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

> > Just checking... this is in 10 seconds and each processor is
> > "ticking" (and possibly a system-wide timer tick as well),
> > so this is ~350 rdtsc/sec/processor, correct?
> 
> No. That's CPU0 only ('cpu == 0'). Solaris only has one system-wide
> timer tick. This is mstate accounting: every kernel/user 
> boundary, every
> interrupt, etc. incurs at least one TSC read. (And of course 
> the machine
> is idle.)

Wow.  (and I repeat for emphasis) Wow.

Even when restricted to physical hardware, using the TSC
for such purposes seems ill-advised.  In a virtual data
center, the data will be often useless.

Is mstate accounting used for anything other than providing
interesting performance data if one cares to look at it?
Does mstate accounting ignore negative values for delta TSC?

Well, even a few thousand RDTSC/second is not too bad
if RDTSC emulation can be brought in under a couple thousand
cycles.  (I don't know that it can, but I also don't know
that it can't.)

Dan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 16:45     ` John Levon
  2009-06-18 20:27       ` Dan Magenheimer
@ 2009-06-18 23:49       ` Dong, Eddie
  2009-06-19  2:25       ` Zhang, Xiantao
  2 siblings, 0 replies; 35+ messages in thread
From: Dong, Eddie @ 2009-06-18 23:49 UTC (permalink / raw)
  To: John Levon, Dan Magenheimer
  Cc: Ian Pratt, xen-devel, Dong, Eddie, Keir Fraser, Zhang, Xiantao


>> question is:  If it is important to ALWAYS emulate RDTSC,
>> can the Xen code be written to handle RDTSC emulation
>> much faster?  If it could be made fast enough, the

This is in our plan. The perf overhead of sysbench.oltp could be ~10% due to rdtsc exiting. But even with optimization, we still need this software scaling as a fall back.

>I'd be amazed if this were possible.

We posted the idea several weeks ago. See http://lists.xensource.com/archives/html/xen-devel/2009-04/msg00890.html.
Will that impact Solaris?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 21:00             ` John Levon
  2009-06-18 22:27               ` Dan Magenheimer
@ 2009-06-19  1:21               ` Zhang, Xiantao
  2009-06-19 13:54                 ` John Levon
  1 sibling, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-19  1:21 UTC (permalink / raw)
  To: John Levon, Dan Magenheimer; +Cc: Ian Pratt, xen-devel, Keir Fraser

John Levon wrote:
> On Thu, Jun 18, 2009 at 01:57:21PM -0700, Dan Magenheimer wrote:
> 
>>> # dtrace -n 'fbt::tsc_gethrtime:entry /cpu == 0/ { @ =
>>> sum(1); }' -c "sleep 10"
>>> dtrace: description 'fbt::tsc_gethrtime:entry ' matched 1 probe
>>> dtrace: pid 29708 has exited 
>>> 
>>>             27798
>>> 
>>> This is on a basically idle 8-way system. (The other CPUs are
>>> less busy.)
>> 
>> Just checking... this is in 10 seconds and each processor is
>> "ticking" (and possibly a system-wide timer tick as well),
>> so this is ~350 rdtsc/sec/processor, correct?
> 
> No. That's CPU0 only ('cpu == 0'). Solaris only has one system-wide
> timer tick. This is mstate accounting: every kernel/user boundary,
> every interrupt, etc. incurs at least one TSC read. (And of course
> the machine is idle.)

So the rdtsc rate in the system is 2779.8/s per your testing? If so, the performance impact can be ignored. We had done performance testing with sysbench oltp, and in that testing the rdtsc rate exceeds 120000 rdtsc/sec, but even in such an extreme case the performance loss is still less than 10%.  In addition, we also measured the emulation cost, and the result shows rdtsc can be done in 1500-1800 cycles in the emulation case.
Xiantao 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 10:56 ` Ian Pratt
  2009-06-18 15:58   ` Dan Magenheimer
@ 2009-06-19  1:34   ` Zhang, Xiantao
  1 sibling, 0 replies; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-19  1:34 UTC (permalink / raw)
  To: Ian Pratt, Keir Fraser; +Cc: xen-devel

Ian Pratt wrote:
>>     This patchset targets for enabling TSC scaling in software for
>> live migration between platforms with different TSC frequecies. 
>> Once found the target host's frequency is different with source
>> host's, hypervisor will trap and emulate guest's all rdtsc
>>     instructions with its expected frequency. If hardware's TSC
>> frequency is difffernt with guest's exepcted freq, guest may behave
>> abnormally, eg. incorrect wallclock, soft lockup, even hang in some
>> cases.  Therefore, this patchset is necessary to avoid such issues. 
>> 
>> PATCH 0001-- Save guest's preferred TSC in image for save/restore and
>> migration PATCH 0002-- Move multidiv64 as a library function.
>> PATCH 0003-- Scaling host TSC freqeuncy patch.
> 
> I think this needs to be a feature which is enabled/disabled on a per
> VM basis (in the config file). 
> 
> I'm not sure what the default should be. Windows VMs and applications
> don't seem to much care about the TSC which is an argument for
> leaving the default as it is at the moment. However, one could argue
> that things that don't care about the TSC aren't going to be reading
> it much, so the overhead of making the default to scale the TSC
> shouldn't be too high.

Hi, Ian
   We also considered adding an option to disable/enable this feature. But this logic is only effective after migration, while the option would have to be determined at guest creation stage, so it may be useless for most cases.  Even if the Windows system doesn't care much about the TSC, we can't say applications don't use it at all. I assume Windows's timing mechanism also needs to build the relationship between the TSC and the time source, and if that assumption is right, a guest's applications may behave abnormally at a different TSC frequency.  Thanks! :-)
Xiantao

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration betweenplatforms with different TSC frequecies
  2009-06-18 15:40     ` Dan Magenheimer
@ 2009-06-19  1:48       ` Zhang, Xiantao
  0 siblings, 0 replies; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-19  1:48 UTC (permalink / raw)
  To: Dan Magenheimer, Jan Beulich; +Cc: xen-devel, Keir Fraser

Dan Magenheimer wrote:
>> I am still confused by opt_softtsc check here. If want to use
>> platform timer to emulate guest's tsc, hvm_set_guest_tsc
>> should also need  perform this check to get correct
>> cache_tsc_offset, but I didn't see it.   A bug ?
>> If use host's tsc to emulate guest's tsc, the check is
>> useless, so I removed it in my patch.  Maybe we need Dan's
>> explanation about the check here to determin whether keep it or not.
> 
> Please do keep the opt_softtsc check.  I agree that there
> is a bug, that hvm_set_guest_tsc should check as well.
> IIRC, my guest never set the TSC.

Okay, for general cases, we also need the check for opt_softtsc in hvm_set_guest_tsc; otherwise the system may confuse its TSC after its set-TSC operation. 


> The softtsc option is for handling skew problems
> not scaling/migration problems but should probably
> be updated to handle your TSC scaling as well.

Okay, I will add the corresponding logic to handle it.  A question: according to the logic, the TSC frequency should be equal to 10^9 once opt_softtsc is set, right?

> http://lists.xensource.com/archives/html/xen-devel/2008-07/msg00495.html
> 
> Thanks,
> Dan
> 
>> -----Original Message-----
>> From: Zhang, Xiantao [mailto:xiantao.zhang@intel.com]
>> Sent: Thursday, June 18, 2009 2:52 AM
>> To: Jan Beulich; Dan Magenheimer
>> Cc: Keir Fraser; xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel] [PATCH] TSC scaling for live migration
>> betweenplatforms with different TSC frequecies
>> 
>> 
>> Jan Beulich wrote:
>>>>>> "Zhang, Xiantao" <xiantao.zhang@intel.com> 18.06.09 04:56 >>>
>>>> PATCH 0003-- Scaling host TSC freqeuncy patch.
>>> 
>>>> +int hvm_gtsc_need_scale(struct domain *d)
>>>> +{
>>>> +    uint32_t gtsc_khz;
>>>> +
>>>> +    gtsc_khz = d->arch.hvm_domain.gtsc_khz / 1000;
>>> 
>>> Can the variable please be renamed to what it contains (i.e.
>>> gtsc_mhz)? 
>>> 
>>>> u64 hvm_get_guest_tsc(struct vcpu *v)
>>>> {
>>>> -    u64 host_tsc;
>>>> -
>>>> -    if ( opt_softtsc )
>>>> -        host_tsc = hvm_get_guest_time(v);
>>>> -    else
>>>> -        rdtscll(host_tsc);
>>>> -
>>>> -    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
>>>> +    u64 host_tsc, scaled_htsc;
>>>> +
>>>> +    rdtscll(host_tsc);
>>>> +    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
>>>> +
>>>> +    return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset; }
>>>> 
>>>> void hvm_migrate_timers(struct vcpu *v)
>>> 
>>> I'm getting the impression that the opt_softtsc functionality got
>>> lost here.
>> 
>> I am still confused by opt_softtsc check here. If want to use
>> platform timer to emulate guest's tsc, hvm_set_guest_tsc
>> should also need  perform this check to get correct
>> cache_tsc_offset, but I didn't see it.   A bug ?
>> If use host's tsc to emulate guest's tsc, the check is
>> useless, so I removed it in my patch.  Maybe we need Dan's
>> explanation about the check here to determin whether keep it or not.
>> 
>>> 
>>>> +        printk("Migrate to a platform with different freq:%ldMhz,
>>>> " +            "expected freq:%dMhz, enable rdtsc exiting!\n",
>>>> +                    cpu_khz / 1000, hdr->gtsc_khz / 1000);
>>> 
>>> gdprintk()? At least, I think, any guest related printk-s should
>>> identify which guest they're about.
>> 
>> Added in the attached patch. Thanks!
>> Xiantao

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 16:45     ` John Levon
  2009-06-18 20:27       ` Dan Magenheimer
  2009-06-18 23:49       ` Dong, Eddie
@ 2009-06-19  2:25       ` Zhang, Xiantao
  2009-06-19 13:53         ` John Levon
  2 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-19  2:25 UTC (permalink / raw)
  To: John Levon, Dan Magenheimer; +Cc: Ian Pratt, xen-devel, Keir Fraser

John Levon wrote:
> On Thu, Jun 18, 2009 at 08:58:49AM -0700, Dan Magenheimer wrote:
> 
>> Other apps (and/or the OS kernel) may use TSC to
>> approximate the passage of time, and for these apps
>> (and gettimeofday in the Linux kernel), this TSC scaling
>> patch is a must.  Unfortunately, both kinds of apps could
>> be running simultaneously on the same guest.  And
>> in either case, RDTSC frequency may be quite high.
> 
> Certainly Solaris relies on the TSC for time-keeping, and uses it very
> heavily. To the extent that I doubt it's even feasible to migrate to a
> machine where scaling needs to be done, and such a migration should be
> refused, since it would essentially kill the guest.
> 
>> question is:  If it is important to ALWAYS emulate RDTSC,
>> can the Xen code be written to handle RDTSC emulation
>> much faster?  If it could be made fast enough, the
> 
> I'd be amazed if this were possible.

Actually, we have done some optimizations that make most rdtsc instructions not trap to the hypervisor while always keeping the TSC monotonically increasing on each virtual processor, but it is hard to solve the TSC drift issue between all vcpus.  In our solution, TSC drift may exceed 100000 cycles. TSC drift may also exist on real platforms, but perhaps not as large as 1000000 cycles. Do you know the limit of TSC drift that an OS or applications can bear? At least Linux doesn't care much about the drift, but we have no idea about applications' behavior under such large drift.
Xiantao

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18 22:27               ` Dan Magenheimer
@ 2009-06-19 13:36                 ` John Levon
  0 siblings, 0 replies; 35+ messages in thread
From: John Levon @ 2009-06-19 13:36 UTC (permalink / raw)
  To: Dan Magenheimer; +Cc: Ian Pratt, xen-devel, Keir Fraser, Zhang, Xiantao

On Thu, Jun 18, 2009 at 03:27:54PM -0700, Dan Magenheimer wrote:

> Even when restricted to physical hardware, using the TSC
> for such purposes seems ill-advised.

In practice it's not so bad, if you only do power management on P-state
invariant TSC CPUs, and disable C1 clock ramping. I'm sure there are all
sorts of fun caveats, but I don't think we've had many practical
problems.

> In a virtual data center, the data will be often useless.

It won't be happy across different machines indeed.

We haven't retested past 3.1, but the PV timer isn't even monotonic in
SMP guests. We have to global-sync to get one.

You mentioned the PV timer can't handle migration - why doesn't
tsc_to_system_mul account for it? If ever a subsystem badly needed a
detailed write-up...

> Is mstate accounting used for anything other than providing
> interesting performance data if one cares to look at it?

You make it sounds like that isn't critically important :)

> Does mstate accounting ignore negative values for delta TSC?

No, the system time is assumed monotonic (it's not in TSC units). The
TSC code
(http://src.opensolaris.org/source/xref/onnv/onnv-gate/usr/src/uts/i86pc/os/timestamp.c)
is expected to provide monotonicity across all CPUs. And /that/ code
assumes there's no inter-CPU drift. Deltas are allowed, but of course
HVM assumes that Xen has dealt with that (since it can't possible
compute deltas between VCPUs).

All this stuff is painful. What I wouldn't give for a single cheap
monotonic timer source that worked under all circumstances.

regards
john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-19  2:25       ` Zhang, Xiantao
@ 2009-06-19 13:53         ` John Levon
  2009-06-19 15:07           ` Zhang, Xiantao
  0 siblings, 1 reply; 35+ messages in thread
From: John Levon @ 2009-06-19 13:53 UTC (permalink / raw)
  To: Zhang, Xiantao; +Cc: Dan Magenheimer, xen-devel, Ian Pratt, Keir Fraser

On Fri, Jun 19, 2009 at 10:25:36AM +0800, Zhang, Xiantao wrote:

> Actually, we had done some optimizations and make most of rdtsc not
> trap to hypervisor and always keep tsc monotonically increasing in
> each virtual processor, but it is hard to solve the tsc drift issue
> between all vcpus.  In our solution, tsc drift may exceeds 100000
> cycles. You know, TSC drift maybe also exist in real platforms, but
> maybe not quite large like 1000000 cycles. Do you know what's the

I'm not sure what you mean by "drift of 1000000 cycles". In what time
period? Or are you talking about skew (a constant difference between TSC
values read between CPUs) ? Solaris handles arbitrary skew (I think) but
that's only accounted for at boot time. A fudge factor (tsc_max_delta)
accounts for any minor post-calibration deltas.

On HVM or VMWare we don't even try, since we can't possibly know the
real CPUs skew: the assumption is the VM platform has already done this
for us. And at least Xen attempts to sync up the physical CPUs.

Significant drift (where different CPUs are ticking at different rates)
is bad news, and can easily lead to non-monotonicity. I don't know what
"significant" means though, unfortunately.

Finally, a change across all CPUs in the tsc tick rate (so no drift, but
a sudden change after, say, a migration) is also bad news. Solaris used
to recalibrate the scale rate once a second, but that was removed.

All this is HVM/metal code of course.

regards
john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-19  1:21               ` Zhang, Xiantao
@ 2009-06-19 13:54                 ` John Levon
  0 siblings, 0 replies; 35+ messages in thread
From: John Levon @ 2009-06-19 13:54 UTC (permalink / raw)
  To: Zhang, Xiantao; +Cc: Dan Magenheimer, xen-devel, Ian Pratt, Keir Fraser

On Fri, Jun 19, 2009 at 09:21:55AM +0800, Zhang, Xiantao wrote:

> > No. That's CPU0 only ('cpu == 0'). Solaris only has one system-wide
> > timer tick. This is mstate accounting: every kernel/user boundary,
> > every interrupt, etc. incurs at least one TSC read. (And of course
> > the machine is idle.)
> 
> So the rdtsc rate in the system is 2779.8/s per your testing ?

No, the rdtsc rate on a single CPU on an idle system on one machine was
around that :)

> If so, the performance impact can be ignored. We had done the
> performance testing with sysbench oltp, and in the testing the rdtsc
> rate exceeds 120000 rdtsc/sec, but even in such extreme case
> performance loss is still less than 10%.  In addition, we also measured
> the emulation cost, and the result shows rdtsc can be done in 1500-1800
> cycles in the emulation case.

It would be really good to see some Solaris perf results.

regards
john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-19 13:53         ` John Levon
@ 2009-06-19 15:07           ` Zhang, Xiantao
  2009-06-19 20:44             ` Dan Magenheimer
  0 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-19 15:07 UTC (permalink / raw)
  To: John Levon; +Cc: Dan Magenheimer, xen-devel, Ian Pratt, Keir Fraser

John Levon wrote:
> On Fri, Jun 19, 2009 at 10:25:36AM +0800, Zhang, Xiantao wrote:
> 
>> Actually, we had done some optimizations and make most of rdtsc not
>> trap to hypervisor and always keep tsc monotonically increasing in
>> each virtual processor, but it is hard to solve the tsc drift issue
>> between all vcpus.  In our solution, tsc drift may exceeds 100000
>> cycles. You know, TSC drift maybe also exist in real platforms, but
>> maybe not quite large like 1000000 cycles. Do you know what's the
> 
> I'm not sure what you mean by "drift of 1000000 cycles". In what time
> period? Or are you talking about skew (a constant difference between
> TSC values read between CPUs) ? Solaris handles arbitrary skew (I
> think) but that's only accounted for at boot time. A fudge factor
> (tsc_max_delta) accounts for any minor post-calibration deltas.

I mean inter-CPU TSC drift at any time. In our solution, we always keep the guest's TSC across all vcpus synced to its expected TSC within 1 ms or 0.5 ms.  That is to say, every vcpu's TSC monotonically increases, but a little drift (10^5 ~ 10^6 cycles) may be generated due to unsynchronized vm exits of the vcpus. 


> On HVM or VMWare we don't even try, since we can't possibly know the
> real CPUs skew: the assumption is the VM platform has already done
> this for us. And at least Xen attempts to sync up the physical CPUs.
> Significant drift (where different CPUs are ticking at different
> rates) is bad news, and can easily lead to non-monotonicity. I don't
> know what "significant" means though, unfortunately.

We can guarantee each vcpu's TSC is increasing monotonically, but there may be some difference between vcpus.  I am not sure whether 10^5 cycles is significant, but it should exceed a stable hardware's drift in general. 


> Finally, a change across all CPUs in the tsc tick rate (so no drift,
> but a sudden change after, say, a migration) is also bad news.
> Solaris used to recalibrate the scale rate once a second, but that
> was removed.

Bad news. 

> All this is HVM/metal code of course.
> 
> regards
> john

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-19 15:07           ` Zhang, Xiantao
@ 2009-06-19 20:44             ` Dan Magenheimer
  2009-06-22  1:38               ` Zhang, Xiantao
  0 siblings, 1 reply; 35+ messages in thread
From: Dan Magenheimer @ 2009-06-19 20:44 UTC (permalink / raw)
  To: Zhang, Xiantao, John Levon; +Cc: Ian Pratt, xen-devel, Keir Fraser

> > On HVM or VMWare we don't even try, since we can't possibly know the
> > real CPUs skew: the assumption is the VM platform has already done
> > this for us. And at least Xen attempts to sync up the physical CPUs.
> > Significant drift (where different CPUs are ticking at different
> > rates) is bad news, and can easily lead to non-monotonicity. I don't
> > know what "significant" means though, unfortunately.
> 
> We can guanrantee each vcpu's TSC is increasing 
> monotonically, but there maybe some diff between vcpus.  I am 
> not sure 10^5 cycles is significant, but it should exceed a 
> stable hardware's drift in general. 

Let me attempt to define "significant":

Assume that two kernel- or user-threads are able to synchronize
such that they can guarantee execution order.  If:

1) thread A reads TSC, and then
2) thread A and thread B sync to guarantee ordering, and then
3) thread B reads TSC, but
4) thread B's TSC value is less than thread A's TSC value

then the TSC skew is "significant".

If thread A and thread B are for example using TSC values
to timestamp journal transactions, then transaction guarantees
will not be valid.

So the question becomes: What is the smallest number of
cycles that are required to allow thread A and thread B
to synchronize for ordering?

I assert that this value is low enough _in theory_ that
only full TSC emulation can guarantee the proper result.
In _practice_, I don't know.  But I suspect that
it is much lower than 10^5 cycles.

Dan

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-19 20:44             ` Dan Magenheimer
@ 2009-06-22  1:38               ` Zhang, Xiantao
  0 siblings, 0 replies; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-22  1:38 UTC (permalink / raw)
  To: Dan Magenheimer, John Levon; +Cc: Ian Pratt, xen-devel, Keir Fraser

Dan Magenheimer wrote:
>>> On HVM or VMWare we don't even try, since we can't possibly know the
>>> real CPUs skew: the assumption is the VM platform has already done
>>> this for us. And at least Xen attempts to sync up the physical CPUs.
>>> Significant drift (where different CPUs are ticking at different
>>> rates) is bad news, and can easily lead to non-monotonicity. I don't
>>> know what "significant" means though, unfortunately.
>> 
>> We can guanrantee each vcpu's TSC is increasing
>> monotonically, but there maybe some diff between vcpus.  I am
>> not sure 10^5 cycles is significant, but it should exceed a
>> stable hardware's drift in general.
> 
> Let me attempt to define "significant":
> 
> Assume that two kernel- or user-threads are able to synchronize
> such that they can guarantee execution order.  If:
> 
> 1) thread A reads TSC, and then
> 2) thread A and thread B sync to guarantee ordering, and then
> 3) thread B reads TSC, but
> 4) thread B's TSC value is less than thread A's TSC value
> 
> then the TSC skew is "significant".
> 
> If thread A and thread B are for example using TSC values
> to timestamp journal transactions, then transaction guarantees
> will not be valid.

Agree.

> So the question becomes: What is the smallest number of
> cycles that are required to allow thread A and thread B
> to synchronize for ordering?

This is the key point in determining whether we can perform further optimization.  If the skew between vcpus can't be ignored, we have no fast way to handle it and have to resort to TSC emulation and suffer the performance loss. 
But anyway, the TSC emulation method should be the first step to go. 

> I assert that this value is low enough _in theory_ that
> only full TSC emulation can guarantee the proper result.
> In _practice_, I don't know.  But I suspect that
> it is much lower than 10^5 cycles.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-18  2:56 [PATCH] TSC scaling for live migration between platforms with different TSC frequecies Zhang, Xiantao
                   ` (3 preceding siblings ...)
  2009-06-18 10:56 ` Ian Pratt
@ 2009-06-22  5:14 ` Zhang, Xiantao
  2009-06-23 10:18   ` Keir Fraser
  4 siblings, 1 reply; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-22  5:14 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Dan Magenheimer, xen-devel, Tim Deegan, Levon, Ian Pratt, John

[-- Attachment #1: Type: text/plain, Size: 998 bytes --]

Hi, Keir
	This is the new version which has addressed the comments from the mailing list. Please review it again.  Thanks!
Xiantao

Zhang, Xiantao wrote:
> Hi, Keir
> 
>     This patchset targets for enabling TSC scaling in software for
>     live migration between platforms with different TSC frequecies. 
> Once found the target host's frequency is different with source
> host's, hypervisor will trap and emulate guest's all rdtsc
> instructions with its expected frequency. If hardware's TSC frequency
> is difffernt with guest's exepcted freq, guest may behave abnormally,
> eg. incorrect wallclock, soft lockup, even hang in some cases. 
> Therefore, this patchset is necessary to avoid such issues.      
> 
> PATCH 0001-- Save guest's preferred TSC in image for save/restore and
> migration 
> PATCH 0002-- Move multidiv64 as a library function.
> PATCH 0003-- Scaling host TSC freqeuncy patch.
> 
> Signed-off-by Xiantao Zhang <xiantao.zhang@intel.com>
> Xiantao


[-- Attachment #2: 0003-scaling_guest_tsc.patch --]
[-- Type: application/octet-stream, Size: 6584 bytes --]

# HG changeset patch
# User root@localhost.localdomain
# Date 1245637435 14400
# Node ID 5d96e4d8bd4b0f0cfa17230a1958fa60bca5ba7d
# Parent  09f3065668f20ebc1d6424d3c051c352baeba618
Scale the guest's TSC when the target machine's frequency is different from its requirement.

Use trap & emulate for each of the guest's rdtsc instructions first; maybe it can be optimized later.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>

diff -r 09f3065668f2 -r 5d96e4d8bd4b xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/hvm.c	Sun Jun 21 22:23:55 2009 -0400
@@ -139,26 +139,83 @@ uint8_t hvm_combine_hw_exceptions(uint8_
     return TRAP_double_fault;
 }
 
+void hvm_enable_rdtsc_exiting(struct domain *d)
+{
+    struct vcpu *v;
+
+    if ( opt_softtsc )
+        return;
+
+    for_each_vcpu ( d, v ) {
+        if ( hvm_funcs.enable_rdtsc_exiting )
+            hvm_funcs.enable_rdtsc_exiting(v);
+    }
+}
+
+int hvm_gtsc_need_scale(struct domain *d)
+{
+    uint32_t gtsc_mhz, htsc_mhz;
+
+    gtsc_mhz = d->arch.hvm_domain.gtsc_khz / 1000;
+
+    if ( opt_softtsc )
+        htsc_mhz = 1000;
+    else
+        htsc_mhz = (uint32_t)cpu_khz / 1000;
+
+    if ( gtsc_mhz && gtsc_mhz != htsc_mhz  ) {
+        d->arch.hvm_domain.tsc_scaled = 1;
+        return 1;
+    }
+
+    d->arch.hvm_domain.tsc_scaled = 0;
+    return 0;
+}
+
+static u64 hvm_h2g_scale_tsc(struct vcpu *v, u64 host_tsc)
+{
+    u32 gtsc_khz, htsc_khz, scaled_htsc = host_tsc;
+
+    if ( v->domain->arch.hvm_domain.tsc_scaled ) {
+        if( opt_softtsc )
+            htsc_khz = 1000000;
+        else
+            htsc_khz = cpu_khz;
+
+        gtsc_khz = v->domain->arch.hvm_domain.gtsc_khz;
+        scaled_htsc = muldiv64(host_tsc, gtsc_khz, htsc_khz);
+    }
+
+    return scaled_htsc;
+}
+
 void hvm_set_guest_tsc(struct vcpu *v, u64 guest_tsc)
 {
-    u64 host_tsc;
-
-    rdtscll(host_tsc);
-
-    v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - host_tsc;
-    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
-}
-
-u64 hvm_get_guest_tsc(struct vcpu *v)
-{
-    u64 host_tsc;
+    u64 host_tsc, scaled_htsc;
 
     if ( opt_softtsc )
         host_tsc = hvm_get_guest_time(v);
     else
         rdtscll(host_tsc);
 
-    return host_tsc + v->arch.hvm_vcpu.cache_tsc_offset;
+    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
+
+    v->arch.hvm_vcpu.cache_tsc_offset = guest_tsc - scaled_htsc;
+    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset);
+}
+
+u64 hvm_get_guest_tsc(struct vcpu *v)
+{
+    u64 host_tsc, scaled_htsc;
+
+    if ( opt_softtsc )
+        host_tsc = hvm_get_guest_time(v); 
+    else
+        rdtscll(host_tsc);
+
+    scaled_htsc = hvm_h2g_scale_tsc(v, host_tsc);
+
+    return scaled_htsc + v->arch.hvm_vcpu.cache_tsc_offset;
 }
 
 void hvm_migrate_timers(struct vcpu *v)
diff -r 09f3065668f2 -r 5d96e4d8bd4b xen/arch/x86/hvm/save.c
--- a/xen/arch/x86/hvm/save.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/save.c	Sun Jun 21 22:23:55 2009 -0400
@@ -63,6 +63,15 @@ int arch_hvm_load(struct domain *d, stru
     /* Restore guest's preferred TSC frequency. */
     d->arch.hvm_domain.gtsc_khz = hdr->gtsc_khz;
 
+    if ( hdr->gtsc_khz && hvm_gtsc_need_scale(d) ) {
+        hvm_enable_rdtsc_exiting(d);
+
+        gdprintk(XENLOG_WARNING, "Loading VM(id:%d) expects freq: %dmHz, "
+                "but host's freq :%"PRIu64"mHz, trap and emulate rdtsc!!!\n",
+                d->domain_id, hdr->gtsc_khz / 1000, opt_softtsc ? 1000 :
+                cpu_khz / 1000);
+    }
+
     /* VGA state is not saved/restored, so we nobble the cache. */
     d->arch.hvm_domain.stdvga.cache = 0;
 
diff -r 09f3065668f2 -r 5d96e4d8bd4b xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/vmx/vmx.c	Sun Jun 21 22:23:55 2009 -0400
@@ -946,6 +946,14 @@ static void vmx_set_tsc_offset(struct vc
     vmx_vmcs_exit(v);
 }
 
+static void vmx_enable_rdtsc_exiting(struct vcpu *v)
+{
+    vmx_vmcs_enter(v);
+    v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+     vmx_vmcs_exit(v);
+ }
+
 void do_nmi(struct cpu_user_regs *);
 
 static void vmx_init_hypercall_page(struct domain *d, void *hypercall_page)
@@ -1371,7 +1379,8 @@ static struct hvm_function_table vmx_fun
     .msr_write_intercept  = vmx_msr_write_intercept,
     .invlpg_intercept     = vmx_invlpg_intercept,
     .set_uc_mode          = vmx_set_uc_mode,
-    .set_info_guest       = vmx_set_info_guest
+    .set_info_guest       = vmx_set_info_guest,
+    .enable_rdtsc_exiting = vmx_enable_rdtsc_exiting
 };
 
 static unsigned long *vpid_bitmap;
diff -r 09f3065668f2 -r 5d96e4d8bd4b xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/arch/x86/hvm/vpt.c	Sun Jun 21 22:23:55 2009 -0400
@@ -33,7 +33,12 @@ void hvm_init_guest_time(struct domain *
     pl->stime_offset = -(u64)get_s_time();
     pl->last_guest_time = 0;
 
-    d->arch.hvm_domain.gtsc_khz = cpu_khz;
+    if (opt_softtsc)
+        d->arch.hvm_domain.gtsc_khz = 1000000;
+    else
+        d->arch.hvm_domain.gtsc_khz = cpu_khz;
+
+    d->arch.hvm_domain.tsc_scaled = 0;
 }
 
 u64 hvm_get_guest_time(struct vcpu *v)
diff -r 09f3065668f2 -r 5d96e4d8bd4b xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/include/asm-x86/hvm/domain.h	Sun Jun 21 22:23:55 2009 -0400
@@ -45,7 +45,7 @@ struct hvm_domain {
     struct hvm_ioreq_page  buf_ioreq;
 
     uint32_t               gtsc_khz; /* kHz */
-    uint32_t               pad0;
+    uint32_t               tsc_scaled;
     struct pl_time         pl_time;
 
     struct hvm_io_handler  io_handler;
diff -r 09f3065668f2 -r 5d96e4d8bd4b xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h	Tue Jun 16 23:04:57 2009 -0400
+++ b/xen/include/asm-x86/hvm/hvm.h	Sun Jun 21 22:23:55 2009 -0400
@@ -129,6 +129,7 @@ struct hvm_function_table {
     void (*invlpg_intercept)(unsigned long vaddr);
     void (*set_uc_mode)(struct vcpu *v);
     void (*set_info_guest)(struct vcpu *v);
+    void (*enable_rdtsc_exiting)(struct vcpu *v);
 };
 
 extern struct hvm_function_table hvm_funcs;
@@ -282,6 +283,9 @@ int hvm_event_needs_reinjection(uint8_t 
 
 uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2);
 
+void hvm_enable_rdtsc_exiting(struct domain *d);
+int hvm_gtsc_need_scale(struct domain *d);
+
 static inline int hvm_cpu_up(void)
 {
     if ( hvm_funcs.cpu_up )

[-- Attachment #3: 0001-save_tsc_frequency.patch --]
[-- Type: application/octet-stream, Size: 3964 bytes --]

# HG changeset patch
# User root@localhost.localdomain
# Date 1245206466 14400
# Node ID 4015e09394c1857a6e68e6d6909d11cf5ecba241
# Parent  f8187a343ad2bdbfe3166d7ee7e3d55a9f157fdc
save/restore : Save guest's preferred TSC frequency in image

For save/restore or live migration between two platforms with different
frequencies, the guest's preferred TSC frequency is required to calculate the guest's TSC after restore, so save it in the image header.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>

diff -r f8187a343ad2 -r 4015e09394c1 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/arch/x86/hvm/i8254.c	Tue Jun 16 22:41:06 2009 -0400
@@ -481,8 +481,6 @@ void pit_init(struct vcpu *v, unsigned l
     register_portio_handler(v->domain, PIT_BASE, 4, handle_pit_io);
     register_portio_handler(v->domain, 0x61, 1, handle_speaker_io);
 
-    ticks_per_sec(v) = cpu_khz * (int64_t)1000;
-
     pit_reset(v->domain);
 }
 
diff -r f8187a343ad2 -r 4015e09394c1 xen/arch/x86/hvm/save.c
--- a/xen/arch/x86/hvm/save.c	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/arch/x86/hvm/save.c	Tue Jun 16 22:41:06 2009 -0400
@@ -32,7 +32,8 @@ void arch_hvm_save(struct domain *d, str
     cpuid(1, &eax, &ebx, &ecx, &edx);
     hdr->cpuid = eax;
 
-    hdr->pad0 = 0;
+    /* Save guest's preferred TSC. */
+    hdr->gtsc_khz = d->arch.hvm_domain.gtsc_khz;
 }
 
 int arch_hvm_load(struct domain *d, struct hvm_save_header *hdr)
@@ -59,6 +60,9 @@ int arch_hvm_load(struct domain *d, stru
         gdprintk(XENLOG_WARNING, "HVM restore: saved CPUID (%#"PRIx32") "
                "does not match host (%#"PRIx32").\n", hdr->cpuid, eax);
 
+    /* Restore guest's preferred TSC frequency. */
+    d->arch.hvm_domain.gtsc_khz = hdr->gtsc_khz;
+
     /* VGA state is not saved/restored, so we nobble the cache. */
     d->arch.hvm_domain.stdvga.cache = 0;
 
diff -r f8187a343ad2 -r 4015e09394c1 xen/arch/x86/hvm/vpt.c
--- a/xen/arch/x86/hvm/vpt.c	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/arch/x86/hvm/vpt.c	Tue Jun 16 22:41:06 2009 -0400
@@ -32,6 +32,8 @@ void hvm_init_guest_time(struct domain *
     spin_lock_init(&pl->pl_time_lock);
     pl->stime_offset = -(u64)get_s_time();
     pl->last_guest_time = 0;
+
+    d->arch.hvm_domain.gtsc_khz = cpu_khz;
 }
 
 u64 hvm_get_guest_time(struct vcpu *v)
diff -r f8187a343ad2 -r 4015e09394c1 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/include/asm-x86/hvm/domain.h	Tue Jun 16 22:41:06 2009 -0400
@@ -44,7 +44,8 @@ struct hvm_domain {
     struct hvm_ioreq_page  ioreq;
     struct hvm_ioreq_page  buf_ioreq;
 
-    s64                    tsc_frequency;
+    uint32_t               gtsc_khz; /* kHz */
+    uint32_t               pad0;
     struct pl_time         pl_time;
 
     struct hvm_io_handler  io_handler;
diff -r f8187a343ad2 -r 4015e09394c1 xen/include/asm-x86/hvm/vpt.h
--- a/xen/include/asm-x86/hvm/vpt.h	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/include/asm-x86/hvm/vpt.h	Tue Jun 16 22:41:06 2009 -0400
@@ -136,8 +136,6 @@ struct pl_time {    /* platform time */
     spinlock_t pl_time_lock;
 };
 
-#define ticks_per_sec(v) (v->domain->arch.hvm_domain.tsc_frequency)
-
 void pt_save_timer(struct vcpu *v);
 void pt_restore_timer(struct vcpu *v);
 void pt_update_irq(struct vcpu *v);
diff -r f8187a343ad2 -r 4015e09394c1 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h	Fri Feb 20 17:02:36 2009 +0000
+++ b/xen/include/public/arch-x86/hvm/save.h	Tue Jun 16 22:41:06 2009 -0400
@@ -38,7 +38,7 @@ struct hvm_save_header {
     uint32_t version;           /* File format version */
     uint64_t changeset;         /* Version of Xen that saved this file */
     uint32_t cpuid;             /* CPUID[0x01][%eax] on the saving machine */
-    uint32_t pad0;
+    uint32_t gtsc_khz;        /* Guest's TSC frequency in kHz */
 };
 
 DECLARE_HVM_SAVE_TYPE(HEADER, 1, struct hvm_save_header);

[-- Attachment #4: 0002-move_multidiv64_out.patch --]
[-- Type: application/octet-stream, Size: 3360 bytes --]

# HG changeset patch
# User root@localhost.localdomain
# Date 1245207897 14400
# Node ID 09f3065668f20ebc1d6424d3c051c352baeba618
# Parent  4015e09394c1857a6e68e6d6909d11cf5ecba241
Move muldiv64 out and make it as a public function.

muldiv64 is used to caculate u64*u32/u32, and we
will use it for TSC scaling.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>

diff -r 4015e09394c1 -r 09f3065668f2 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile	Tue Jun 16 22:41:06 2009 -0400
+++ b/xen/arch/x86/Makefile	Tue Jun 16 23:04:57 2009 -0400
@@ -53,6 +53,7 @@ obj-y += crash.o
 obj-y += crash.o
 obj-y += tboot.o
 obj-y += hpet.o
+obj-y += lib.o
 obj-y += bzimage.o
 
 obj-$(crash_debug) += gdbstub.o
diff -r 4015e09394c1 -r 09f3065668f2 xen/arch/x86/hvm/i8254.c
--- a/xen/arch/x86/hvm/i8254.c	Tue Jun 16 22:41:06 2009 -0400
+++ b/xen/arch/x86/hvm/i8254.c	Tue Jun 16 23:04:57 2009 -0400
@@ -56,30 +56,6 @@ static int handle_speaker_io(
 
 #define get_guest_time(v) \
    (is_hvm_vcpu(v) ? hvm_get_guest_time(v) : (u64)get_s_time())
-
-/* Compute with 96 bit intermediate result: (a*b)/c */
-static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
-{
-    union {
-        uint64_t ll;
-        struct {
-#ifdef WORDS_BIGENDIAN
-            uint32_t high, low;
-#else
-            uint32_t low, high;
-#endif            
-        } l;
-    } u, res;
-    uint64_t rl, rh;
-
-    u.ll = a;
-    rl = (uint64_t)u.l.low * (uint64_t)b;
-    rh = (uint64_t)u.l.high * (uint64_t)b;
-    rh += (rl >> 32);
-    res.l.high = rh / c;
-    res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c;
-    return res.ll;
-}
 
 static int pit_get_count(PITState *pit, int channel)
 {
diff -r 4015e09394c1 -r 09f3065668f2 xen/arch/x86/lib.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/lib.c	Tue Jun 16 23:04:57 2009 -0400
@@ -0,0 +1,44 @@
+#include <xen/ctype.h>
+#include <xen/lib.h>
+#include <xen/types.h>
+#include <asm/byteorder.h>
+
+/* Compute with 96 bit intermediate result: (a*b)/c */
+
+#ifdef __x86_64__
+ uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+ {
+     __asm__ __volatile__ ("mul %%rdx;"
+                            "div %%rcx;"
+                            : "=a"(a)
+                            : "0"(a), "d"(b), "c"(c)
+                            );
+    return a;
+}
+
+#else
+
+uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
+{
+    union {
+        uint64_t ll;
+        struct {
+#ifdef WORDS_BIGENDIAN
+            uint32_t high, low;
+#else
+            uint32_t low, high;
+#endif            
+        } l;
+    } u, res;
+    uint64_t rl, rh;
+
+    u.ll = a;
+    rl = (uint64_t)u.l.low * (uint64_t)b;
+    rh = (uint64_t)u.l.high * (uint64_t)b;
+    rh += (rl >> 32);
+    res.l.high = rh / c;
+    res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c;
+    return res.ll;
+}
+
+#endif
diff -r 4015e09394c1 -r 09f3065668f2 xen/include/xen/lib.h
--- a/xen/include/xen/lib.h	Tue Jun 16 22:41:06 2009 -0400
+++ b/xen/include/xen/lib.h	Tue Jun 16 23:04:57 2009 -0400
@@ -91,6 +91,8 @@ unsigned long long simple_strtoull(
 
 unsigned long long parse_size_and_unit(const char *s, const char **ps);
 
+uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c);
+
 #define TAINT_UNSAFE_SMP                (1<<0)
 #define TAINT_MACHINE_CHECK             (1<<1)
 #define TAINT_BAD_PAGE                  (1<<2)

[-- Attachment #5: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-22  5:14 ` Zhang, Xiantao
@ 2009-06-23 10:18   ` Keir Fraser
  2009-06-24  1:18     ` Zhang, Xiantao
  0 siblings, 1 reply; 35+ messages in thread
From: Keir Fraser @ 2009-06-23 10:18 UTC (permalink / raw)
  To: Zhang, Xiantao
  Cc: Tim Deegan, Dan Magenheimer, xen-devel, Ian Pratt, John Levon

Stuffing the guest freq in a save-image pad field is not backward
compatible. Old images will not have that field filled in and you'll
probably end up doing something stupid like give them a zero-hertz TSC.
Please think about backward compatibility and use a separate save record.
Like Tim asked you to do already.

 -- Keir

On 22/06/2009 06:14, "Zhang, Xiantao" <xiantao.zhang@intel.com> wrote:

> Hi, Keir
> This is the new version which has addressed the comments from the mailing
> list. Please review it again.  Thanks!
> Xiantao
> 
> Zhang, Xiantao wrote:
>> Hi, Keir
>> 
>>     This patchset targets for enabling TSC scaling in software for
>>     live migration between platforms with different TSC frequecies.
>> Once found the target host's frequency is different with source
>> host's, hypervisor will trap and emulate guest's all rdtsc
>> instructions with its expected frequency. If hardware's TSC frequency
>> is difffernt with guest's exepcted freq, guest may behave abnormally,
>> eg. incorrect wallclock, soft lockup, even hang in some cases.
>> Therefore, this patchset is necessary to avoid such issues.
>> 
>> PATCH 0001-- Save guest's preferred TSC in image for save/restore and
>> migration 
>> PATCH 0002-- Move multidiv64 as a library function.
>> PATCH 0003-- Scaling host TSC freqeuncy patch.
>> 
>> Signed-off-by Xiantao Zhang <xiantao.zhang@intel.com>
>> Xiantao
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* RE: [PATCH] TSC scaling for live migration between platforms with different TSC frequecies
  2009-06-23 10:18   ` Keir Fraser
@ 2009-06-24  1:18     ` Zhang, Xiantao
  0 siblings, 0 replies; 35+ messages in thread
From: Zhang, Xiantao @ 2009-06-24  1:18 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Dan Magenheimer, xen-devel, Tim Deegan, Levon, Ian Pratt, John

Keir Fraser wrote:
> Stuffing the guest freq in a save-image pad field is not backward
> compatible. Old images will not have that field filled in and you'll
> probably end up doing something stupid like give them a zero-hertz
> TSC. Please think about backward compatibility and use a separate

Hi, Keir
    I also check the field to address the backward-compatibility issue: once the field is found to be zero, we don't do anything about TSC scaling (see hvm_gtsc_need_scale for details), so the guest never uses a zero-hertz frequency in any case. Since old images can't provide TSC frequency info, the TSC scaling logic shouldn't cover them.
Xiantao

>  -- Keir
> 
> On 22/06/2009 06:14, "Zhang, Xiantao" <xiantao.zhang@intel.com> wrote:
> 
>> Hi, Keir
>> This is the new version which has addressed the comments from the
>> mailing list. Please review it again.  Thanks!
>> Xiantao
>> 
>> Zhang, Xiantao wrote:
>>> Hi, Keir
>>> 
>>>     This patchset targets enabling TSC scaling in software for
>>>     live migration between platforms with different TSC frequencies.
>>> Once the target host's frequency is found to differ from the source
>>> host's, the hypervisor will trap and emulate all of the guest's rdtsc
>>> instructions using the guest's expected frequency. If the hardware's
>>> TSC frequency is different from the guest's expected frequency, the
>>> guest may behave abnormally, e.g. incorrect wallclock, soft lockup,
>>> or even a hang in some cases. Therefore, this patchset is necessary
>>> to avoid such issues.
>>> 
>>> PATCH 0001-- Save guest's preferred TSC in image for save/restore
>>> and migration PATCH 0002-- Move multidiv64 as a library function.
>>> PATCH 0003-- Scaling host TSC freqeuncy patch.
>>> 
>>> Signed-off-by Xiantao Zhang <xiantao.zhang@intel.com>
>>> Xiantao

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2009-06-24  1:18 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-18  2:56 [PATCH] TSC scaling for live migration between platforms with different TSC frequencies Zhang, Xiantao
2009-06-18  7:37 ` [PATCH] TSC scaling for live migration betweenplatforms " Jan Beulich
2009-06-18  8:52   ` Zhang, Xiantao
2009-06-18 15:40     ` Dan Magenheimer
2009-06-19  1:48       ` Zhang, Xiantao
2009-06-18  9:02 ` [PATCH] TSC scaling for live migration between platforms " Tim Deegan
2009-06-18  9:46   ` Zhang, Xiantao
2009-06-18  9:56     ` Tim Deegan
2009-06-18  9:10 ` Patrick Colp
2009-06-18  9:27   ` Tim Deegan
2009-06-18  9:47     ` Zhang, Xiantao
2009-06-18 15:45       ` Dan Magenheimer
2009-06-18 16:04         ` Tim Deegan
2009-06-18 20:07           ` Dan Magenheimer
2009-06-18 10:56 ` Ian Pratt
2009-06-18 15:58   ` Dan Magenheimer
2009-06-18 16:45     ` John Levon
2009-06-18 20:27       ` Dan Magenheimer
2009-06-18 20:45         ` John Levon
2009-06-18 20:57           ` Dan Magenheimer
2009-06-18 21:00             ` John Levon
2009-06-18 22:27               ` Dan Magenheimer
2009-06-19 13:36                 ` John Levon
2009-06-19  1:21               ` Zhang, Xiantao
2009-06-19 13:54                 ` John Levon
2009-06-18 23:49       ` Dong, Eddie
2009-06-19  2:25       ` Zhang, Xiantao
2009-06-19 13:53         ` John Levon
2009-06-19 15:07           ` Zhang, Xiantao
2009-06-19 20:44             ` Dan Magenheimer
2009-06-22  1:38               ` Zhang, Xiantao
2009-06-19  1:34   ` Zhang, Xiantao
2009-06-22  5:14 ` Zhang, Xiantao
2009-06-23 10:18   ` Keir Fraser
2009-06-24  1:18     ` Zhang, Xiantao

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.