* [PATCH v2 1/5] x86/time: Alter tsc_set_info() to return an error value
2018-11-20 14:36 [PATCH v2 0/5] x86: Drop PVRDTSCP and fix MSR_TSC_AUX handling for guests Andrew Cooper
@ 2018-11-20 14:36 ` Andrew Cooper
2018-11-20 16:26 ` Jan Beulich
2018-11-20 14:36 ` [PATCH v2 2/5] x86: Begin to remove TSC mode PVRDTSCP Andrew Cooper
` (4 subsequent siblings)
5 siblings, 1 reply; 15+ messages in thread
From: Andrew Cooper @ 2018-11-20 14:36 UTC (permalink / raw)
To: Xen-devel; +Cc: Andrew Cooper, Wei Liu, Jan Beulich, Roger Pau Monné
Currently, tsc_set_info() performs no parameter checking, and an invalid
tsc_mode goes largely unnoticed. Fix it to reject invalid tsc_modes with
-EINVAL, and update the callers to check the return value.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wei.liu2@citrix.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
v2:
* New
---
xen/arch/x86/domain.c | 4 ++--
xen/arch/x86/domctl.c | 8 ++++----
xen/arch/x86/time.c | 18 +++++++++++++-----
xen/include/asm-x86/time.h | 7 +++----
4 files changed, 22 insertions(+), 15 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 295b10c..245300b 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -599,8 +599,8 @@ int arch_domain_create(struct domain *d,
else
ASSERT_UNREACHABLE(); /* Not HVM and not PV? */
- /* initialize default tsc behavior in case tools don't */
- tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
+ if ( (rc = tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0)) != 0 )
+ goto fail;
/* PV/PVH guests get an emulated PIT too for video BIOSes to use. */
pit_init(d, cpu_khz);
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index aa8ad19..ed46df8 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -975,10 +975,10 @@ long arch_do_domctl(
else
{
domain_pause(d);
- tsc_set_info(d, domctl->u.tsc_info.tsc_mode,
- domctl->u.tsc_info.elapsed_nsec,
- domctl->u.tsc_info.gtsc_khz,
- domctl->u.tsc_info.incarnation);
+ ret = tsc_set_info(d, domctl->u.tsc_info.tsc_mode,
+ domctl->u.tsc_info.elapsed_nsec,
+ domctl->u.tsc_info.gtsc_khz,
+ domctl->u.tsc_info.incarnation);
domain_unpause(d);
}
break;
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 24d4c27..d80a586 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -2194,19 +2194,19 @@ void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
* only the last "sticks" and all are completed before the guest executes
* an rdtsc instruction
*/
-void tsc_set_info(struct domain *d,
- uint32_t tsc_mode, uint64_t elapsed_nsec,
- uint32_t gtsc_khz, uint32_t incarnation)
+int tsc_set_info(struct domain *d,
+ uint32_t tsc_mode, uint64_t elapsed_nsec,
+ uint32_t gtsc_khz, uint32_t incarnation)
{
ASSERT(!is_system_domain(d));
if ( is_pv_domain(d) && is_hardware_domain(d) )
{
d->arch.vtsc = 0;
- return;
+ return 0;
}
- switch ( d->arch.tsc_mode = tsc_mode )
+ switch ( tsc_mode )
{
bool enable_tsc_scaling;
@@ -2253,7 +2253,13 @@ void tsc_set_info(struct domain *d,
elapsed_nsec;
}
break;
+
+ default:
+ return -EINVAL;
}
+
+ d->arch.tsc_mode = tsc_mode;
+
d->arch.incarnation = incarnation + 1;
if ( is_hvm_domain(d) )
{
@@ -2280,6 +2286,8 @@ void tsc_set_info(struct domain *d,
}
recalculate_cpuid_policy(d);
+
+ return 0;
}
/* vtsc may incur measurable performance degradation, diagnose with this */
diff --git a/xen/include/asm-x86/time.h b/xen/include/asm-x86/time.h
index ce96ec9..6111fdc 100644
--- a/xen/include/asm-x86/time.h
+++ b/xen/include/asm-x86/time.h
@@ -60,12 +60,11 @@ uint64_t pv_soft_rdtsc(const struct vcpu *v, const struct cpu_user_regs *regs);
u64 gtime_to_gtsc(struct domain *d, u64 time);
u64 gtsc_to_gtime(struct domain *d, u64 tsc);
-void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
- uint32_t gtsc_khz, uint32_t incarnation);
-
+int tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+ uint32_t gtsc_khz, uint32_t incarnation);
+
void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
uint32_t *gtsc_khz, uint32_t *incarnation);
-
void force_update_vcpu_system_time(struct vcpu *v);
--
2.1.4
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 4/5] x86/msr: Handle MSR_TSC_AUX consistently for PV and HVM guests
2018-11-20 14:36 [PATCH v2 0/5] x86: Drop PVRDTSCP and fix MSR_TSC_AUX handling for guests Andrew Cooper
` (2 preceding siblings ...)
2018-11-20 14:37 ` [PATCH v2 3/5] x86/pv: Remove deferred RDTSC{, P} handling in pv_emulate_privileged_op() Andrew Cooper
@ 2018-11-20 14:37 ` Andrew Cooper
2018-11-20 16:33 ` Jan Beulich
` (2 more replies)
2018-11-20 14:37 ` [PATCH v2 5/5] x86/pv: Expose RDTSCP to PV guests Andrew Cooper
2018-12-14 0:52 ` [PATCH v2 6/5] tools/docs: Remove PVRDTSCP remnants Andrew Cooper
5 siblings, 3 replies; 15+ messages in thread
From: Andrew Cooper @ 2018-11-20 14:37 UTC (permalink / raw)
To: Xen-devel
Cc: Kevin Tian, Wei Liu, Jan Beulich, Konrad Rzeszutek Wilk,
Andrew Cooper, Jun Nakajima, Boris Ostrovsky, Brian Woods,
Suravee Suthikulpanit, Roger Pau Monné
With PVRDTSCP mode removed, handling of MSR_TSC_AUX can move into the common
code. Move its storage into struct vcpu_msrs (dropping the HVM-specific
msr_tsc_aux), and add an RDPID feature check as this bit also enumerates the
presence of the MSR.
Introduce cpu_has_rdpid along with the synthesized cpu_has_msr_tsc_aux to
correct the context switch paths, as MSR_TSC_AUX is enumerated by either
RDTSCP or RDPID.
Drop hvm_msr_tsc_aux() entirely, and use v->arch.msrs->tsc_aux directly.
Update hvm_load_cpu_ctxt() to check that the incoming ctxt.msr_tsc_aux isn't
out of range. In practice, no previous version of Xen ever wrote an
out-of-range value. Add MSR_TSC_AUX to the list of MSRs migrated for PV
guests, but leave the HVM path using the existing space in hvm_hw_cpu.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wei.liu2@citrix.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
CC: Jun Nakajima <jun.nakajima@intel.com>
CC: Kevin Tian <kevin.tian@intel.com>
CC: Boris Ostrovsky <boris.ostrovsky@oracle.com>
CC: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
CC: Brian Woods <brian.woods@amd.com>
v2:
* Rebase over "x86/msr: Handle MSR_AMD64_DR{0-3}_ADDRESS_MASK in the new MSR infrastructure"
* Move the HVM msr_tsc_aux check earlier in hvm_load_cpu_ctxt()
* Introduce cpu_has_msr_tsc_aux
RFC: I'm not overly happy with cpu_has_msr_tsc_aux because in practice all
hardware with rdpid has rdtscp, making this an effectively dead conditional in
the context switch path. I'm tempted to go with
#define cpu_has_msr_tsc_aux (cpu_has_rdtscp /* || cpu_has_rdpid */)
to get the point across, but without the extra jump.
---
xen/arch/x86/domain.c | 5 ++---
xen/arch/x86/domctl.c | 2 ++
xen/arch/x86/hvm/hvm.c | 22 +++++++++-------------
xen/arch/x86/hvm/svm/svm.c | 6 +++---
xen/arch/x86/hvm/vmx/vmx.c | 6 +++---
xen/arch/x86/msr.c | 18 ++++++++++++++++++
xen/arch/x86/pv/emul-priv-op.c | 4 ----
xen/include/asm-x86/cpufeature.h | 5 +++++
xen/include/asm-x86/hvm/hvm.h | 6 ------
xen/include/asm-x86/hvm/vcpu.h | 1 -
xen/include/asm-x86/msr.h | 9 +++++++++
11 files changed, 51 insertions(+), 33 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 245300b..38c233e 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1592,9 +1592,8 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
if ( unlikely(v->arch.dr7 & DR7_ACTIVE_MASK) )
activate_debugregs(v);
- if ( cpu_has_rdtscp )
- wrmsr_tsc_aux(v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP
- ? v->domain->arch.incarnation : 0);
+ if ( cpu_has_msr_tsc_aux )
+ wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
}
/* Update per-VCPU guest runstate shared memory area (if registered). */
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index ed46df8..9bf2d08 100644
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -1274,6 +1274,7 @@ long arch_do_domctl(
static const uint32_t msrs_to_send[] = {
MSR_SPEC_CTRL,
MSR_INTEL_MISC_FEATURES_ENABLES,
+ MSR_TSC_AUX,
MSR_AMD64_DR0_ADDRESS_MASK,
MSR_AMD64_DR1_ADDRESS_MASK,
MSR_AMD64_DR2_ADDRESS_MASK,
@@ -1373,6 +1374,7 @@ long arch_do_domctl(
{
case MSR_SPEC_CTRL:
case MSR_INTEL_MISC_FEATURES_ENABLES:
+ case MSR_TSC_AUX:
case MSR_AMD64_DR0_ADDRESS_MASK:
case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
if ( guest_wrmsr(v, msr.index, msr.value) != X86EMUL_OKAY )
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index e2e4204..1f9bafc 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -774,7 +774,7 @@ static int hvm_save_cpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
struct segment_register seg;
struct hvm_hw_cpu ctxt = {
.tsc = hvm_get_guest_tsc_fixed(v, v->domain->arch.hvm.sync_tsc),
- .msr_tsc_aux = hvm_msr_tsc_aux(v),
+ .msr_tsc_aux = v->arch.msrs->tsc_aux,
.rax = v->arch.user_regs.rax,
.rbx = v->arch.user_regs.rbx,
.rcx = v->arch.user_regs.rcx,
@@ -1014,6 +1014,13 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
return -EINVAL;
}
+ if ( ctxt.msr_tsc_aux != (uint32_t)ctxt.msr_tsc_aux )
+ {
+ printk(XENLOG_G_ERR "%pv: HVM restore: bad MSR_TSC_AUX %#"PRIx64"\n",
+ v, ctxt.msr_tsc_aux);
+ return -EINVAL;
+ }
+
/* Older Xen versions used to save the segment arbytes directly
* from the VMCS on Intel hosts. Detect this and rearrange them
* into the struct segment_register format. */
@@ -1040,7 +1047,7 @@ static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h)
if ( hvm_funcs.tsc_scaling.setup )
hvm_funcs.tsc_scaling.setup(v);
- v->arch.hvm.msr_tsc_aux = ctxt.msr_tsc_aux;
+ v->arch.msrs->tsc_aux = ctxt.msr_tsc_aux;
hvm_set_guest_tsc_fixed(v, ctxt.tsc, d->arch.hvm.sync_tsc);
@@ -3406,10 +3413,6 @@ int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
*msr_content = v->arch.hvm.msr_tsc_adjust;
break;
- case MSR_TSC_AUX:
- *msr_content = hvm_msr_tsc_aux(v);
- break;
-
case MSR_IA32_APICBASE:
*msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
break;
@@ -3562,13 +3565,6 @@ int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content,
hvm_set_guest_tsc_adjust(v, msr_content);
break;
- case MSR_TSC_AUX:
- v->arch.hvm.msr_tsc_aux = (uint32_t)msr_content;
- if ( cpu_has_rdtscp
- && (v->domain->arch.tsc_mode != TSC_MODE_PVRDTSCP) )
- wrmsr_tsc_aux(msr_content);
- break;
-
case MSR_IA32_APICBASE:
if ( !vlapic_msr_set(vcpu_vlapic(v), msr_content) )
goto gp_fault;
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index b9a8900..df6f262 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1069,8 +1069,8 @@ static void svm_ctxt_switch_to(struct vcpu *v)
svm_lwp_load(v);
svm_tsc_ratio_load(v);
- if ( cpu_has_rdtscp )
- wrmsr_tsc_aux(hvm_msr_tsc_aux(v));
+ if ( cpu_has_msr_tsc_aux )
+ wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
}
static void noreturn svm_do_resume(struct vcpu *v)
@@ -2968,7 +2968,7 @@ void svm_vmexit_handler(struct cpu_user_regs *regs)
break;
case VMEXIT_RDTSCP:
- regs->rcx = hvm_msr_tsc_aux(v);
+ regs->rcx = v->arch.msrs->tsc_aux;
/* fall through */
case VMEXIT_RDTSC:
svm_vmexit_do_rdtsc(regs);
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 365eeb2..9b691d9 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -511,8 +511,8 @@ static void vmx_restore_guest_msrs(struct vcpu *v)
wrmsrl(MSR_LSTAR, v->arch.hvm.vmx.lstar);
wrmsrl(MSR_SYSCALL_MASK, v->arch.hvm.vmx.sfmask);
- if ( cpu_has_rdtscp )
- wrmsr_tsc_aux(hvm_msr_tsc_aux(v));
+ if ( cpu_has_msr_tsc_aux )
+ wrmsr_tsc_aux(v->arch.msrs->tsc_aux);
}
void vmx_update_cpu_exec_control(struct vcpu *v)
@@ -3956,7 +3956,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
vmx_invlpg_intercept(exit_qualification);
break;
case EXIT_REASON_RDTSCP:
- regs->rcx = hvm_msr_tsc_aux(v);
+ regs->rcx = v->arch.msrs->tsc_aux;
/* fall through */
case EXIT_REASON_RDTSC:
update_guest_eip(); /* Safe: RDTSC, RDTSCP */
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index 76cb6ef..f86da8f 100644
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -162,6 +162,13 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
ret = guest_rdmsr_xen(v, msr, val);
break;
+ case MSR_TSC_AUX:
+ if ( !cp->extd.rdtscp && !cp->feat.rdpid )
+ goto gp_fault;
+
+ *val = msrs->tsc_aux;
+ break;
+
case MSR_AMD64_DR0_ADDRESS_MASK:
case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
if ( !cp->extd.dbext )
@@ -309,6 +316,17 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
ret = guest_wrmsr_xen(v, msr, val);
break;
+ case MSR_TSC_AUX:
+ if ( !cp->extd.rdtscp && !cp->feat.rdpid )
+ goto gp_fault;
+ if ( val != (uint32_t)val )
+ goto gp_fault;
+
+ msrs->tsc_aux = val;
+ if ( v == curr )
+ wrmsr_tsc_aux(val);
+ break;
+
case MSR_AMD64_DR0_ADDRESS_MASK:
case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
if ( !cp->extd.dbext || val != (uint32_t)val )
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 5133c35..942ece2 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -847,10 +847,6 @@ static int read_msr(unsigned int reg, uint64_t *val,
*val = currd->arch.vtsc ? pv_soft_rdtsc(curr, ctxt->regs) : rdtsc();
return X86EMUL_OKAY;
- case MSR_TSC_AUX:
- *val = 0;
- return X86EMUL_OKAY;
-
case MSR_EFER:
/* Hide unknown bits, and unconditionally hide SVME from guests. */
*val = read_efer() & EFER_KNOWN_MASK & ~EFER_SVME;
diff --git a/xen/include/asm-x86/cpufeature.h b/xen/include/asm-x86/cpufeature.h
index c2b0f6a..5592e17 100644
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -107,6 +107,9 @@
#define cpu_has_avx512bw boot_cpu_has(X86_FEATURE_AVX512BW)
#define cpu_has_avx512vl boot_cpu_has(X86_FEATURE_AVX512VL)
+/* CPUID level 0x00000007:0.ecx */
+#define cpu_has_rdpid boot_cpu_has(X86_FEATURE_RDPID)
+
/* CPUID level 0x80000007.edx */
#define cpu_has_itsc boot_cpu_has(X86_FEATURE_ITSC)
@@ -117,6 +120,8 @@
#define cpu_has_lfence_dispatch boot_cpu_has(X86_FEATURE_LFENCE_DISPATCH)
#define cpu_has_xen_lbr boot_cpu_has(X86_FEATURE_XEN_LBR)
+#define cpu_has_msr_tsc_aux (cpu_has_rdtscp || cpu_has_rdpid)
+
enum _cache_type {
CACHE_TYPE_NULL = 0,
CACHE_TYPE_DATA = 1,
diff --git a/xen/include/asm-x86/hvm/hvm.h b/xen/include/asm-x86/hvm/hvm.h
index 3d3250d..3a92bb3 100644
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -563,12 +563,6 @@ static inline void hvm_invalidate_regs_fields(struct cpu_user_regs *regs)
#endif
}
-#define hvm_msr_tsc_aux(v) ({ \
- struct domain *__d = (v)->domain; \
- (__d->arch.tsc_mode == TSC_MODE_PVRDTSCP) \
- ? (u32)__d->arch.incarnation : (u32)(v)->arch.hvm.msr_tsc_aux; \
-})
-
/*
* Nested HVM
*/
diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 39778f9..c8a40f6 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -170,7 +170,6 @@ struct hvm_vcpu {
struct hvm_vcpu_asid n1asid;
- u32 msr_tsc_aux;
u64 msr_tsc_adjust;
u64 msr_xss;
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index 05d905b..adfa2fa 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -289,6 +289,15 @@ struct vcpu_msrs
} misc_features_enables;
/*
+ * 0xc0000103 - MSR_TSC_AUX
+ *
+ * Value is guest chosen, and always loaded in vcpu context. Guests have
+ * no direct MSR access, and the value is accessible to userspace with the
+ * RDTSCP and RDPID instructions.
+ */
+ uint32_t tsc_aux;
+
+ /*
* 0xc00110{27,19-1b} MSR_AMD64_DR{0-3}_ADDRESS_MASK
*
* Loaded into hardware for guests which have active %dr7 settings.
--
2.1.4
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH v2 6/5] tools/docs: Remove PVRDTSCP remnants
2018-11-20 14:36 [PATCH v2 0/5] x86: Drop PVRDTSCP and fix MSR_TSC_AUX handling for guests Andrew Cooper
` (4 preceding siblings ...)
2018-11-20 14:37 ` [PATCH v2 5/5] x86/pv: Expose RDTSCP to PV guests Andrew Cooper
@ 2018-12-14 0:52 ` Andrew Cooper
2018-12-14 10:19 ` Jan Beulich
2018-12-18 15:29 ` Ian Jackson
5 siblings, 2 replies; 15+ messages in thread
From: Andrew Cooper @ 2018-12-14 0:52 UTC (permalink / raw)
To: Xen-devel; +Cc: Andrew Cooper, Wei Liu, Jan Beulich, Roger Pau Monné
PVRDTSCP is believed-unused, and its implementation has adverse consequences
on unrelated functionality in the hypervisor. As a result, support has been
removed.
Modify libxl to provide a slightly more helpful error message if it encounters
PVRDTSCP being selected. While adjusting TSC handling, make libxl check for
errors from the set_tsc hypercall.
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Wei Liu <wei.liu2@citrix.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
---
docs/man/xen-tscmode.pod.7 | 94 +-----------
docs/man/xl.cfg.pod.5.in | 9 +-
docs/misc/pvrdtscp.c | 307 --------------------------------------
tools/libxl/libxl_x86.c | 13 +-
tools/python/xen/lowlevel/xc/xc.c | 2 +-
5 files changed, 19 insertions(+), 406 deletions(-)
delete mode 100644 docs/misc/pvrdtscp.c
diff --git a/docs/man/xen-tscmode.pod.7 b/docs/man/xen-tscmode.pod.7
index 819c61d..1d81a3f 100644
--- a/docs/man/xen-tscmode.pod.7
+++ b/docs/man/xen-tscmode.pod.7
@@ -77,9 +77,7 @@ highest performance is required.
=item * B<tsc_mode=3> (PVRDTSCP).
-High-TSC-frequency apps may be paravirtualized (modified) to
-obtain both correctness and highest performance; any unmodified
-apps must be TSC-resilient.
+This mode has been removed.
=back
@@ -215,30 +213,6 @@ is emulated. Note that, though emulated, the "apparent" TSC frequency
will be the TSC frequency of the initial physical machine, even after
migration.
-For environments where both TSC-safeness AND highest performance
-even across migration is a requirement, application code can be specially
-modified to use an algorithm explicitly designed into Xen for this purpose.
-This mode (tsc_mode==3) is called PVRDTSCP, because it requires
-app paravirtualization (awareness by the app that it may be running
-on top of Xen), and utilizes a variation of the rdtsc instruction
-called rdtscp that is available on most recent generation processors.
-(The rdtscp instruction differs from the rdtsc instruction in that it
-reads not only the TSC but an additional register set by system software.)
-When a pvrdtscp-modified app is running on a processor that is both TSC-safe
-and supports the rdtscp instruction, information can be obtained
-about migration and TSC frequency/offset adjustment to allow the
-vast majority of timestamps to be obtained at top performance; when
-running on a TSC-unsafe processor or a processor that doesn't support
-the rdtscp instruction, rdtscp is emulated.
-
-PVRDTSCP (tsc_mode==3) has two limitations. First, it applies to
-all apps running in this virtual machine. This means that all
-apps must either be TSC-resilient or pvrdtscp-modified. Second,
-highest performance is only obtained on TSC-safe machines that
-support the rdtscp instruction; when running on older machines,
-rdtscp is emulated and thus slower. For more information on PVRDTSCP,
-see below.
-
Finally, tsc_mode==1 always enables TSC emulation, regardless of
the underlying physical hardware. The "apparent" TSC frequency will
be the TSC frequency of the initial physical machine, even after migration.
@@ -287,56 +261,7 @@ have been replaced by a paravirtualized equivalent of the cpuid instruction
("pvcpuid") and also trap to Xen. But apps in a PV guest that use a
cpuid instruction execute it directly, without a trap to Xen. As a result,
an app may directly examine the physical TSC Invariant cpuid bit and make
-decisions based on that bit. This is still an unsolved problem, though
-a workaround exists as part of the PVRDTSCP tsc_mode for apps that
-can be modified.
-
-=head1 MORE ON PVRDTSCP
-
-Paravirtualized OS's use the "pvclock" algorithm to manage the passing
-of time. This sophisticated algorithm obtains information from a memory
-page shared between Xen and the OS and selects information from this
-page based on the current virtual CPU (vcpu) in order to properly adapt to
-TSC-unsafe systems and changes that occur across migration. Neither
-this shared page nor the vcpu information is available to a userland
-app so the pvclock algorithm cannot be directly used by an app, at least
-without performance degradation roughly equal to the cost of just
-emulating an rdtsc.
-
-As a result, as of 4.0, Xen provides capabilities for a userland app
-to obtain key time values similar to the information accessible
-to the PV OS pvclock algorithm. The app uses the rdtscp instruction
-which is defined in recent processors to obtain both the TSC and an
-auxiliary value called TSC_AUX. Xen is responsible for setting TSC_AUX
-to the same value on all vcpus running any domain with tsc_mode==3;
-further, Xen tools are responsible for monotonically incrementing TSC_AUX
-anytime the domain is restored/migrated (thus changing key time values);
-and, when the domain is running on a physical machine that either
-is not TSC-safe or does not support the rdtscp instruction, Xen
-is responsible for emulating the rdtscp instruction and for setting
-TSC_AUX to zero on all processors.
-
-Xen also provides pvclock information via a "pvcpuid" instruction.
-While this results in a slow trap, the information changes
-(and thus must be reobtained via pvcpuid) ONLY when TSC_AUX
-has changed, which should be very rare relative to a high
-frequency of rdtscp instructions.
-
-Finally, Xen provides additional time-related information via
-other pvcpuid instructions. First, an app is capable of
-determining if it is currently running on Xen, next whether
-the tsc_mode setting of the domain in which it is running,
-and finally whether the underlying hardware is TSC-safe and
-supports the rdtscp instruction.
-
-As a result, a pvrdtscp-modified app has sufficient information
-to compute the pvclock "elapsed nanoseconds" which can
-be used as a timestamp. And this can be done nearly as
-fast as a native rdtsc instruction, much faster than emulation,
-and also much faster than nearly all OS-provided time mechanisms.
-While pvrtscp is too complex for most apps, certain enterprise
-TSC-sensitive high-TSC-frequency apps may find it useful to
-obtain a significant performance gain.
+decisions based on that bit.
=head1 HARDWARE TSC SCALING
@@ -344,21 +269,16 @@ Intel VMX TSC scaling and AMD SVM TSC ratio allow the guest TSC read
by guest rdtsc/p increasing in a different frequency than the host
TSC frequency.
-If a HVM container in default TSC mode (tsc_mode=0) or PVRDTSCP mode
-(tsc_mode=3) is created on a host that provides constant TSC, its
-guest TSC frequency will be the same as the host. If it is later
-migrated to another host that provides constant TSC and supports Intel
-VMX TSC scaling/AMD SVM TSC ratio, its guest TSC frequency will be the
-same before and after migration.
+If a HVM container in default TSC mode (tsc_mode=0) is created on a host
+that provides constant TSC, its guest TSC frequency will be the same as
+the host. If it is later migrated to another host that provides constant
+TSC and supports Intel VMX TSC scaling/AMD SVM TSC ratio, its guest TSC
+frequency will be the same before and after migration.
For above HVM container in default TSC mode (tsc_mode=0), if above
hosts support rdtscp, both guest rdtsc and rdtscp instructions will be
executed natively before and after migration.
-For above HVM container in PVRDTSCP mode (tsc_mode=3), if the
-destination host does not support rdtscp, the guest rdtscp instruction
-will be emulated with the guest TSC frequency.
-
=head1 AUTHORS
Dan Magenheimer <dan.magenheimer@oracle.com>
diff --git a/docs/man/xl.cfg.pod.5.in b/docs/man/xl.cfg.pod.5.in
index b1c0be1..3b92f39 100644
--- a/docs/man/xl.cfg.pod.5.in
+++ b/docs/man/xl.cfg.pod.5.in
@@ -2099,14 +2099,7 @@ by h/w, else executed natively.
=item B<native_paravirt>
-Same as B<native>, except Xen manages the TSC_AUX register so the guest can
-determine when a restore/migration has occurred and assumes guest
-obtains/uses a pvclock-like mechanism to adjust for monotonicity and
-frequency changes.
-
-If a HVM container in B<native_paravirt> TSC mode can execute both guest
-rdtsc and guest rdtscp natively, then the guest TSC frequency will be
-determined in a similar way to that of B<default> TSC mode.
+This mode has been removed.
=back
diff --git a/docs/misc/pvrdtscp.c b/docs/misc/pvrdtscp.c
deleted file mode 100644
index 8d25843..0000000
--- a/docs/misc/pvrdtscp.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/* pvrdtscp algorithm
- *
- * This sample code demonstrates the use of the paravirtualized rdtscp
- * algorithm. Using this algorithm, an application may communicate with
- * the Xen hypervisor (version 4.0+) to obtain timestamp information which
- * is both monotonically increasing and has a fixed 1 GHz rate, even across
- * migrations between machines with different TSC rates and offsets.
- * Further,the algorithm provides performance near the performance of a
- * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
- * the application is running on a machine on which the rdtscp instruction
- * is supported and TSC is "safe". The application must also be running in a
- * PV domain. (HVM domains may be supported at a later time.) On machines
- * where TSC is unsafe or the rdtscp instruction is not supported, Xen
- * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
- * algorithm, thus providing support for the algorithm for live migration
- * across all machines.
- *
- * More information can be found within the Xen (4.0+) source tree at
- * docs/misc/tscmode.txt
- *
- * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
- * All rights reserved
- * Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
- *
- * This code is derived from code licensed under the GNU
- * General Public License ("GPL") version 2 and is therefore itself
- * also licensed under the GPL version 2.
- *
- * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
- * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
- * algorithm and its ABI to Xen version 4.0+
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/wait.h>
-
-#ifdef __LP64__
-#define __X86_64__
-typedef unsigned short u16;
-typedef unsigned int u32;
-typedef unsigned long u64;
-typedef int i32;
-typedef long i64;
-#define NSEC_PER_SEC 1000000000
-#else
-#define __X86_32__
-typedef unsigned int u16;
-typedef unsigned long u32;
-typedef unsigned long long u64;
-typedef long i32;
-typedef long long i64;
-#define NSEC_PER_SEC 1000000000L
-#endif
-
-static inline void hvm_cpuid(u32 idx, u32 sub,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
-{
- *eax = idx, *ecx = sub;
- asm("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
- : "0" (*eax), "2" (*ecx));
-}
-
-static inline void pv_cpuid(u32 idx, u32 sub,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
-{
- *eax = idx, *ecx = sub;
- asm volatile ( "ud2a ; .ascii \"xen\"; cpuid" : "=a" (*eax),
- "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx));
-}
-
-static inline u64 do_rdtscp(u32 *aux)
-{
-static u64 last = 0;
- u32 lo32, hi32;
- u64 val;
-
- asm volatile(".byte 0x0f,0x01,0xf9":"=a"(lo32),"=d"(hi32),"=c" (*aux));
- val = lo32 | ((u64)hi32 << 32);
- return val;
-}
-
-static inline int get_xen_tsc_mode(void)
-{
- u32 val, dummy1, dummy2, dummy3;
- pv_cpuid(0x40000003,0,&dummy1,&val,&dummy2,&dummy3);
- return val;
-}
-
-static inline int get_xen_vtsc(void)
-{
- u32 val, dummy1, dummy2, dummy3;
- pv_cpuid(0x40000003,0,&val,&dummy1,&dummy2,&dummy3);
- return val & 1;
-}
-
-static inline int get_xen_vtsc_khz(void)
-{
- u32 val, dummy1, dummy2, dummy3;
- pv_cpuid(0x40000003,0,&dummy1,&dummy2,&val,&dummy3);
- return val;
-}
-
-static inline u32 get_xen_cpu_khz(void)
-{
- u32 cpu_khz, dummy1, dummy2, dummy3;
- pv_cpuid(0x40000003,2,&cpu_khz,&dummy1,&dummy2,&dummy3);
- return cpu_khz;
-}
-
-static inline u32 get_xen_incarnation(void)
-{
- u32 incarn, dummy1, dummy2, dummy3;
- pv_cpuid(0x40000003,0,&dummy1,&dummy2,&dummy3,&incarn);
- return incarn;
-}
-
-static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
-{
- u32 off_lo, off_hi, sys_lo, sys_hi, dummy;
-
- pv_cpuid(0x40000003,1,&off_lo,&off_hi,mul_frac,shift);
- *offset = off_lo | ((u64)off_hi << 32);
-}
-
-static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
-{
- u64 product;
-#ifdef __X86_32__
- u32 tmp1, tmp2;
-#endif
-
- if ( tsc_shift < 0 )
- delta >>= -tsc_shift;
- else
- delta <<= tsc_shift;
-
-#ifdef __X86_32__
- asm (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (tsc_mul_frac) );
-#else
- asm (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)tsc_mul_frac) );
-#endif
-
- return product;
-}
-
-static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
-{
- static int firsttime = 1;
- static u64 last_pvrdtscp_timestamp = 0;
- static u32 last_tsc_aux;
- static u64 xen_ns_offset;
- static u32 xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift;
- u32 this_tsc_aux;
- u64 timestamp, cur_tsc, cur_ns;
-
- if (firsttime) {
- cur_tsc = do_rdtscp(&last_tsc_aux);
- get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
- &xen_tsc_to_ns_shift);
- cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
- xen_tsc_to_ns_shift);
- timestamp = cur_ns - xen_ns_offset;
- last_pvrdtscp_timestamp = timestamp;
- firsttime = 0;
- }
- cur_tsc = do_rdtscp(&this_tsc_aux);
- *discontinuity = 0;
- while (this_tsc_aux != last_tsc_aux) {
- /* if tsc_aux changed, try again */
- last_tsc_aux = this_tsc_aux;
- get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
- &xen_tsc_to_ns_shift);
- cur_tsc = do_rdtscp(&this_tsc_aux);
- *discontinuity = 1;
- }
-
- /* compute nsec from TSC and Xen time values */
- cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
- xen_tsc_to_ns_shift);
- timestamp = cur_ns - xen_ns_offset;
-
- /* enforce monotonicity just in case */
- if ((i64)(timestamp - last_pvrdtscp_timestamp) > 0)
- last_pvrdtscp_timestamp = timestamp;
- else {
- /* this should never happen but we'll check it anyway in
- * case of some strange combination of scaling errors
- * occurs across a very fast migration */
- printf("Time went backwards by %lluns\n",
- (unsigned long long)(last_pvrdtscp_timestamp-timestamp));
- timestamp = ++last_pvrdtscp_timestamp;
- }
- return timestamp;
-}
-
-#define HVM 1
-#define PVM 0
-
-static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor)
-{
- u32 eax, ebx, ecx, edx, base;
- union { char csig[16]; u32 u[4]; } sig;
-
- for (base=0x40000000; base < 0x40010000; base += 0x100) {
- if (hvm==HVM)
- hvm_cpuid(base,0,&eax,&ebx,&ecx,&edx);
- else
- pv_cpuid(base,0,&eax,&ebx,&ecx,&edx);
- sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx;
- sig.csig[12] = '\0';
- if (!strcmp("XenVMMXenVMM",&sig.csig[0]) && (eax >= (base+2))) {
- if (hvm==HVM)
- hvm_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
- else
- pv_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
- *version_major = (eax >> 16) & 0xffff;
- *version_minor = eax & 0xffff;
- return 1;
- }
- }
- return 0;
-}
-
-main(int ac, char **av)
-{
- u32 dummy;
- u16 version_hi, version_lo;
- u64 ts, last_ts;
- int status, discontinuity = 0;
- pid_t pid;
-
- if (running_on_xen(HVM,&version_hi,&version_lo)) {
- printf("running on Xen v%d.%d as an HVM domain, "
- "pvrdtsc not supported, exiting\n",
- (int)version_hi, (int)version_lo);
- exit(0);
- }
- pid = fork();
- if (pid == -1) {
- fprintf(stderr,"Huh? Fork failed\n");
- return 0;
- }
- else if (pid == 0) { /* child */
- pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy);
- exit(0);
- }
- waitpid(pid,&status,0);
- if (!WIFEXITED(status))
- exit(0);
- if (!running_on_xen(PVM,&version_hi,&version_lo)) {
- printf("not running on Xen, exiting\n");
- exit(0);
- }
- printf("running on Xen v%d.%d as a PV domain\n",
- (int)version_hi, (int)version_lo);
- if ( version_hi <= 3 ) {
- printf("pvrdtscp requires Xen version 4.0 or greater\n");
- /* exit(0); FIXME after xen-unstable is officially v4.0 */
- }
- if ( get_xen_tsc_mode() != 3 )
- printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");
-
- /* OK, we are on Xen, now loop forever checking timestamps */
- ts = get_pvrdtscp_timestamp(&discontinuity);
- printf("Starting with ts=%lluns 0x%llx (%llusec)\n",ts,ts,ts/NSEC_PER_SEC);
- printf("incarn=%d: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
- (unsigned long)get_xen_incarnation(),
- (unsigned long)get_xen_vtsc(),
- (unsigned long)get_xen_vtsc_khz(),
- (unsigned long)get_xen_cpu_khz());
- ts = get_pvrdtscp_timestamp(&discontinuity);
- last_ts = ts;
- while (1) {
- ts = get_pvrdtscp_timestamp(&discontinuity);
- if (discontinuity)
- printf("migrated/restored, incarn=%d: "
- "vtsc now %d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
- (unsigned long)get_xen_incarnation(),
- (unsigned long)get_xen_vtsc(),
- (unsigned long)get_xen_vtsc_khz(),
- (unsigned long)get_xen_cpu_khz());
- if (ts < last_ts)
- /* this should NEVER happen, especially since there
- * is a check for it in get_pvrdtscp_timestamp() */
- printf("Time went backwards: %lluns (%llusec)\n",
- last_ts-ts,(last_ts-ts)/NSEC_PER_SEC);
- if (ts > last_ts + 200000000LL)
- /* this is OK, usually about 2sec for save/restore
- * and a fraction of a second for live migrate */
- printf("Time jumped forward %lluns (%llusec)\n",
- ts-last_ts,(ts-last_ts)/NSEC_PER_SEC);
- last_ts = ts;
- }
-}
diff --git a/tools/libxl/libxl_x86.c b/tools/libxl/libxl_x86.c
index c04fd75..c0f88a7 100644
--- a/tools/libxl/libxl_x86.c
+++ b/tools/libxl/libxl_x86.c
@@ -309,12 +309,19 @@ int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
tsc_mode = 2;
break;
case LIBXL_TSC_MODE_NATIVE_PARAVIRT:
- tsc_mode = 3;
- break;
+ LOGD(ERROR, domid, "TSC Mode native_paravirt (a.k.a PVRDTSCP) has been removed");
+ ret = ERROR_FEATURE_REMOVED;
+ goto out;
default:
abort();
}
- xc_domain_set_tsc_info(ctx->xch, domid, tsc_mode, 0, 0, 0);
+
+ if (xc_domain_set_tsc_info(ctx->xch, domid, tsc_mode, 0, 0, 0)) {
+ LOGE(ERROR, "xc_domain_set_tsc_info() failed");
+ ret = ERROR_FAIL;
+ goto out;
+ }
+
if (libxl_defbool_val(d_config->b_info.disable_migrate))
xc_domain_disable_migrate(ctx->xch, domid);
rtc_timeoffset = d_config->b_info.rtc_timeoffset;
diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 484b790..cc8175a 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -2439,7 +2439,7 @@ static PyMethodDef pyxc_methods[] = {
"Set a domain's TSC mode\n"
" dom [int]: Domain whose TSC mode is being set.\n"
" tsc_mode [int]: 0=default (monotonic, but native where possible)\n"
- " 1=always emulate 2=never emulate 3=pvrdtscp\n"
+ " 1=always emulate 2=never emulate\n"
"Returns: [int] 0 on success; -1 on error.\n" },
{ "domain_disable_migrate",
--
2.1.4
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread