* [PATCH RFC 0/4] x86/AMD: support newer hardware features
       [not found] <532880230200007800125450@nat28.tlf.novell.com>
@ 2014-03-19  9:16 ` Jan Beulich
  2014-03-19  9:46   ` [PATCH RFC 1/4] x86/SVM: support data breakpoint extension registers Jan Beulich
                     ` (5 more replies)
  0 siblings, 6 replies; 28+ messages in thread
From: Jan Beulich @ 2014-03-19  9:16 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Ian Jackson, Ian Campbell, suravee.suthikulpanit,
	Sherry Hurwitz

1: SVM: support data breakpoint extension registers
2: PV: support data breakpoint extension registers
3: x86/AMD: support further feature masking MSRs
4: x86/AMD: clean up pre-canned family/revision handling for CPUID masking

Patches 3 and 4 are fully tested; for patches 1 and 2 I'm lacking
suitable hardware and hence depend on AMD's help (patch 1 is also
still missing some tools-side adjustments). Patch 3, on the other
hand, still has a couple of open questions (see its TBD notes).
Overall this makes the whole series RFC only for now.

Signed-off-by: Jan Beulich <jbeulich@suse.com>


* [PATCH RFC 1/4] x86/SVM: support data breakpoint extension registers
  2014-03-19  9:16 ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Jan Beulich
@ 2014-03-19  9:46   ` Jan Beulich
  2014-03-19  9:47   ` [PATCH RFC 2/4] x86/PV: " Jan Beulich
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: Jan Beulich @ 2014-03-19  9:46 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Ian Jackson, Ian Campbell, suravee.suthikulpanit,
	Sherry Hurwitz


Leveraging the generic MSR save/restore logic introduced a little while
ago.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
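
As an illustration of the guest-visible feature being virtualized here (not
part of the patch): CPUID Fn8000_0001 ECX bit 26 advertises the extension,
and each MSR_AMD64_DRn_ADDRESS_MASK holds a mask of address bits to be
ignored in the corresponding breakpoint comparison, so one DRn watchpoint
can cover a whole aligned region. A minimal ring-0 sketch, reusing the MSR
number from this patch and assuming the mask semantics as documented by AMD:

    #include <cpuid.h>
    #include <stdint.h>

    #define MSR_AMD64_DR0_ADDRESS_MASK 0xc0011027

    static inline void wrmsr64(uint32_t msr, uint64_t val)
    {
        asm volatile ( "wrmsr" : : "c" (msr), "a" ((uint32_t)val),
                                   "d" ((uint32_t)(val >> 32)) );
    }

    /* Watch writes anywhere in a 64-byte aligned region with one breakpoint. */
    static void watch_region(const void *buf)
    {
        unsigned int eax, ebx, ecx, edx;

        if ( !__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx) ||
             !(ecx & (1u << 26)) )                 /* DBEXT not advertised */
            return;

        asm volatile ( "mov %0,%%db0" : : "r" (buf) );
        wrmsr64(MSR_AMD64_DR0_ADDRESS_MASK, 0x3f); /* ignore low 6 address bits */
        /* DR7: enable breakpoint 0 (bit 0), break on data writes (R/W0 = 01). */
        asm volatile ( "mov %0,%%db7" : : "r" (0x1UL | (0x1UL << 16)) );
    }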

--- a/tools/libxc/xc_cpufeature.h
+++ b/tools/libxc/xc_cpufeature.h
@@ -125,6 +125,7 @@
 #define X86_FEATURE_NODEID_MSR  19 /* NodeId MSR */
 #define X86_FEATURE_TBM         21 /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT     22 /* topology extensions CPUID leafs */
+#define X86_FEATURE_DBEXT       26 /* data breakpoint extension */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx) */
 #define X86_FEATURE_FSGSBASE     0 /* {RD,WR}{FS,GS}BASE instructions */
--- a/tools/libxc/xc_cpuid_x86.c
+++ b/tools/libxc/xc_cpuid_x86.c
@@ -110,9 +110,10 @@ static void amd_xc_cpuid_policy(
                     bitmaskof(X86_FEATURE_3DNOWPREFETCH) |
                     bitmaskof(X86_FEATURE_OSVW) |
                     bitmaskof(X86_FEATURE_XOP) |
+                    bitmaskof(X86_FEATURE_LWP) |
                     bitmaskof(X86_FEATURE_FMA4) |
                     bitmaskof(X86_FEATURE_TBM) |
-                    bitmaskof(X86_FEATURE_LWP));
+                    bitmaskof(X86_FEATURE_DBEXT));
         regs[3] &= (0x0183f3ff | /* features shared with 0x00000001:EDX */
                     (is_pae ? bitmaskof(X86_FEATURE_NX) : 0) |
                     (is_64bit ? bitmaskof(X86_FEATURE_LM) : 0) |
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3082,6 +3082,9 @@ void hvm_cpuid(unsigned int input, unsig
         /* Only provide PSE36 when guest runs in 32bit PAE or in long mode */
         if ( !(hvm_pae_enabled(v) || hvm_long_mode_enabled(v)) )
             *edx &= ~cpufeat_mask(X86_FEATURE_PSE36);
+        /* Hide data breakpoint extensions if the hardware lacks support. */
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            *ecx &= ~cpufeat_mask(X86_FEATURE_DBEXT);
         break;
     }
 }
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -160,14 +160,28 @@ void svm_intercept_msr(struct vcpu *v, u
 static void svm_save_dr(struct vcpu *v)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    unsigned int flag_dr_dirty = v->arch.hvm_vcpu.flag_dr_dirty;
 
-    if ( !v->arch.hvm_vcpu.flag_dr_dirty )
+    if ( !flag_dr_dirty )
         return;
 
     /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
     v->arch.hvm_vcpu.flag_dr_dirty = 0;
     vmcb_set_dr_intercepts(vmcb, ~0u);
 
+    if ( flag_dr_dirty & 2 )
+    {
+        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW);
+        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_RW);
+        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_RW);
+        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_RW);
+
+        rdmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
+        rdmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
+        rdmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
+        rdmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
+    }
+
     v->arch.debugreg[0] = read_debugreg(0);
     v->arch.debugreg[1] = read_debugreg(1);
     v->arch.debugreg[2] = read_debugreg(2);
@@ -178,12 +192,32 @@ static void svm_save_dr(struct vcpu *v)
 
 static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v)
 {
+    unsigned int ecx;
+
     if ( v->arch.hvm_vcpu.flag_dr_dirty )
         return;
 
     v->arch.hvm_vcpu.flag_dr_dirty = 1;
     vmcb_set_dr_intercepts(vmcb, 0);
 
+    ASSERT(v == current);
+    hvm_cpuid(0x80000001, NULL, NULL, &ecx, NULL);
+    if ( test_bit(X86_FEATURE_DBEXT & 31, &ecx) )
+    {
+        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
+
+        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
+        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
+        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
+        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
+
+        /* Can't use hvm_cpuid() in svm_save_dr(): v != current. */
+        v->arch.hvm_vcpu.flag_dr_dirty |= 2;
+    }
+
     write_debugreg(0, v->arch.debugreg[0]);
     write_debugreg(1, v->arch.debugreg[1]);
     write_debugreg(2, v->arch.debugreg[2]);
@@ -355,6 +389,72 @@ static int svm_load_vmcb_ctxt(struct vcp
     return 0;
 }
 
+static unsigned int __init svm_init_msr(void)
+{
+    return boot_cpu_has(X86_FEATURE_DBEXT) ? 4 : 0;
+}
+
+static void svm_save_msr(struct vcpu *v, struct hvm_msr *ctxt)
+{
+    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+    {
+        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[0];
+        if ( ctxt->msr[ctxt->count].val )
+            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR0_ADDRESS_MASK;
+
+        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[1];
+        if ( ctxt->msr[ctxt->count].val )
+            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR1_ADDRESS_MASK;
+
+        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[2];
+        if ( ctxt->msr[ctxt->count].val )
+            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR2_ADDRESS_MASK;
+
+        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[3];
+        if ( ctxt->msr[ctxt->count].val )
+            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR3_ADDRESS_MASK;
+    }
+}
+
+static int svm_load_msr(struct vcpu *v, struct hvm_msr *ctxt)
+{
+    unsigned int i, idx;
+    int err = 0;
+
+    for ( i = 0; i < ctxt->count; ++i )
+    {
+        switch ( idx = ctxt->msr[i].index )
+        {
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                err = -ENXIO;
+            else if ( ctxt->msr[i].val >> 32 )
+                err = -EDOM;
+            else
+                v->arch.hvm_svm.dr_mask[0] = ctxt->msr[i].val;
+            break;
+
+        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                err = -ENXIO;
+            else if ( ctxt->msr[i].val >> 32 )
+                err = -EDOM;
+            else
+                v->arch.hvm_svm.dr_mask[idx - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
+                    ctxt->msr[i].val;
+            break;
+
+        default:
+            continue;
+        }
+        if ( err )
+            break;
+        ctxt->msr[i]._rsvd = 1;
+    }
+
+    return err;
+}
+
 static void svm_fpu_enter(struct vcpu *v)
 {
     struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
@@ -1451,6 +1551,8 @@ static int svm_msr_read_intercept(unsign
 
     switch ( msr )
     {
+        unsigned int ecx;
+
     case MSR_IA32_SYSENTER_CS:
         *msr_content = v->arch.hvm_svm.guest_sysenter_cs;
         break;
@@ -1526,6 +1628,21 @@ static int svm_msr_read_intercept(unsign
         vpmu_do_rdmsr(msr, msr_content);
         break;
 
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        hvm_cpuid(0x80000001, NULL, NULL, &ecx, NULL);
+        if ( !test_bit(X86_FEATURE_DBEXT & 31, &ecx) )
+            goto gpf;
+        *msr_content = v->arch.hvm_svm.dr_mask[0];
+        break;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        hvm_cpuid(0x80000001, NULL, NULL, &ecx, NULL);
+        if ( !test_bit(X86_FEATURE_DBEXT & 31, &ecx) )
+            goto gpf;
+        *msr_content =
+            v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+        break;
+
     case MSR_AMD_OSVW_ID_LENGTH:
     case MSR_AMD_OSVW_STATUS:
         ret = svm_handle_osvw(v, msr, msr_content, 1);
@@ -1594,6 +1711,8 @@ static int svm_msr_write_intercept(unsig
 
     switch ( msr )
     {
+        unsigned int ecx;
+
     case MSR_IA32_SYSENTER_CS:
         vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
         break;
@@ -1669,6 +1788,21 @@ static int svm_msr_write_intercept(unsig
          */
         break;
 
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        hvm_cpuid(0x80000001, NULL, NULL, &ecx, NULL);
+        if ( !test_bit(X86_FEATURE_DBEXT & 31, &ecx) || (msr_content >> 32) )
+            goto gpf;
+        v->arch.hvm_svm.dr_mask[0] = msr_content;
+        break;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        hvm_cpuid(0x80000001, NULL, NULL, &ecx, NULL);
+        if ( !test_bit(X86_FEATURE_DBEXT & 31, &ecx) || (msr_content >> 32) )
+            goto gpf;
+        v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
+            msr_content;
+        break;
+
     case MSR_AMD_OSVW_ID_LENGTH:
     case MSR_AMD_OSVW_STATUS:
         ret = svm_handle_osvw(v, msr, &msr_content, 0);
@@ -2022,6 +2156,9 @@ static struct hvm_function_table __initd
     .vcpu_destroy         = svm_vcpu_destroy,
     .save_cpu_ctxt        = svm_save_vmcb_ctxt,
     .load_cpu_ctxt        = svm_load_vmcb_ctxt,
+    .init_msr             = svm_init_msr,
+    .save_msr             = svm_save_msr,
+    .load_msr             = svm_load_msr,
     .get_interrupt_shadow = svm_get_interrupt_shadow,
     .set_interrupt_shadow = svm_set_interrupt_shadow,
     .guest_x86_mode       = svm_guest_x86_mode,
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -134,6 +134,7 @@
 #define X86_FEATURE_NODEID_MSR  (6*32+19) /* NodeId MSR */
 #define X86_FEATURE_TBM         (6*32+21) /* trailing bit manipulations */
 #define X86_FEATURE_TOPOEXT     (6*32+22) /* topology extensions CPUID leafs */
+#define X86_FEATURE_DBEXT       (6*32+26) /* data breakpoint extension */
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 7 */
 #define X86_FEATURE_FSGSBASE	(7*32+ 0) /* {RD,WR}{FS,GS}BASE instructions */
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -515,6 +515,9 @@ struct arch_svm_struct {
     uint64_t guest_lwp_cfg;      /* guest version */
     uint64_t cpu_lwp_cfg;        /* CPU version */
 
+    /* data breakpoint extension MSRs */
+    uint32_t dr_mask[4];
+
     /* OSVW MSRs */
     struct {
         u64 length;
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -140,7 +140,7 @@ struct hvm_vcpu {
 
     int                 xen_port;
 
-    bool_t              flag_dr_dirty;
+    u8                  flag_dr_dirty;
     bool_t              debug_state_latch;
     bool_t              single_step;
 
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -206,6 +206,11 @@
 #define MSR_AMD64_DC_CFG		0xc0011022
 #define AMD64_NB_CFG_CF8_EXT_ENABLE_BIT	46
 
+#define MSR_AMD64_DR0_ADDRESS_MASK	0xc0011027
+#define MSR_AMD64_DR1_ADDRESS_MASK	0xc0011019
+#define MSR_AMD64_DR2_ADDRESS_MASK	0xc001101a
+#define MSR_AMD64_DR3_ADDRESS_MASK	0xc001101b
+
 /* AMD Family10h machine check MSRs */
 #define MSR_F10_MC4_MISC1		0xc0000408
 #define MSR_F10_MC4_MISC2		0xc0000409




* [PATCH RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-19  9:16 ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Jan Beulich
  2014-03-19  9:46   ` [PATCH RFC 1/4] x86/SVM: support data breakpoint extension registers Jan Beulich
@ 2014-03-19  9:47   ` Jan Beulich
  2014-03-19  9:47   ` [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs Jan Beulich
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 28+ messages in thread
From: Jan Beulich @ 2014-03-19  9:47 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Ian Jackson, Ian Campbell, suravee.suthikulpanit,
	Sherry Hurwitz


Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
to the generic MSR save/restore logic recently added for HVM.

This also moves some debug-register-related declarations/definitions
to the header intended for them.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: libxc adjustment still missing (want to get some basic feedback on
     the domctl extension first)
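
The libxc adjustment is still TBD (see above); purely for illustration, a
hypothetical toolstack helper could drive the extended domctl roughly as
sketched below, using only the fields added by this patch. The helper name
is made up and the libxc bounce-buffer plumbing is simplified; it shows the
intended flow, not the eventual libxc code.

    /* Hypothetical libxc-internal helper (would live next to do_domctl()). */
    static int get_vcpu_dbext_msrs(xc_interface *xch, uint32_t domid,
                                   uint32_t vcpu,
                                   xen_domctl_ext_vcpu_msr_t msrs[],
                                   uint16_t *count)
    {
        DECLARE_DOMCTL;
        DECLARE_HYPERCALL_BOUNCE(msrs, *count * sizeof(*msrs),
                                 XC_HYPERCALL_BUFFER_BOUNCE_BOTH);
        int rc;

        if ( xc_hypercall_bounce_pre(xch, msrs) )
            return -1;

        domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
        domctl.domain = domid;
        domctl.u.ext_vcpucontext.vcpu = vcpu;
        domctl.u.ext_vcpucontext.msr_count = *count;  /* buffer capacity */
        set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);

        rc = do_domctl(xch, &domctl);  /* errno == ENOBUFS if capacity too small */
        *count = domctl.u.ext_vcpucontext.msr_count;  /* MSRs actually present */

        xc_hypercall_bounce_post(xch, msrs);
        return rc;
    }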

--- a/xen/arch/x86/acpi/suspend.c
+++ b/xen/arch/x86/acpi/suspend.c
@@ -9,6 +9,7 @@
 #include <xen/smp.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
+#include <asm/debugreg.h>
 #include <asm/flushtlb.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1316,14 +1316,7 @@ static void paravirt_ctxt_switch_to(stru
         write_cr4(cr4);
 
     if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
-    {
-        write_debugreg(0, v->arch.debugreg[0]);
-        write_debugreg(1, v->arch.debugreg[1]);
-        write_debugreg(2, v->arch.debugreg[2]);
-        write_debugreg(3, v->arch.debugreg[3]);
-        write_debugreg(6, v->arch.debugreg[6]);
-        write_debugreg(7, v->arch.debugreg[7]);
-    }
+        activate_debugregs(v);
 
     if ( (v->domain->arch.tsc_mode ==  TSC_MODE_PVRDTSCP) &&
          boot_cpu_has(X86_FEATURE_RDTSCP) )
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -52,6 +52,7 @@ long arch_do_domctl(
 {
     long ret = 0;
     bool_t copyback = 0;
+    unsigned long i;
 
     switch ( domctl->cmd )
     {
@@ -319,7 +320,6 @@ long arch_do_domctl(
 
     case XEN_DOMCTL_getmemlist:
     {
-        int i;
         unsigned long max_pfns = domctl->u.getmemlist.max_pfns;
         uint64_t mfn;
         struct page_info *page;
@@ -645,7 +645,6 @@ long arch_do_domctl(
         unsigned long mfn = domctl->u.memory_mapping.first_mfn;
         unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns;
         int add = domctl->u.memory_mapping.add_mapping;
-        unsigned long i;
 
         ret = -EINVAL;
         if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
@@ -809,6 +808,7 @@ long arch_do_domctl(
     {
         struct xen_domctl_ext_vcpucontext *evc;
         struct vcpu *v;
+        struct xen_domctl_ext_vcpu_msr msr;
 
         evc = &domctl->u.ext_vcpucontext;
 
@@ -854,7 +854,42 @@ long arch_do_domctl(
             evc->vmce.mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2;
             evc->vmce.mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2;
 
-            ret = 0;
+            i = ret = 0;
+            if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+            {
+                unsigned int j;
+
+                if ( v->arch.pv_vcpu.dr_mask[0] )
+                {
+                    if ( i < evc->msr_count && !ret )
+                    {
+                        msr.index = MSR_AMD64_DR0_ADDRESS_MASK;
+                        msr.reserved = 0;
+                        msr.value = v->arch.pv_vcpu.dr_mask[0];
+                        if ( copy_to_guest_offset(evc->msrs, i, &msr, 1) )
+                            ret = -EFAULT;
+                    }
+                    ++i;
+                }
+                for ( j = 0; j < 3; ++j )
+                {
+                    if ( !v->arch.pv_vcpu.dr_mask[1 + j] )
+                        continue;
+                    if ( i < evc->msr_count && !ret )
+                    {
+                        msr.index = MSR_AMD64_DR1_ADDRESS_MASK + j;
+                        msr.reserved = 0;
+                        msr.value = v->arch.pv_vcpu.dr_mask[1 + j];
+                        if ( copy_to_guest_offset(evc->msrs, i, &msr, 1) )
+                            ret = -EFAULT;
+                    }
+                    ++i;
+                }
+            }
+            if ( i > evc->msr_count && !ret )
+                ret = -ENOBUFS;
+            evc->msr_count = i;
+
             vcpu_unpause(v);
             copyback = 1;
         }
@@ -909,9 +944,49 @@ long arch_do_domctl(
 
                 ret = vmce_restore_vcpu(v, &vmce);
             }
+            else if ( evc->size > offsetof(typeof(*evc), vmce) )
+                ret = -EINVAL;
             else
                 ret = 0;
 
+            if ( ret || evc->size <= offsetof(typeof(*evc), msrs) )
+                /* nothing */;
+            else if ( evc->size < offsetof(typeof(*evc), msrs) +
+                                  sizeof(evc->msrs) )
+                ret = -EINVAL;
+            else
+            {
+                for ( i = 0; i < evc->msr_count; ++i )
+                {
+                    ret = -EFAULT;
+                    if ( copy_from_guest_offset(&msr, evc->msrs, i, 1) )
+                        break;
+                    ret = -EINVAL;
+                    if ( msr.reserved )
+                        break;
+                    switch ( msr.index )
+                    {
+                    case MSR_AMD64_DR0_ADDRESS_MASK:
+                        if ( !boot_cpu_has(X86_FEATURE_DBEXT) ||
+                             (msr.value >> 32) )
+                            break;
+                        v->arch.pv_vcpu.dr_mask[0] = msr.value;
+                        continue;
+                    case MSR_AMD64_DR1_ADDRESS_MASK ...
+                         MSR_AMD64_DR3_ADDRESS_MASK:
+                        if ( !boot_cpu_has(X86_FEATURE_DBEXT) ||
+                             (msr.value >> 32) )
+                            break;
+                        msr.index -= MSR_AMD64_DR1_ADDRESS_MASK - 1;
+                        v->arch.pv_vcpu.dr_mask[msr.index] = msr.value;
+                        continue;
+                    }
+                    break;
+                }
+                if ( i == evc->msr_count )
+                    ret = 0;
+            }
+
             domain_unpause(d);
         }
     }
@@ -921,7 +996,6 @@ long arch_do_domctl(
     {
         xen_domctl_cpuid_t *ctl = &domctl->u.cpuid;
         cpuid_input_t *cpuid = NULL; 
-        int i;
 
         for ( i = 0; i < MAX_CPUID_INPUT; i++ )
         {
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2498,6 +2498,23 @@ static int emulate_privileged_op(struct 
             if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
                 goto fail;
             break;
+
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
+                goto fail;
+            v->arch.pv_vcpu.dr_mask[0] = msr_content;
+            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
+                wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, msr_content);
+            break;
+        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
+                goto fail;
+            v->arch.pv_vcpu.dr_mask
+                [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1] = msr_content;
+            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
+                wrmsrl(regs->_ecx, msr_content);
+            break;
+
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
                 break;
@@ -2585,6 +2602,21 @@ static int emulate_privileged_op(struct 
             regs->eax = (uint32_t)msr_content;
             regs->edx = (uint32_t)(msr_content >> 32);
             break;
+
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                goto fail;
+            regs->eax = v->arch.pv_vcpu.dr_mask[0];
+            regs->edx = 0;
+            break;
+        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                goto fail;
+            regs->eax = v->arch.pv_vcpu.dr_mask
+                            [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+            regs->edx = 0;
+            break;
+
         default:
             if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
             {
@@ -3628,7 +3660,27 @@ long do_set_trap_table(XEN_GUEST_HANDLE_
     return rc;
 }
 
-long set_debugreg(struct vcpu *v, int reg, unsigned long value)
+void activate_debugregs(const struct vcpu *curr)
+{
+    ASSERT(curr == current);
+
+    write_debugreg(0, curr->arch.debugreg[0]);
+    write_debugreg(1, curr->arch.debugreg[1]);
+    write_debugreg(2, curr->arch.debugreg[2]);
+    write_debugreg(3, curr->arch.debugreg[3]);
+    write_debugreg(6, curr->arch.debugreg[6]);
+    write_debugreg(7, curr->arch.debugreg[7]);
+
+    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+    {
+        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]);
+        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]);
+        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]);
+        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]);
+    }
+}
+
+long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
 {
     int i;
     struct vcpu *curr = current;
@@ -3709,11 +3761,8 @@ long set_debugreg(struct vcpu *v, int re
             if ( (v == curr) &&
                  !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
             {
-                write_debugreg(0, v->arch.debugreg[0]);
-                write_debugreg(1, v->arch.debugreg[1]);
-                write_debugreg(2, v->arch.debugreg[2]);
-                write_debugreg(3, v->arch.debugreg[3]);
-                write_debugreg(6, v->arch.debugreg[6]);
+                activate_debugregs(curr);
+                break;
             }
         }
         if ( v == curr )
--- a/xen/include/asm-x86/debugreg.h
+++ b/xen/include/asm-x86/debugreg.h
@@ -64,4 +64,16 @@
 #define DR_GLOBAL_EXACT_ENABLE   (0x00000200ul) /* Global exact enable */
 #define DR_GENERAL_DETECT        (0x00002000ul) /* General detect enable */
 
+#define write_debugreg(reg, val) do {                       \
+    unsigned long __val = val;                              \
+    asm volatile ( "mov %0,%%db" #reg : : "r" (__val) );    \
+} while (0)
+#define read_debugreg(reg) ({                               \
+    unsigned long __val;                                    \
+    asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
+    __val;                                                  \
+})
+long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value);
+void activate_debugregs(const struct vcpu *);
+
 #endif /* _X86_DEBUGREG_H */
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -374,6 +374,9 @@ struct pv_vcpu
     unsigned long shadow_ldt_mapcnt;
     spinlock_t shadow_ldt_lock;
 
+    /* data breakpoint extension MSRs */
+    uint32_t dr_mask[4];
+
     /* Deferred VA-based update state. */
     bool_t need_update_runstate_area;
     struct vcpu_time_info pending_system_time;
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -462,17 +462,6 @@ long set_gdt(struct vcpu *d, 
              unsigned long *frames, 
              unsigned int entries);
 
-#define write_debugreg(reg, val) do {                       \
-    unsigned long __val = val;                              \
-    asm volatile ( "mov %0,%%db" #reg : : "r" (__val) );    \
-} while (0)
-#define read_debugreg(reg) ({                               \
-    unsigned long __val;                                    \
-    asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
-    __val;                                                  \
-})
-long set_debugreg(struct vcpu *p, int reg, unsigned long value);
-
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 static always_inline void rep_nop(void)
 {
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -36,7 +36,7 @@
 #include "grant_table.h"
 #include "hvm/save.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000009
+#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
 
 /*
  * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
@@ -563,6 +563,16 @@ typedef struct xen_domctl_pin_mem_cachea
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
 
 
+#if defined(__i386__) || defined(__x86_64__)
+struct xen_domctl_ext_vcpu_msr {
+    uint32_t         index;
+    uint32_t         reserved;
+    uint64_aligned_t value;
+};
+typedef struct xen_domctl_ext_vcpu_msr xen_domctl_ext_vcpu_msr_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpu_msr_t);
+#endif
+
 /* XEN_DOMCTL_set_ext_vcpucontext */
 /* XEN_DOMCTL_get_ext_vcpucontext */
 struct xen_domctl_ext_vcpucontext {
@@ -582,6 +592,7 @@ struct xen_domctl_ext_vcpucontext {
     uint16_t         sysenter_callback_cs;
     uint8_t          syscall32_disables_events;
     uint8_t          sysenter_disables_events;
+    uint16_t         msr_count;
 #if defined(__GNUC__)
     union {
         uint64_aligned_t mcg_cap;
@@ -590,6 +601,7 @@ struct xen_domctl_ext_vcpucontext {
 #else
     struct hvm_vmce_vcpu vmce;
 #endif
+    XEN_GUEST_HANDLE_64(xen_domctl_ext_vcpu_msr_t) msrs;
 #endif
 };
 typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;




* [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-03-19  9:16 ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Jan Beulich
  2014-03-19  9:46   ` [PATCH RFC 1/4] x86/SVM: support data breakpoint extension registers Jan Beulich
  2014-03-19  9:47   ` [PATCH RFC 2/4] x86/PV: " Jan Beulich
@ 2014-03-19  9:47   ` Jan Beulich
       [not found]     ` <CAAAAutDJ3CzvGp9jWCS7=b3rQsG81+w_W7qXLKxDWB7QBU23DQ@mail.gmail.com>
  2014-03-19  9:48   ` [PATCH RFC 4/4] x86/AMD: clean up pre-canned family/revision handling for CPUID masking Jan Beulich
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-19  9:47 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Ian Jackson, Ian Campbell, suravee.suthikulpanit,
	Sherry Hurwitz

Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
sub-leaf 0 EAX and EBX.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

TBD:
- Fam15M0x is documented to not have MSR C0011002 despite CPUID[7,0].EBX != 0 from model 02
  onwards, and contrary to what I see on the system I have access to (which is model 02)
- Fam12, Fam14, and Fam16 are documented to not have MSR C0011003 despite CPUID[6].ECX != 0
- Fam11 is documented to not even have MSR C0011004 and C0011005
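
As a usage illustration (not part of the patch; the bit position is taken
from the CPUID specification, not from this series): to stop a newer host
from advertising BMI1 (leaf 7 sub-leaf 0 EBX bit 3) so it matches the older
hosts in a pool, the Xen command line could carry

    cpuid_mask_l7s0_ebx=0xfffffff7

with all other mask options left at their all-ones default.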

--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -331,24 +331,42 @@ Indicate where the responsibility for dr
 ### cpuid\_mask\_cpu (AMD only)
 > `= fam_0f_rev_c | fam_0f_rev_d | fam_0f_rev_e | fam_0f_rev_f | fam_0f_rev_g | fam_10_rev_b | fam_10_rev_c | fam_11_rev_b`
 
-If the other **cpuid\_mask\_{,ext\_}e{c,d}x** options are fully set
-(unspecified on the command line), specify a pre-canned cpuid mask to
-mask the current processor down to appear as the specified processor.
-It is important to ensure that all hosts in a pool appear the same to
-guests to allow successful live migration.
+If the other **cpuid\_mask\_{,ext\_,thermal\_,l7s0\_}e{a,b,c,d}x**
+options are fully set (unspecified on the command line), specify a
+pre-canned cpuid mask to mask the current processor down to appear as
+the specified processor. It is important to ensure that all hosts in a
+pool appear the same to guests to allow successful live migration.
 
-### cpuid\_mask\_ ecx,edx,ext\_ecx,ext\_edx,xsave_eax
+### cpuid\_mask\_{{,ext\_}ecx,edx}
 > `= <integer>`
 
 > Default: `~0` (all bits set)
 
-These five command line parameters are used to specify cpuid masks to
+These four command line parameters are used to specify cpuid masks to
 help with cpuid levelling across a pool of hosts.  Setting a bit in
 the mask indicates that the feature should be enabled, while clearing
 a bit in the mask indicates that the feature should be disabled.  It
 is important to ensure that all hosts in a pool appear the same to
 guests to allow successful live migration.
 
+### cpuid\_mask\_xsave\_eax (Intel only)
+> `= <integer>`
+
+> Default: `~0` (all bits set)
+
+This command line parameter is also used to specify a cpuid mask to
+help with cpuid levelling across a pool of hosts.  See the description
+of the other respective options above.
+
+### cpuid\_mask\_{l7s0\_{eax,ebx},thermal\_ecx} (AMD only)
+> `= <integer>`
+
+> Default: `~0` (all bits set)
+
+These three command line parameters are also used to specify cpuid
+masks to help with cpuid levelling across a pool of hosts.  See the
+description of the other respective options above.
+
 ### cpuidle
 > `= <boolean>`
 
--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -30,9 +30,17 @@
  * "fam_10_rev_c"
  * "fam_11_rev_b"
  */
-static char opt_famrev[14];
+static char __initdata opt_famrev[14];
 string_param("cpuid_mask_cpu", opt_famrev);
 
+static unsigned int __initdata opt_cpuid_mask_l7s0_eax = ~0u;
+integer_param("cpuid_mask_l7s0_eax", opt_cpuid_mask_l7s0_eax);
+static unsigned int __initdata opt_cpuid_mask_l7s0_ebx = ~0u;
+integer_param("cpuid_mask_l7s0_ebx", opt_cpuid_mask_l7s0_ebx);
+
+static unsigned int __initdata opt_cpuid_mask_thermal_ecx = ~0u;
+integer_param("cpuid_mask_thermal_ecx", opt_cpuid_mask_thermal_ecx);
+
 /* 1 = allow, 0 = don't allow guest creation, -1 = don't allow boot */
 s8 __read_mostly opt_allow_unsafe;
 boolean_param("allow_unsafe", opt_allow_unsafe);
@@ -96,7 +104,11 @@ static void __devinit set_cpuidmask(cons
 {
 	static unsigned int feat_ecx, feat_edx;
 	static unsigned int extfeat_ecx, extfeat_edx;
+	static unsigned int l7s0_eax, l7s0_ebx;
+	static unsigned int thermal_ecx;
+	static bool_t skip_l7s0_eax_ebx, skip_thermal_ecx;
 	static enum { not_parsed, no_mask, set_mask } status;
+	unsigned int eax, ebx, ecx, edx;
 
 	if (status == no_mask)
 		return;
@@ -104,15 +116,20 @@ static void __devinit set_cpuidmask(cons
 	if (status == set_mask)
 		goto setmask;
 
-	ASSERT((status == not_parsed) && (smp_processor_id() == 0));
+	ASSERT((status == not_parsed) && (c == &boot_cpu_data));
 	status = no_mask;
 
 	if (~(opt_cpuid_mask_ecx & opt_cpuid_mask_edx &
-	      opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx)) {
+	      opt_cpuid_mask_ext_ecx & opt_cpuid_mask_ext_edx &
+	      opt_cpuid_mask_l7s0_eax & opt_cpuid_mask_l7s0_ebx &
+	      opt_cpuid_mask_thermal_ecx)) {
 		feat_ecx = opt_cpuid_mask_ecx;
 		feat_edx = opt_cpuid_mask_edx;
 		extfeat_ecx = opt_cpuid_mask_ext_ecx;
 		extfeat_edx = opt_cpuid_mask_ext_edx;
+		l7s0_eax = opt_cpuid_mask_l7s0_eax;
+		l7s0_ebx = opt_cpuid_mask_l7s0_ebx;
+		thermal_ecx = opt_cpuid_mask_thermal_ecx;
 	} else if (*opt_famrev == '\0') {
 		return;
 	} else if (!strcmp(opt_famrev, "fam_0f_rev_c")) {
@@ -175,12 +192,39 @@ static void __devinit set_cpuidmask(cons
 	printk("Writing CPUID extended feature mask ECX:EDX -> %08Xh:%08Xh\n", 
 	       extfeat_ecx, extfeat_edx);
 
+	if (c->cpuid_level >= 7)
+		cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+	else
+		ebx = eax = 0;
+	if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx)) {
+		if (l7s0_eax > eax)
+			l7s0_eax = eax;
+		l7s0_ebx &= ebx;
+		printk("Writing CPUID leaf 7 subleaf 0 feature mask EAX:EBX -> %08Xh:%08Xh\n",
+		       l7s0_eax, l7s0_ebx);
+	} else
+		skip_l7s0_eax_ebx = 1;
+
+	ecx = c->cpuid_level >= 6 ? cpuid_ecx(6) : 0;
+	if (ecx && ~thermal_ecx) {
+		thermal_ecx &= ecx;
+		printk("Writing CPUID thermal/power feature mask ECX -> %08Xh\n",
+		       thermal_ecx);
+	} else
+		skip_thermal_ecx = 1;
+
  setmask:
 	/* FIXME check if processor supports CPUID masking */
 	/* AMD processors prior to family 10h required a 32-bit password */
 	if (c->x86 >= 0x10) {
 		wrmsr(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
 		wrmsr(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
+		if (!skip_l7s0_eax_ebx)
+			wrmsr(MSR_AMD_L7S0_FEATURE_MASK, l7s0_ebx, l7s0_eax);
+		if (!skip_thermal_ecx) {
+			rdmsr(MSR_AMD_THRM_FEATURE_MASK, eax, edx);
+			wrmsr(MSR_AMD_THRM_FEATURE_MASK, thermal_ecx, edx);
+		}
 	} else {
 		wrmsr_amd(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
 		wrmsr_amd(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -192,6 +192,8 @@
 #define MSR_AMD_FAM15H_EVNTSEL5		0xc001020a
 #define MSR_AMD_FAM15H_PERFCTR5		0xc001020b
 
+#define MSR_AMD_L7S0_FEATURE_MASK	0xc0011002
+#define MSR_AMD_THRM_FEATURE_MASK	0xc0011003
 #define MSR_K8_FEATURE_MASK		0xc0011004
 #define MSR_K8_EXT_FEATURE_MASK		0xc0011005
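
As an illustration of the intended use of the new options (example value
only, not taken from the patch): to hide, say, BMI1 (CPUID leaf 7 sub-leaf
0, EBX bit 3) from guests across a pool, each host would boot with

    cpuid_mask_l7s0_ebx=0xfffffff7

i.e. all mask bits set except the one for the feature being suppressed,
matching the documented convention that a clear bit disables a feature.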

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH RFC 4/4] x86/AMD: clean up pre-canned family/revision handling for CPUID masking
  2014-03-19  9:16 ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Jan Beulich
                     ` (2 preceding siblings ...)
  2014-03-19  9:47   ` [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs Jan Beulich
@ 2014-03-19  9:48   ` Jan Beulich
  2014-03-26 14:13   ` [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers Jan Beulich
       [not found]   ` <CAAAAutAgdSJqYEjc4dtixuTS2S=PUx-L4Sy=JCQRZ57rTdoPCQ@mail.gmail.com>
  5 siblings, 0 replies; 28+ messages in thread
From: Jan Beulich @ 2014-03-19  9:48 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Ian Jackson, Ian Campbell, suravee.suthikulpanit,
	Sherry Hurwitz

[-- Attachment #1: Type: text/plain, Size: 4118 bytes --]

Make it so this is easier to extend, and move the parsing code/data
into .init.* sections.
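
For illustration, a further pre-canned mask then becomes a single table
entry: assuming (hypothetically) that matching AMD_FEATURES_FAM12h_REV_A_*
and AMD_EXTFEATURES_FAM12h_REV_A_* constants were defined, "fam_12_rev_a"
could be accepted just by adding

	CAN_FAM(12, A),

to the pre_canned[] array introduced below.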

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/cpu/amd.c
+++ b/xen/arch/x86/cpu/amd.c
@@ -91,6 +91,51 @@ static inline int wrmsr_amd_safe(unsigne
 	return err;
 }
 
+static const struct cpuidmask {
+	uint16_t fam;
+	char rev[2];
+	unsigned int ecx, edx, ext_ecx, ext_edx;
+} pre_canned[] __initconst = {
+#define CAN(fam, id, rev) { \
+		fam, #rev, \
+		AMD_FEATURES_##id##_REV_##rev##_ECX, \
+		AMD_FEATURES_##id##_REV_##rev##_EDX, \
+		AMD_EXTFEATURES_##id##_REV_##rev##_ECX, \
+		AMD_EXTFEATURES_##id##_REV_##rev##_EDX \
+	}
+#define CAN_FAM(fam, rev) CAN(0x##fam, FAM##fam##h, rev)
+#define CAN_K8(rev)       CAN(0x0f,    K8,          rev)
+	CAN_FAM(11, B),
+	CAN_FAM(10, C),
+	CAN_FAM(10, B),
+	CAN_K8(G),
+	CAN_K8(F),
+	CAN_K8(E),
+	CAN_K8(D),
+	CAN_K8(C)
+#undef CAN
+};
+
+static const struct cpuidmask *__init noinline get_cpuidmask(const char *opt)
+{
+	unsigned long fam;
+	char rev;
+	unsigned int i;
+
+	if (strncmp(opt, "fam_", 4))
+		return NULL;
+	fam = simple_strtoul(opt + 4, &opt, 16);
+	if (strncmp(opt, "_rev_", 5))
+		return NULL;
+	rev = toupper(opt[5]);
+
+	for (i = 0; i < ARRAY_SIZE(pre_canned); ++i)
+		if (fam == pre_canned[i].fam && rev == *pre_canned[i].rev)
+			return &pre_canned[i];
+
+	return NULL;
+}
+
 /*
  * Mask the features and extended features returned by CPUID.  Parameters are
  * set from the boot line via two methods:
@@ -132,50 +177,18 @@ static void __devinit set_cpuidmask(cons
 		thermal_ecx = opt_cpuid_mask_thermal_ecx;
 	} else if (*opt_famrev == '\0') {
 		return;
-	} else if (!strcmp(opt_famrev, "fam_0f_rev_c")) {
-		feat_ecx = AMD_FEATURES_K8_REV_C_ECX;
-		feat_edx = AMD_FEATURES_K8_REV_C_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_K8_REV_C_ECX;
-		extfeat_edx = AMD_EXTFEATURES_K8_REV_C_EDX;
-	} else if (!strcmp(opt_famrev, "fam_0f_rev_d")) {
-		feat_ecx = AMD_FEATURES_K8_REV_D_ECX;
-		feat_edx = AMD_FEATURES_K8_REV_D_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_K8_REV_D_ECX;
-		extfeat_edx = AMD_EXTFEATURES_K8_REV_D_EDX;
-	} else if (!strcmp(opt_famrev, "fam_0f_rev_e")) {
-		feat_ecx = AMD_FEATURES_K8_REV_E_ECX;
-		feat_edx = AMD_FEATURES_K8_REV_E_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_K8_REV_E_ECX;
-		extfeat_edx = AMD_EXTFEATURES_K8_REV_E_EDX;
-	} else if (!strcmp(opt_famrev, "fam_0f_rev_f")) {
-		feat_ecx = AMD_FEATURES_K8_REV_F_ECX;
-		feat_edx = AMD_FEATURES_K8_REV_F_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_K8_REV_F_ECX;
-		extfeat_edx = AMD_EXTFEATURES_K8_REV_F_EDX;
-	} else if (!strcmp(opt_famrev, "fam_0f_rev_g")) {
-		feat_ecx = AMD_FEATURES_K8_REV_G_ECX;
-		feat_edx = AMD_FEATURES_K8_REV_G_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_K8_REV_G_ECX;
-		extfeat_edx = AMD_EXTFEATURES_K8_REV_G_EDX;
-	} else if (!strcmp(opt_famrev, "fam_10_rev_b")) {
-		feat_ecx = AMD_FEATURES_FAM10h_REV_B_ECX;
-		feat_edx = AMD_FEATURES_FAM10h_REV_B_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_B_ECX;
-		extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_B_EDX;
-	} else if (!strcmp(opt_famrev, "fam_10_rev_c")) {
-		feat_ecx = AMD_FEATURES_FAM10h_REV_C_ECX;
-		feat_edx = AMD_FEATURES_FAM10h_REV_C_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_FAM10h_REV_C_ECX;
-		extfeat_edx = AMD_EXTFEATURES_FAM10h_REV_C_EDX;
-	} else if (!strcmp(opt_famrev, "fam_11_rev_b")) {
-		feat_ecx = AMD_FEATURES_FAM11h_REV_B_ECX;
-		feat_edx = AMD_FEATURES_FAM11h_REV_B_EDX;
-		extfeat_ecx = AMD_EXTFEATURES_FAM11h_REV_B_ECX;
-		extfeat_edx = AMD_EXTFEATURES_FAM11h_REV_B_EDX;
 	} else {
-		printk("Invalid processor string: %s\n", opt_famrev);
-		printk("CPUID will not be masked\n");
-		return;
+		const struct cpuidmask *m = get_cpuidmask(opt_famrev);
+
+		if (!m) {
+			printk("Invalid processor string: %s\n", opt_famrev);
+			printk("CPUID will not be masked\n");
+			return;
+		}
+		feat_ecx = m->ecx;
+		feat_edx = m->edx;
+		extfeat_ecx = m->ext_ecx;
+		extfeat_edx = m->ext_edx;
 	}
 
         /* Setting bits in the CPUID mask MSR that are not set in the




^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-19  9:16 ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Jan Beulich
                     ` (3 preceding siblings ...)
  2014-03-19  9:48   ` [PATCH RFC 4/4] x86/AMD: clean up pre-canned family/revision handling for CPUID masking Jan Beulich
@ 2014-03-26 14:13   ` Jan Beulich
  2014-03-27 15:28     ` Ian Campbell
       [not found]   ` <CAAAAutAgdSJqYEjc4dtixuTS2S=PUx-L4Sy=JCQRZ57rTdoPCQ@mail.gmail.com>
  5 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-26 14:13 UTC (permalink / raw)
  To: xen-devel
  Cc: Keir Fraser, Ian Jackson, Ian Campbell, suravee.suthikulpanit,
	Sherry Hurwitz

[-- Attachment #1: Type: text/plain, Size: 19156 bytes --]

Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
to the generic MSR save/restore logic recently added for HVM.

This also moves some debug register related declarations/definition to
the header intended for these.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: libxc adjustment put in place (depending on the separately posted
    http://lists.xenproject.org/archives/html/xen-devel/2014-03/msg03059.html)
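
A rough sketch (illustrative only, error handling elided) of the intended
two-pass use from the toolstack side, mirroring the xc_domain_save()
changes below:

    memset(&domctl.u, 0, sizeof(domctl.u));
    domctl.cmd = XEN_DOMCTL_get_ext_vcpucontext;
    domctl.domain = dom;
    domctl.u.ext_vcpucontext.vcpu = i;
    /* First call with no MSR buffer: fails with ENOBUFS when any address
     * mask MSRs are in use, reporting the required count in msr_count. */
    if ( xc_domctl(xch, &domctl) < 0 && errno == ENOBUFS )
    {
        msrs = xc_hypercall_buffer_alloc(xch, msrs,
                                         domctl.u.ext_vcpucontext.msr_count *
                                         sizeof(*msrs));
        set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
        /* Second call fills the buffer with {index, value} pairs. */
        xc_domctl(xch, &domctl);
    }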

--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -590,8 +590,13 @@ static int buffer_tail_pv(xc_interface *
                           uint32_t vcpuextstate_size)
 {
     unsigned int i;
-    size_t pfnlen, vcpulen;
+    size_t pfnlen, vcpulen, total;
+    int alloc = 0;
     struct domain_info_context *dinfo = &ctx->dinfo;
+    union {
+        const unsigned char *raw;
+        const xen_domctl_ext_vcpucontext_t *evc;
+    } ptr;
 
     /* TODO: handle changing pfntab and vcpu counts */
     /* PFN tab */
@@ -634,11 +639,36 @@ static int buffer_tail_pv(xc_interface *
             ERROR("Error allocating VCPU ctxt tail buffer");
             goto free_pfntab;
         }
+        alloc = 1;
     }
     // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
-    if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) {
-        PERROR("Error when reading ctxt");
-        goto free_vcpus;
+    for (total = i = 0, ptr.raw = buf->vcpubuf; ext_vcpucontext; ) {
+        if ( RDEXACT(fd, buf->vcpubuf + total, vcpulen) ) {
+            PERROR("Error when reading ctxt");
+            goto free_vcpus;
+        }
+        total += vcpulen;
+        for (vcpulen = 0; i < buf->vcpucount; ++i) {
+            size_t msrlen;
+
+            if ((const unsigned char *)(ptr.evc + 1) > buf->vcpubuf + total)
+                break;
+            msrlen = ptr.evc->msr_count * sizeof(xen_domctl_ext_vcpu_msr_t);
+            vcpulen += msrlen;
+            ptr.raw += 128 + msrlen + vcpuextstate_size;
+        }
+        if (!vcpulen)
+            break;
+        if (alloc) {
+            void *nbuf = realloc(buf->vcpubuf, total + vcpulen);
+
+            if (!nbuf) {
+                ERROR("Error growing VCPU ctxt tail buffer");
+                goto free_vcpus;
+            }
+            ptr.raw = nbuf + (ptr.raw - buf->vcpubuf);
+            buf->vcpubuf = nbuf;
+        }
     }
 
     /* load shared_info_page */
@@ -1996,6 +2026,8 @@ int xc_domain_restore(xc_interface *xch,
     vcpup = tailbuf.u.pv.vcpubuf;
     for ( i = 0; i <= max_vcpu_id; i++ )
     {
+        DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
+
         if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
             continue;
 
@@ -2130,9 +2162,25 @@ int xc_domain_restore(xc_interface *xch,
             goto vcpu_ext_state_restore;
         memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
         vcpup += 128;
+        if ( domctl.u.ext_vcpucontext.msr_count )
+        {
+            size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
+
+            msrs = xc_hypercall_buffer_alloc(xch, msrs, sz);
+            if ( !msrs )
+            {
+                PERROR("No memory for vcpu%d MSRs", i);
+                goto out;
+            }
+            memcpy(msrs, vcpup, sz);
+            vcpup += sz;
+            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
+        }
         domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
         domctl.domain = dom;
         frc = xc_domctl(xch, &domctl);
+        if ( msrs )
+            xc_hypercall_buffer_free(xch, msrs);
         if ( frc != 0 )
         {
             PERROR("Couldn't set extended vcpu%d info", i);
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -836,6 +836,9 @@ int xc_domain_save(xc_interface *xch, in
     /* base of the region in which domain memory is mapped */
     unsigned char *region_base = NULL;
 
+    /* MSR extensions to xen_domctl_ext_vcpucontext_t */
+    DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
+
     /* A copy of the CPU eXtended States of the guest. */
     DECLARE_HYPERCALL_BUFFER(void, buffer);
 
@@ -1960,16 +1963,36 @@ int xc_domain_save(xc_interface *xch, in
         domctl.domain = dom;
         memset(&domctl.u, 0, sizeof(domctl.u));
         domctl.u.ext_vcpucontext.vcpu = i;
-        if ( xc_domctl(xch, &domctl) < 0 )
+        frc = xc_domctl(xch, &domctl);
+        if ( frc < 0 && errno == ENOBUFS && domctl.u.ext_vcpucontext.msr_count )
+        {
+            msrs = xc_hypercall_buffer_alloc(xch, msrs,
+                                             domctl.u.ext_vcpucontext.msr_count *
+                                             sizeof(*msrs));
+            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
+            frc = msrs ? xc_domctl(xch, &domctl) : -1;
+        }
+        if ( frc < 0 )
         {
             PERROR("No extended context for VCPU%d", i);
             goto out;
         }
         if ( wrexact(io_fd, &domctl.u.ext_vcpucontext, 128) )
         {
-            PERROR("Error when writing to state file (2)");
+            PERROR("Error when writing to state file (ext ctxt)");
             goto out;
         }
+        if ( msrs )
+        {
+            if ( wrexact(io_fd, msrs,
+                         domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs)) )
+            {
+                PERROR("Error when writing to state file (MSRs)");
+                goto out;
+            }
+            xc_hypercall_buffer_free(xch, msrs);
+            msrs = NULL;
+        }
 
         /* Start to fetch CPU eXtended States */
         /* Get buffer size first */
@@ -2134,6 +2157,8 @@ int xc_domain_save(xc_interface *xch, in
 
     xc_hypercall_buffer_free_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
     xc_hypercall_buffer_free_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
+    if (msrs)
+        xc_hypercall_buffer_free(xch, msrs);
 
     free(pfn_type);
     free(pfn_batch);
--- a/xen/arch/x86/acpi/suspend.c
+++ b/xen/arch/x86/acpi/suspend.c
@@ -9,6 +9,7 @@
 #include <xen/smp.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
+#include <asm/debugreg.h>
 #include <asm/flushtlb.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1316,14 +1316,7 @@ static void paravirt_ctxt_switch_to(stru
         write_cr4(cr4);
 
     if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
-    {
-        write_debugreg(0, v->arch.debugreg[0]);
-        write_debugreg(1, v->arch.debugreg[1]);
-        write_debugreg(2, v->arch.debugreg[2]);
-        write_debugreg(3, v->arch.debugreg[3]);
-        write_debugreg(6, v->arch.debugreg[6]);
-        write_debugreg(7, v->arch.debugreg[7]);
-    }
+        activate_debugregs(v);
 
     if ( (v->domain->arch.tsc_mode ==  TSC_MODE_PVRDTSCP) &&
          boot_cpu_has(X86_FEATURE_RDTSCP) )
--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -52,6 +52,7 @@ long arch_do_domctl(
 {
     long ret = 0;
     bool_t copyback = 0;
+    unsigned long i;
 
     switch ( domctl->cmd )
     {
@@ -319,7 +320,6 @@ long arch_do_domctl(
 
     case XEN_DOMCTL_getmemlist:
     {
-        int i;
         unsigned long max_pfns = domctl->u.getmemlist.max_pfns;
         uint64_t mfn;
         struct page_info *page;
@@ -645,7 +645,6 @@ long arch_do_domctl(
         unsigned long mfn = domctl->u.memory_mapping.first_mfn;
         unsigned long nr_mfns = domctl->u.memory_mapping.nr_mfns;
         int add = domctl->u.memory_mapping.add_mapping;
-        unsigned long i;
 
         ret = -EINVAL;
         if ( (mfn + nr_mfns - 1) < mfn || /* wrap? */
@@ -809,6 +808,7 @@ long arch_do_domctl(
     {
         struct xen_domctl_ext_vcpucontext *evc;
         struct vcpu *v;
+        struct xen_domctl_ext_vcpu_msr msr;
 
         evc = &domctl->u.ext_vcpucontext;
 
@@ -854,7 +854,42 @@ long arch_do_domctl(
             evc->vmce.mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2;
             evc->vmce.mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2;
 
-            ret = 0;
+            i = ret = 0;
+            if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+            {
+                unsigned int j;
+
+                if ( v->arch.pv_vcpu.dr_mask[0] )
+                {
+                    if ( i < evc->msr_count && !ret )
+                    {
+                        msr.index = MSR_AMD64_DR0_ADDRESS_MASK;
+                        msr.reserved = 0;
+                        msr.value = v->arch.pv_vcpu.dr_mask[0];
+                        if ( copy_to_guest_offset(evc->msrs, i, &msr, 1) )
+                            ret = -EFAULT;
+                    }
+                    ++i;
+                }
+                for ( j = 0; j < 3; ++j )
+                {
+                    if ( !v->arch.pv_vcpu.dr_mask[1 + j] )
+                        continue;
+                    if ( i < evc->msr_count && !ret )
+                    {
+                        msr.index = MSR_AMD64_DR1_ADDRESS_MASK + j;
+                        msr.reserved = 0;
+                        msr.value = v->arch.pv_vcpu.dr_mask[1 + j];
+                        if ( copy_to_guest_offset(evc->msrs, i, &msr, 1) )
+                            ret = -EFAULT;
+                    }
+                    ++i;
+                }
+            }
+            if ( i > evc->msr_count && !ret )
+                ret = -ENOBUFS;
+            evc->msr_count = i;
+
             vcpu_unpause(v);
             copyback = 1;
         }
@@ -909,9 +944,49 @@ long arch_do_domctl(
 
                 ret = vmce_restore_vcpu(v, &vmce);
             }
+            else if ( evc->size > offsetof(typeof(*evc), vmce) )
+                ret = -EINVAL;
             else
                 ret = 0;
 
+            if ( ret || evc->size <= offsetof(typeof(*evc), msrs) )
+                /* nothing */;
+            else if ( evc->size < offsetof(typeof(*evc), msrs) +
+                                  sizeof(evc->msrs) )
+                ret = -EINVAL;
+            else
+            {
+                for ( i = 0; i < evc->msr_count; ++i )
+                {
+                    ret = -EFAULT;
+                    if ( copy_from_guest_offset(&msr, evc->msrs, i, 1) )
+                        break;
+                    ret = -EINVAL;
+                    if ( msr.reserved )
+                        break;
+                    switch ( msr.index )
+                    {
+                    case MSR_AMD64_DR0_ADDRESS_MASK:
+                        if ( !boot_cpu_has(X86_FEATURE_DBEXT) ||
+                             (msr.value >> 32) )
+                            break;
+                        v->arch.pv_vcpu.dr_mask[0] = msr.value;
+                        continue;
+                    case MSR_AMD64_DR1_ADDRESS_MASK ...
+                         MSR_AMD64_DR3_ADDRESS_MASK:
+                        if ( !boot_cpu_has(X86_FEATURE_DBEXT) ||
+                             (msr.value >> 32) )
+                            break;
+                        msr.index -= MSR_AMD64_DR1_ADDRESS_MASK - 1;
+                        v->arch.pv_vcpu.dr_mask[msr.index] = msr.value;
+                        continue;
+                    }
+                    break;
+                }
+                if ( i == evc->msr_count )
+                    ret = 0;
+            }
+
             domain_unpause(d);
         }
     }
@@ -921,7 +996,6 @@ long arch_do_domctl(
     {
         xen_domctl_cpuid_t *ctl = &domctl->u.cpuid;
         cpuid_input_t *cpuid = NULL; 
-        int i;
 
         for ( i = 0; i < MAX_CPUID_INPUT; i++ )
         {
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -2498,6 +2498,23 @@ static int emulate_privileged_op(struct 
             if ( wrmsr_safe(regs->ecx, msr_content) != 0 )
                 goto fail;
             break;
+
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
+                goto fail;
+            v->arch.pv_vcpu.dr_mask[0] = msr_content;
+            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
+                wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, msr_content);
+            break;
+        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (msr_content >> 32) )
+                goto fail;
+            v->arch.pv_vcpu.dr_mask
+                [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1] = msr_content;
+            if ( v->arch.debugreg[7] & DR7_ACTIVE_MASK )
+                wrmsrl(regs->_ecx, msr_content);
+            break;
+
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) == 1 )
                 break;
@@ -2585,6 +2602,21 @@ static int emulate_privileged_op(struct 
             regs->eax = (uint32_t)msr_content;
             regs->edx = (uint32_t)(msr_content >> 32);
             break;
+
+        case MSR_AMD64_DR0_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                goto fail;
+            regs->eax = v->arch.pv_vcpu.dr_mask[0];
+            regs->edx = 0;
+            break;
+        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+                goto fail;
+            regs->eax = v->arch.pv_vcpu.dr_mask
+                            [regs->_ecx - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+            regs->edx = 0;
+            break;
+
         default:
             if ( rdmsr_hypervisor_regs(regs->ecx, &val) )
             {
@@ -3628,7 +3660,27 @@ long do_set_trap_table(XEN_GUEST_HANDLE_
     return rc;
 }
 
-long set_debugreg(struct vcpu *v, int reg, unsigned long value)
+void activate_debugregs(const struct vcpu *curr)
+{
+    ASSERT(curr == current);
+
+    write_debugreg(0, curr->arch.debugreg[0]);
+    write_debugreg(1, curr->arch.debugreg[1]);
+    write_debugreg(2, curr->arch.debugreg[2]);
+    write_debugreg(3, curr->arch.debugreg[3]);
+    write_debugreg(6, curr->arch.debugreg[6]);
+    write_debugreg(7, curr->arch.debugreg[7]);
+
+    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
+    {
+        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]);
+        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]);
+        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]);
+        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]);
+    }
+}
+
+long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
 {
     int i;
     struct vcpu *curr = current;
@@ -3709,11 +3761,8 @@ long set_debugreg(struct vcpu *v, int re
             if ( (v == curr) &&
                  !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
             {
-                write_debugreg(0, v->arch.debugreg[0]);
-                write_debugreg(1, v->arch.debugreg[1]);
-                write_debugreg(2, v->arch.debugreg[2]);
-                write_debugreg(3, v->arch.debugreg[3]);
-                write_debugreg(6, v->arch.debugreg[6]);
+                activate_debugregs(curr);
+                break;
             }
         }
         if ( v == curr )
--- a/xen/include/asm-x86/debugreg.h
+++ b/xen/include/asm-x86/debugreg.h
@@ -64,4 +64,16 @@
 #define DR_GLOBAL_EXACT_ENABLE   (0x00000200ul) /* Global exact enable */
 #define DR_GENERAL_DETECT        (0x00002000ul) /* General detect enable */
 
+#define write_debugreg(reg, val) do {                       \
+    unsigned long __val = val;                              \
+    asm volatile ( "mov %0,%%db" #reg : : "r" (__val) );    \
+} while (0)
+#define read_debugreg(reg) ({                               \
+    unsigned long __val;                                    \
+    asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
+    __val;                                                  \
+})
+long set_debugreg(struct vcpu *, unsigned int reg, unsigned long value);
+void activate_debugregs(const struct vcpu *);
+
 #endif /* _X86_DEBUGREG_H */
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -374,6 +374,9 @@ struct pv_vcpu
     unsigned long shadow_ldt_mapcnt;
     spinlock_t shadow_ldt_lock;
 
+    /* data breakpoint extension MSRs */
+    uint32_t dr_mask[4];
+
     /* Deferred VA-based update state. */
     bool_t need_update_runstate_area;
     struct vcpu_time_info pending_system_time;
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -462,17 +462,6 @@ long set_gdt(struct vcpu *d, 
              unsigned long *frames, 
              unsigned int entries);
 
-#define write_debugreg(reg, val) do {                       \
-    unsigned long __val = val;                              \
-    asm volatile ( "mov %0,%%db" #reg : : "r" (__val) );    \
-} while (0)
-#define read_debugreg(reg) ({                               \
-    unsigned long __val;                                    \
-    asm volatile ( "mov %%db" #reg ",%0" : "=r" (__val) );  \
-    __val;                                                  \
-})
-long set_debugreg(struct vcpu *p, int reg, unsigned long value);
-
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
 static always_inline void rep_nop(void)
 {
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -36,7 +36,7 @@
 #include "grant_table.h"
 #include "hvm/save.h"
 
-#define XEN_DOMCTL_INTERFACE_VERSION 0x00000009
+#define XEN_DOMCTL_INTERFACE_VERSION 0x0000000a
 
 /*
  * NB. xen_domctl.domain is an IN/OUT parameter for this operation.
@@ -563,6 +563,16 @@ typedef struct xen_domctl_pin_mem_cachea
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
 
 
+#if defined(__i386__) || defined(__x86_64__)
+struct xen_domctl_ext_vcpu_msr {
+    uint32_t         index;
+    uint32_t         reserved;
+    uint64_aligned_t value;
+};
+typedef struct xen_domctl_ext_vcpu_msr xen_domctl_ext_vcpu_msr_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpu_msr_t);
+#endif
+
 /* XEN_DOMCTL_set_ext_vcpucontext */
 /* XEN_DOMCTL_get_ext_vcpucontext */
 struct xen_domctl_ext_vcpucontext {
@@ -582,6 +592,7 @@ struct xen_domctl_ext_vcpucontext {
     uint16_t         sysenter_callback_cs;
     uint8_t          syscall32_disables_events;
     uint8_t          sysenter_disables_events;
+    uint16_t         msr_count;
 #if defined(__GNUC__)
     union {
         uint64_aligned_t mcg_cap;
@@ -590,6 +601,7 @@ struct xen_domctl_ext_vcpucontext {
 #else
     struct hvm_vmce_vcpu vmce;
 #endif
+    XEN_GUEST_HANDLE_64(xen_domctl_ext_vcpu_msr_t) msrs;
 #endif
 };
 typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;




^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 0/4] x86/AMD: support newer hardware features
       [not found]   ` <CAAAAutAgdSJqYEjc4dtixuTS2S=PUx-L4Sy=JCQRZ57rTdoPCQ@mail.gmail.com>
@ 2014-03-26 22:35     ` Aravind Gopalakrishnan
  2014-04-01 23:10       ` Aravind Gopalakrishnan
  0 siblings, 1 reply; 28+ messages in thread
From: Aravind Gopalakrishnan @ 2014-03-26 22:35 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Ian.Jackson, Keir Fraser, ian.campbell, Suthikulpanit, Suravee,
	Hurwitz, Sherry

>
> 1: SVM: support data breakpoint extension registers
> 2: PV: support data breakpoint extension registers
> 3: x86/AMD: support further feature masking MSRs
> 4: x86/AMD: clean up pre-canned family/revision handling for CPUID masking
>
> Patches 3 and 4 are fully tested; for patches 1 and 2 I'm lacking
> suitable hardware, and hence depend on AMD's help (and patch
> 1 is also still lacking some tools side adjustments). Patch 3, otoh,
> has a couple of questions to be answered. Overall this makes the
> whole series RFC only for now.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>

Ah. I have been looking into adding support for data breakpoint 
extension myself.
Wish I had seen this earlier :(

Anyway, I will test the patches and let you know how it goes.

Thanks,
-Aravind.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-26 14:13   ` [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers Jan Beulich
@ 2014-03-27 15:28     ` Ian Campbell
  2014-03-27 16:03       ` Jan Beulich
  2014-03-27 16:03       ` Andrew Cooper
  0 siblings, 2 replies; 28+ messages in thread
From: Ian Campbell @ 2014-03-27 15:28 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

On Wed, 2014-03-26 at 14:13 +0000, Jan Beulich wrote:
> Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
> to the generic MSR save/restore logic recently added for HVM.

"x86: generic MSRs save/restore" I guess?

Is the intention here that the extvcpu context is a blob of opaque data
from the pov of the migration tools?

Do the protocol docs in xg_save_restore need updating due to this
change?

How does N->N+1 migration work out?

> This also moves some debug register related declarations/definition to
> the header intended for these.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: libxc adjustment put in place (depending on the separately posted
>     http://lists.xenproject.org/archives/html/xen-devel/2014-03/msg03059.html)
> 
> --- a/tools/libxc/xc_domain_restore.c
> +++ b/tools/libxc/xc_domain_restore.c
> @@ -590,8 +590,13 @@ static int buffer_tail_pv(xc_interface *
>                            uint32_t vcpuextstate_size)
>  {
>      unsigned int i;
> -    size_t pfnlen, vcpulen;
> +    size_t pfnlen, vcpulen, total;
> +    int alloc = 0;
>      struct domain_info_context *dinfo = &ctx->dinfo;
> +    union {
> +        const unsigned char *raw;
> +        const xen_domctl_ext_vcpucontext_t *evc;
> +    } ptr;
>  
>      /* TODO: handle changing pfntab and vcpu counts */
>      /* PFN tab */
> @@ -634,11 +639,36 @@ static int buffer_tail_pv(xc_interface *
>              ERROR("Error allocating VCPU ctxt tail buffer");
>              goto free_pfntab;
>          }
> +        alloc = 1;
>      }
>      // DPRINTF("Reading VCPUS: %d bytes\n", vcpulen);
> -    if ( RDEXACT(fd, buf->vcpubuf, vcpulen) ) {
> -        PERROR("Error when reading ctxt");
> -        goto free_vcpus;
> +    for (total = i = 0, ptr.raw = buf->vcpubuf; ext_vcpucontext; ) {
> +        if ( RDEXACT(fd, buf->vcpubuf + total, vcpulen) ) {
> +            PERROR("Error when reading ctxt");
> +            goto free_vcpus;
> +        }
> +        total += vcpulen;
> +        for (vcpulen = 0; i < buf->vcpucount; ++i) {
> +            size_t msrlen;
> +
> +            if ((const unsigned char *)(ptr.evc + 1) > buf->vcpubuf + total)
> +                break;
> +            msrlen = ptr.evc->msr_count * sizeof(xen_domctl_ext_vcpu_msr_t);
> +            vcpulen += msrlen;
> +            ptr.raw += 128 + msrlen + vcpuextstate_size;
> +        }
> +        if (!vcpulen)
> +            break;
> +        if (alloc) {
> +            void *nbuf = realloc(buf->vcpubuf, total + vcpulen);
> +
> +            if (!nbuf) {
> +                ERROR("Error growing VCPU ctxt tail buffer");
> +                goto free_vcpus;
> +            }
> +            ptr.raw = nbuf + (ptr.raw - buf->vcpubuf);
> +            buf->vcpubuf = nbuf;
> +        }
>      }
>  
>      /* load shared_info_page */
> @@ -1996,6 +2026,8 @@ int xc_domain_restore(xc_interface *xch,
>      vcpup = tailbuf.u.pv.vcpubuf;
>      for ( i = 0; i <= max_vcpu_id; i++ )
>      {
> +        DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
> +
>          if ( !(vcpumap[i/64] & (1ULL << (i%64))) )
>              continue;
>  
> @@ -2130,9 +2162,25 @@ int xc_domain_restore(xc_interface *xch,
>              goto vcpu_ext_state_restore;
>          memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
>          vcpup += 128;
> +        if ( domctl.u.ext_vcpucontext.msr_count )
> +        {
> +            size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
> +
> +            msrs = xc_hypercall_buffer_alloc(xch, msrs, sz);
> +            if ( !msrs )
> +            {
> +                PERROR("No memory for vcpu%d MSRs", i);
> +                goto out;
> +            }
> +            memcpy(msrs, vcpup, sz);

This seems to be opencoding the hypercall bounce buffer stuff to some
extent.
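
(For reference, the bounce-buffer pattern being referred to would look
roughly like the below - a sketch assuming the usual libxc bounce macros,
not a drop-in replacement:

    DECLARE_HYPERCALL_BOUNCE(vcpup, sz, XC_HYPERCALL_BUFFER_BOUNCE_IN);

    if ( xc_hypercall_bounce_pre(xch, vcpup) )
        goto out;
    set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, vcpup);
    /* ... xc_domctl(xch, &domctl) ... */
    xc_hypercall_bounce_post(xch, vcpup);

which would avoid the explicit alloc/copy/free of a separate buffer.)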

> +            vcpup += sz;
> +            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
> +        }
>          domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
>          domctl.domain = dom;
>          frc = xc_domctl(xch, &domctl);
> +        if ( msrs )
> +            xc_hypercall_buffer_free(xch, msrs);

I think you don't need the if ( msrs ) here.

>          if ( frc != 0 )
>          {
>              PERROR("Couldn't set extended vcpu%d info", i);
> --- a/tools/libxc/xc_domain_save.c
> +++ b/tools/libxc/xc_domain_save.c
> @@ -836,6 +836,9 @@ int xc_domain_save(xc_interface *xch, in
>      /* base of the region in which domain memory is mapped */
>      unsigned char *region_base = NULL;
>  
> +    /* MSR extensions to xen_domctl_ext_vcpucontext_t */
> +    DECLARE_HYPERCALL_BUFFER(xen_domctl_ext_vcpu_msr_t, msrs);
> +
>      /* A copy of the CPU eXtended States of the guest. */
>      DECLARE_HYPERCALL_BUFFER(void, buffer);
>  
> @@ -1960,16 +1963,36 @@ int xc_domain_save(xc_interface *xch, in
>          domctl.domain = dom;
>          memset(&domctl.u, 0, sizeof(domctl.u));
>          domctl.u.ext_vcpucontext.vcpu = i;
> -        if ( xc_domctl(xch, &domctl) < 0 )
> +        frc = xc_domctl(xch, &domctl);
> +        if ( frc < 0 && errno == ENOBUFS && domctl.u.ext_vcpucontext.msr_count )
> +        {
> +            msrs = xc_hypercall_buffer_alloc(xch, msrs,
> +                                             domctl.u.ext_vcpucontext.msr_count *
> +                                             sizeof(*msrs));
> +            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
> +            frc = msrs ? xc_domctl(xch, &domctl) : -1;
> +        }
> +        if ( frc < 0 )
>          {
>              PERROR("No extended context for VCPU%d", i);
>              goto out;
>          }
>          if ( wrexact(io_fd, &domctl.u.ext_vcpucontext, 128) )
>          {
> -            PERROR("Error when writing to state file (2)");
> +            PERROR("Error when writing to state file (ext ctxt)");
>              goto out;
>          }
> +        if ( msrs )
> +        {
> +            if ( wrexact(io_fd, msrs,
> +                         domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs)) )
> +            {
> +                PERROR("Error when writing to state file (MSRs)");
> +                goto out;
> +            }
> +            xc_hypercall_buffer_free(xch, msrs);
> +            msrs = NULL;
> +        }
>  
>          /* Start to fetch CPU eXtended States */
>          /* Get buffer size first */
> @@ -2134,6 +2157,8 @@ int xc_domain_save(xc_interface *xch, in
>  
>      xc_hypercall_buffer_free_pages(xch, to_send, NRPAGES(bitmap_size(dinfo->p2m_size)));
>      xc_hypercall_buffer_free_pages(xch, to_skip, NRPAGES(bitmap_size(dinfo->p2m_size)));
> +    if (msrs)

if ( msrs ) if indeed it is needed at all.

> +        xc_hypercall_buffer_free(xch, msrs);
>  
>      free(pfn_type);
>      free(pfn_batch);

> @@ -563,6 +563,16 @@ typedef struct xen_domctl_pin_mem_cachea
>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
>  
> 
> +#if defined(__i386__) || defined(__x86_64__)
> +struct xen_domctl_ext_vcpu_msr {
> +    uint32_t         index;
> +    uint32_t         reserved;
> +    uint64_aligned_t value;
> +};

Maybe this is just me forgetting x86 stuff but are these "data
breakpoint extension registers" actually MSRs?

If so why is this not part of the generic MSR support which you
referenced earlier? At least at the domctl level it seems to be generic
here too?

> +typedef struct xen_domctl_ext_vcpu_msr xen_domctl_ext_vcpu_msr_t;
> +DEFINE_XEN_GUEST_HANDLE(xen_domctl_ext_vcpu_msr_t);
> +#endif
> +
>  /* XEN_DOMCTL_set_ext_vcpucontext */
>  /* XEN_DOMCTL_get_ext_vcpucontext */
>  struct xen_domctl_ext_vcpucontext {
> @@ -582,6 +592,7 @@ struct xen_domctl_ext_vcpucontext {
>      uint16_t         sysenter_callback_cs;
>      uint8_t          syscall32_disables_events;
>      uint8_t          sysenter_disables_events;
> +    uint16_t         msr_count;
>  #if defined(__GNUC__)
>      union {
>          uint64_aligned_t mcg_cap;
> @@ -590,6 +601,7 @@ struct xen_domctl_ext_vcpucontext {
>  #else
>      struct hvm_vmce_vcpu vmce;
>  #endif
> +    XEN_GUEST_HANDLE_64(xen_domctl_ext_vcpu_msr_t) msrs;
>  #endif
>  };
>  typedef struct xen_domctl_ext_vcpucontext xen_domctl_ext_vcpucontext_t;
> 
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 15:28     ` Ian Campbell
@ 2014-03-27 16:03       ` Jan Beulich
  2014-03-27 16:23         ` Ian Campbell
  2014-03-27 16:03       ` Andrew Cooper
  1 sibling, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-27 16:03 UTC (permalink / raw)
  To: Ian Campbell
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

>>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
> On Wed, 2014-03-26 at 14:13 +0000, Jan Beulich wrote:
>> Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
>> to the generic MSR save/restore logic recently added for HVM.
> 
> "x86: generic MSRs save/restore" I guess?
> 
> Is the intention here that the extvcpu context is a blob of opaque data
> from the pov of the migration tools?

No, these tools will need to know about the structure of the data
(as seen in the patch).

> Do the protocol docs in xg_save_restore need updating due to this
> change?

Not sure about that - this doesn't seem to talk about what's inside
the "bodies".

> How does N->N+1 migration work out?

Since the structure size grows, and that size is embedded in the
structure, input coming from N will be treated as not having any
MSRs.
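
In other words everything is keyed off the "size" member - roughly
something like this on the hypervisor side (sketch only; the offsetof()
bound is illustrative, not lifted from the patch):

        /* Ignore msr_count/msrs unless the incoming size covers them. */
        if ( evc->size < offsetof(typeof(*evc), msrs) + sizeof(evc->msrs) )
            evc->msr_count = 0;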

>> @@ -2130,9 +2162,25 @@ int xc_domain_restore(xc_interface *xch,
>>              goto vcpu_ext_state_restore;
>>          memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
>>          vcpup += 128;
>> +        if ( domctl.u.ext_vcpucontext.msr_count )
>> +        {
>> +            size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
>> +
>> +            msrs = xc_hypercall_buffer_alloc(xch, msrs, sz);
>> +            if ( !msrs )
>> +            {
>> +                PERROR("No memory for vcpu%d MSRs", i);
>> +                goto out;
>> +            }
>> +            memcpy(msrs, vcpup, sz);
> 
> This seems to be opencoding the hypercall bounce buffer stuff to some
> extent.

Just like in the xstate logic. Aiui there's no way currently for
do_domctl() to know that in some cases embedded handles need
following. Or at least I didn't spot it, so if there is a mechanism to
avoid this open coding, please point me at it.

>> +            vcpup += sz;
>> +            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
>> +        }
>>          domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
>>          domctl.domain = dom;
>>          frc = xc_domctl(xch, &domctl);
>> +        if ( msrs )
>> +            xc_hypercall_buffer_free(xch, msrs);
> 
> I think you don't need the if ( msrs ) here.

Ah, right - I saw xc__hypercall_buffer_free() dereference its argument,
not paying close enough attention that what gets passed here is the
address of an on-stack variable. Will drop.

>> @@ -563,6 +563,16 @@ typedef struct xen_domctl_pin_mem_cachea
>>  DEFINE_XEN_GUEST_HANDLE(xen_domctl_pin_mem_cacheattr_t);
>>  
>> 
>> +#if defined(__i386__) || defined(__x86_64__)
>> +struct xen_domctl_ext_vcpu_msr {
>> +    uint32_t         index;
>> +    uint32_t         reserved;
>> +    uint64_aligned_t value;
>> +};
> 
> Maybe this is just me forgetting x86 stuff but are these "data
> breakpoint extension registers" actually MSRs?

Yes, they are (and they're AMD-specific).

> If so why is this not part of the generic MSR support which you
> referenced earlier? At least at the domctl level it seems to be generic
> here too?

Because that was HVM-only. Here a similar extensible mechanism is
being introduced for PV.

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 15:28     ` Ian Campbell
  2014-03-27 16:03       ` Jan Beulich
@ 2014-03-27 16:03       ` Andrew Cooper
  2014-03-27 16:37         ` Jan Beulich
  1 sibling, 1 reply; 28+ messages in thread
From: Andrew Cooper @ 2014-03-27 16:03 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Keir Fraser, Jan Beulich, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz, xen-devel

On 27/03/14 15:28, Ian Campbell wrote:
> On Wed, 2014-03-26 at 14:13 +0000, Jan Beulich wrote:
>> Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
>> to the generic MSR save/restore logic recently added for HVM.
> "x86: generic MSRs save/restore" I guess?
>
> Is the intention here that the extvcpu context is a blob of opaque data
> from the pov of the migration tools?
>
> Do the protocol docs in xg_save_restore need updating due to this
> change?
>
> How does N->N+1 migration work out?

Having just rewritten all of this code from scratch, the details are
painfully present in my brain.

The answer is badly.

The blob is exactly 128 bytes long, starting from the beginning of the
domctl union.  changeset e1dc98e3c "x86/vMCE: save/restore MCA
capabilities" introduced an explicit memset(0) on the content of the
struct domctl, meaning that this newly interpreted 'msr_count' shall
have the value 0.

Migration streams from before e1dc98e3c (Fri Feb 24 2012, Xen-4.2) will
have an uninitialised stack value instead of 0 for the newly interpreted
'msr_count' (unless the toolstack was compiled with CONFIG_VALGRIND),
which will cause corruption in any migration stream from Xen-4.1 or before.

While I realise that this does technically fall within the statement of
supportability as far as upstream goes, it would be a show-stopper for
XenServer and I presume any other Xen based system with real customers
on older versions of Xen.

As this is a new feature, and thus not applicable for backport, can it wait
until my migration stream v2 work is submitted?  The migration code
churn to support this in v2 would be far less, and would also guarantee
no breakage of older migration streams.

~Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 16:03       ` Jan Beulich
@ 2014-03-27 16:23         ` Ian Campbell
  2014-03-27 16:44           ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Ian Campbell @ 2014-03-27 16:23 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
> > On Wed, 2014-03-26 at 14:13 +0000, Jan Beulich wrote:
> >> Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
> >> to the generic MSR save/restore logic recently added for HVM.
> > 
> > "x86: generic MSRs save/restore" I guess?
> > 
> > Is the intention here that the extvcpu context is a blob of opaque data
> > from the pov of the migration tools?
> 
> No, these tools will need to know about the structure of the data
> (as seen in the patch).

It looked to me like it was mostly just extracting a stream of bytes and
throwing it at a domctl, perhaps I'm misreading it.

> > Do the protocol docs in xg_save_restore need updating due to this
> > change?
> 
> Not sure about that - this doesn't seem to talk about what's inside
> the "bodies".

Is this extending the "VCPU context data"? or one of the XC_SAVE_ID_*
things? I think it would be worth extending that to indicate the new
"optional" data, where and what it is etc.

> > How does N->N+1 migration work out?
> 
> Since the structure size grows, and that size is embedded in the
> structure, input coming from N will be treated as not having any
> MSRs.

This is the size of the "extv" block in the header phase?

> 
> >> @@ -2130,9 +2162,25 @@ int xc_domain_restore(xc_interface *xch,
> >>              goto vcpu_ext_state_restore;
> >>          memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
> >>          vcpup += 128;
> >> +        if ( domctl.u.ext_vcpucontext.msr_count )
> >> +        {
> >> +            size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
> >> +
> >> +            msrs = xc_hypercall_buffer_alloc(xch, msrs, sz);
> >> +            if ( !msrs )
> >> +            {
> >> +                PERROR("No memory for vcpu%d MSRs", i);
> >> +                goto out;
> >> +            }
> >> +            memcpy(msrs, vcpup, sz);
> > 
> > This seems to be opencoding the hypercall bounce buffer stuff to some
> > extent.
> 
> Just like in the xstate logic. Aiui there's no way currently for
> do_domctl() to know that in some cases embedded handles need
> following. Or at least I didn't spot it, so if there is a mechanism to
> avoid this open coding, please point me at it.

I didn't mean that do_domctl could do the bouncing, but rather that this
code could itself use the bounce buffer stuff to bounce from vcpup into
msrs automatically, i.e. avoiding the open coded memcpy.

Something like:
        if ( domctl.u.ext_vcpucontext.msr_count )
        {
        	/*const?*/size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
        	DECLARE_HYPERCALL_BOUNCE(vcpup, sz, XC_HYPERCALL_BUFFER_BOUNCE_IN);
        
        	if ( xc_hypercall_bounce_pre(xch, vcpup) )
        		badness
        
        	domctl.cmd = ...;
        	domctl.domain = ...; etc
        	set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, vcpup);
        
        	frc = do_domctl(xch, &domctl)
        
        	xc_hypercall_bounce_post(xch, vcpup)
        }

Or it might be clearer to use
         DECLARE_NAMED_HYPERCALL_BOUNCE(msrs, vcpup, sz, ..._BOUNCE_IN)
so you can use "msrs" for the remainder.

Lastly you can pass size as 0 in the declaration and use
HYPERCALL_BOUNCE_SET_SIZE, if that works better.
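
I.e. something along these lines (untested, just to illustrate the
named variant; the size could equally start as 0 and be set later via
HYPERCALL_BOUNCE_SET_SIZE):

        DECLARE_NAMED_HYPERCALL_BOUNCE(msrs, vcpup, sz,
                                       XC_HYPERCALL_BUFFER_BOUNCE_IN);

        if ( xc_hypercall_bounce_pre(xch, msrs) )
        	badness

        set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
        frc = do_domctl(xch, &domctl);
        xc_hypercall_bounce_post(xch, msrs);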

> >> +            vcpup += sz;
> >> +            set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, msrs);
> >> +        }
> >>          domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
> >>          domctl.domain = dom;
> >>          frc = xc_domctl(xch, &domctl);
> >> +        if ( msrs )
> >> +            xc_hypercall_buffer_free(xch, msrs);
> > 
> > I think you don't need the if ( msrs ) here.
> 
> Ah, right - I saw xc__hypercall_buffer_free() dereference its argument,
> not paying close enough attention that what gets passed here is the
> address of an on-stack variable.

Very easy to miss I agree.

> > If so why is this not part of the generic MSR support which you
> > referenced earlier? At least at the domctl level it seems to be generic
> > here too?
> 
> Because that was HVM-only. Here a similar extensible mechanism is
> being introduced for PV.

Ah, I see.

Ian.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 16:03       ` Andrew Cooper
@ 2014-03-27 16:37         ` Jan Beulich
  2014-03-27 17:11           ` Andrew Cooper
  0 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-27 16:37 UTC (permalink / raw)
  To: Andrew Cooper, Ian Campbell
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

>>> On 27.03.14 at 17:03, <andrew.cooper3@citrix.com> wrote:
> On 27/03/14 15:28, Ian Campbell wrote:
>> On Wed, 2014-03-26 at 14:13 +0000, Jan Beulich wrote:
>>> Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
>>> to the generic MSR save/restore logic recently added for HVM.
>> "x86: generic MSRs save/restore" I guess?
>>
>> Is the intention here that the extvcpu context is a blob of opaque data
>> from the pov of the migration tools?
>>
>> Do the protocol docs in xg_save_restore need updating due to this
>> change?
>>
>> How does N->N+1 migration work out?
> 
> Having just rewritten all of this code from scratch, the details are
> painfully present in my brain.
> 
> The answer is badly.
> 
> The blob is exactly 128 bytes long, starting from the beginning of the
> domctl union.  changeset e1dc98e3c "x86/vMCE: save/restore MCA
> capabilities" introduced an explicit memset(0) on the content of the
> struct domctl, meaning that this newly interpreted 'msr_count' shall
> have the value 0.

As just written to Ian in another reply - this new field isn't even being
looked at when the size stored at the beginning of the structure doesn't
indicate the presence of the msrs[] handle.

> As this is a new feature, so not applicable for backport, can it wait
> until my migration stream v2 work is submitted?  The migration code
> churn to support this in v2 would be far less, and would also guarantee
> no breakage of older migration streams.

I wouldn't want to postpone this by too much (and certainly not past
a re-write of the migration code) because - other than upstream Xen -
we may want/need to backport this for SLE12.

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 16:23         ` Ian Campbell
@ 2014-03-27 16:44           ` Jan Beulich
  2014-03-27 17:29             ` Ian Campbell
  0 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-27 16:44 UTC (permalink / raw)
  To: Ian Campbell
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

>>> On 27.03.14 at 17:23, <Ian.Campbell@citrix.com> wrote:
> On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
>> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
>> > Do the protocol docs in xg_save_restore need updating due to this
>> > change?
>> 
>> Not sure about that - this doesn't seem to talk about what's inside
>> the "bodies".
> 
> Is this extending the "VCPU context data"? or one of the XC_SAVE_ID_*
> things? I think it would be worth extending that to indicate the new
> "optional" data, where and what it is etc.

Yeah, I could probably add a note to the "extv" thing, even if it
doesn't really fit.

>> > How does N->N+1 migration work out?
>> 
>> Since the structure size grows, and that size is embedded in the
>> structure, input coming from N will be treated as not having any
>> MSRs.
> 
> This is the size of the "extv" block in the header phase?

No, the "size" member of struct xen_domctl_ext_vcpucontext.

>> >> @@ -2130,9 +2162,25 @@ int xc_domain_restore(xc_interface *xch,
>> >>              goto vcpu_ext_state_restore;
>> >>          memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
>> >>          vcpup += 128;
>> >> +        if ( domctl.u.ext_vcpucontext.msr_count )
>> >> +        {
>> >> +            size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
>> >> +
>> >> +            msrs = xc_hypercall_buffer_alloc(xch, msrs, sz);
>> >> +            if ( !msrs )
>> >> +            {
>> >> +                PERROR("No memory for vcpu%d MSRs", i);
>> >> +                goto out;
>> >> +            }
>> >> +            memcpy(msrs, vcpup, sz);
>> > 
>> > This seems to be opencoding the hypercall bounce buffer stuff to some
>> > extent.
>> 
>> Just like in the xstate logic. Aiui there's no way currently for
>> do_domctl() to know that in some cases embedded handles need
>> following. Or at least I didn't spot it, so if there is a mechanism to
>> avoid this open coding, please point me at it.
> 
> I didn't mean that do_domctl could do the bouncing, but rather that this
> code could itself use the bounce buffer stuff to bounce from vcpup into
> msrs automatically, i.e. avoiding the open coded memcpy.
> 
> Something like:
>         if ( domctl.u.ext_vcpucontext.msr_count )
>         {
>         	/*const?*/size_t sz = domctl.u.ext_vcpucontext.msr_count * sizeof(*msrs);
>         	DECLARE_HYPERCALL_BOUNCE(vcpup, sz, XC_HYPERCALL_BUFFER_BOUNCE_IN);
>         
>         	if ( xc_hypercall_bounce_pre(xch, vcpup) )
>         		badness
>         
>         	domctl.cmd = ...;
>         	domctl.domain = ...; etc
>         	set_xen_guest_handle(domctl.u.ext_vcpucontext.msrs, vcpup);
>         
>         	frc = do_domctl(xch, &domctl)
>         
>         	xc_hypercall_bounce_post(xch, vcpup)
>         }
> 
> Or it might be clearer to use
>          DECLARE_NAMED_HYPERCALL_BOUNCE(msrs, vcpup, sz, ..._BOUNCE_IN)
> so you can use "msrs" for the remainder.
> 
> Lastly you can pass size as 0 in the declaration and use
> HYPERCALL_BOUNCE_SET_SIZE, if that works better.

I'll see if I can make this build (for testing I have to rely on AMD
anyway).

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 16:37         ` Jan Beulich
@ 2014-03-27 17:11           ` Andrew Cooper
  2014-03-28 10:46             ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Cooper @ 2014-03-27 17:11 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir Fraser, Ian Campbell, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz, xen-devel

On 27/03/14 16:37, Jan Beulich wrote:
>>>> On 27.03.14 at 17:03, <andrew.cooper3@citrix.com> wrote:
>> On 27/03/14 15:28, Ian Campbell wrote:
>>> On Wed, 2014-03-26 at 14:13 +0000, Jan Beulich wrote:
>>>> Introducing an extension to XEN_DOMCTL_[gs]et_ext_vcpucontext similar
>>>> to the generic MSR save/restore logic recently added for HVM.
>>> "x86: generic MSRs save/restore" I guess?
>>>
>>> Is the intention here that the extvcpu context is a blob of opaque data
>>> from the pov of the migration tools?
>>>
>>> Do the protocol docs in xg_save_restore need updating due to this
>>> change?
>>>
>>> How does N->N+1 migration work out?
>> Having just rewritten all of this code from scratch, the details are
>> painfully present in my brain.
>>
>> The answer is badly.
>>
>> The blob is exactly 128 bytes long, starting from the beginning of the
>> domctl union.  changeset e1dc98e3c "x86/vMCE: save/restore MCA
>> capabilities" introduced an explicit memset(0) on the content of the
>> struct domctl, meaning that this newly interpreted 'msr_count' shall
>> have the value 0.
> As just written to Ian in another reply - this new field isn't even being
> looked at when the size stored at the beginning of the structure doesn't
> indicate the presence of the msrs[] handle.

I am frankly having a very hard time following the new logic in
buffer_tail_pv(), so can't yet comment on when it is actually doing the
correct thing in terms of calculating whether a block of msrs is present
in the stream.

However, it never nops out evc->msr_count in the case that msrs is not
present in the stream, meaning that the "if (
domctl.u.ext_vcpucontext.msr_count )" will be performing logic based on
uninitialised stack junk for migration streams from 4.1 or before.
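
Presumably the restore path needs something along these lines before
msr_count is ever consumed (a sketch only - exact placement depends on
how buffer_tail_pv() and the restore loop end up being structured):

        memcpy(&domctl.u.ext_vcpucontext, vcpup, 128);
        vcpup += 128;
        /* Older streams have stack junk where msr_count now lives. */
        if ( domctl.u.ext_vcpucontext.size <
             offsetof(xen_domctl_ext_vcpucontext_t, msrs) +
             sizeof(domctl.u.ext_vcpucontext.msrs) )
            domctl.u.ext_vcpucontext.msr_count = 0;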

>
>> As this is a new feature, so not applicable for backport, can it wait
>> until my migration stream v2 work is submitted?  The migration code
>> churn to support this in v2 would be far less, and would also guarantee
>> no breakage of older migration streams.
> I wouldn't want to postpone this by too much (and certainly not past
> a re-write of the migration code) because - other than upstream Xen -
> we may want/need to backport this for SLE12.
>
> Jan
>

Backporting to SLE12 is certainly your prerogative, but the tools side of
this patch is not fit for committing yet.

The hypervisor side looks ok, although I would suggest that the
activate_debugregs() logic get split out for reviewing purposes as it
appears to be an orthogonal fix.

As far as the migration stream v2 goes, it is one of the highest
priority tasks in XenServer at the moment, and I am working full time on
it.  The current state is that 64bit PV non-live migration is working.
32bit PV guests are pending a formal fix to the toolstack's incorrect
assumptions about which L2 entries are safe to shoot.  I plan to get the
live bit working then present an RFC series upstream.  HVM migration is
substantially easier than PV migration, and will follow shortly afterwards.

~Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 16:44           ` Jan Beulich
@ 2014-03-27 17:29             ` Ian Campbell
  2014-03-28  8:05               ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Ian Campbell @ 2014-03-27 17:29 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

On Thu, 2014-03-27 at 16:44 +0000, Jan Beulich wrote:
> >>> On 27.03.14 at 17:23, <Ian.Campbell@citrix.com> wrote:
> > On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
> >> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
> >> > Do the protocol docs in xg_save_restore need updating due to this
> >> > change?
> >> 
> >> Not sure about that - this doesn't seem to talk about what's inside
> >> the "bodies".
> > 
> > Is this extending the "VCPU context data"? or one of the XC_SAVE_ID_*
> > things? I think it would be worth extending that to indicate the new
> > "optional" data, where and what it is etc.
> 
> Yeah, I could probably add a note to the "extv" thing, even if it
> doesn't really fit.

Sounds good, thanks.

> >> > How does N->N+1 migration work out?
> >> 
> >> Since the structure size grows, and that size is embedded in the
> >> structure, input coming from N will be treated as not having any
> >> MSRs.
> > 
> > This is the size of the "extv" block in the header phase?
> 
> No, the "size" member of struct xen_domctl_ext_vcpucontext.

OK, that makes sense.

Are you also breaking the 128b limit on the data returned by this
struct?

Ian.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 17:29             ` Ian Campbell
@ 2014-03-28  8:05               ` Jan Beulich
  2014-03-28  9:55                 ` Ian Campbell
  0 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-28  8:05 UTC (permalink / raw)
  To: Ian Campbell
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

>>> On 27.03.14 at 18:29, <Ian.Campbell@citrix.com> wrote:
> On Thu, 2014-03-27 at 16:44 +0000, Jan Beulich wrote:
>> >>> On 27.03.14 at 17:23, <Ian.Campbell@citrix.com> wrote:
>> > On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
>> >> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
>> >> > How does N->N+1 migration work out?
>> >> 
>> >> Since the structure size grows, and that size is embedded in the
>> >> structure, input coming from N will be treated as not having any
>> >> MSRs.
>> > 
>> > This is the size of the "extv" block in the header phase?
>> 
>> No, the "size" member of struct xen_domctl_ext_vcpucontext.
> 
> OK, that makes sense.
> 
> Are you also breaking the 128b limit on the data returned by this
> struct?

Indirectly, not directly (by adding this pointer to outside the
structure). The 128-byte limit is the fundamental reason why a
separate array needs to be used here, and why therefore extra
data needs to follow the base structure in the save/restore
stream.
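
So per vCPU the "extv" payload in the stream ends up being (a sketch,
matching what the tools side code currently does):

        [128 bytes]                  xen_domctl_ext_vcpucontext, padded
        [msr_count * 16 bytes]       xen_domctl_ext_vcpu_msr_t array (new)
        [vcpuextstate_size bytes]    XSAVE data, as before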

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-28  8:05               ` Jan Beulich
@ 2014-03-28  9:55                 ` Ian Campbell
  2014-03-28 10:43                   ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Ian Campbell @ 2014-03-28  9:55 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

On Fri, 2014-03-28 at 08:05 +0000, Jan Beulich wrote:
> >>> On 27.03.14 at 18:29, <Ian.Campbell@citrix.com> wrote:
> > On Thu, 2014-03-27 at 16:44 +0000, Jan Beulich wrote:
> >> >>> On 27.03.14 at 17:23, <Ian.Campbell@citrix.com> wrote:
> >> > On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
> >> >> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
> >> >> > How does N->N+1 migration work out?
> >> >> 
> >> >> Since the structure size grows, and that size is embedded in the
> >> >> structure, input coming from N will be treated as not having any
> >> >> MSRs.
> >> > 
> >> > This is the size of the "extv" block in the header phase?
> >> 
> >> No, the "size" member of struct xen_domctl_ext_vcpucontext.
> > 
> > OK, that makes sense.
> > 
> > Are you also breaking the 128b limit on the data returned by this
> > struct?
> 
> Indirectly, not directly (by adding this pointer to outside the
> structure). The 128-byte limit is the fundamental reason why a
> separate array needs to be used here, and why therefore extra
> data needs to follow the base structure in the save/restore
> stream.

Ah I see.

So am I correct that the xen_domctl_ext_vcpucontext is sent in its
entirety, including whatever the existing pointer is, which will
certainly be bogus on the other end. The receiver will naturally
overwrite it, but perhaps it should be poisoned on the wire just for
belt-and-braces sake?
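
E.g. on the save side, before the wrexact() of the 128 byte block,
something like (untested, and memset is just one way of doing it):

        /* Don't leak a host pointer into the stream. */
        memset(&domctl.u.ext_vcpucontext.msrs, 0xff,
               sizeof(domctl.u.ext_vcpucontext.msrs));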

In fact -- it seems that 128 bytes are sent regardless of
sizeof(xen_domctl_ext_vcpucontext) (which doesn't look to be == 128
yet). The code is already this way, but for a future cleanup perhaps
pass to 128 with some known value? (I guess this was the stack value
which Andrew was referring to?)

Ian.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-28  9:55                 ` Ian Campbell
@ 2014-03-28 10:43                   ` Jan Beulich
  2014-03-28 10:56                     ` Ian Campbell
  0 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-03-28 10:43 UTC (permalink / raw)
  To: Ian Campbell
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

>>> On 28.03.14 at 10:55, <Ian.Campbell@citrix.com> wrote:
> On Fri, 2014-03-28 at 08:05 +0000, Jan Beulich wrote:
>> >>> On 27.03.14 at 18:29, <Ian.Campbell@citrix.com> wrote:
>> > On Thu, 2014-03-27 at 16:44 +0000, Jan Beulich wrote:
>> >> >>> On 27.03.14 at 17:23, <Ian.Campbell@citrix.com> wrote:
>> >> > On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
>> >> >> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
>> >> >> > How does N->N+1 migration work out?
>> >> >> 
>> >> >> Since the structure size grows, and that size is embedded in the
>> >> >> structure, input coming from N will be treated as not having any
>> >> >> MSRs.
>> >> > 
>> >> > This is the size of the "extv" block in the header phase?
>> >> 
>> >> No, the "size" member of struct xen_domctl_ext_vcpucontext.
>> > 
>> > OK, that makes sense.
>> > 
>> > Are you also breaking the 128b limit on the data returned by this
>> > struct?
>> 
>> Indirectly, not directly (by adding this pointer to outside the
>> structure). The 128-byte limit is the fundamental reason why a
>> separate array needs to be used here, and why therefore extra
>> data needs to follow the base structure in the save/restore
>> stream.
> 
> Ah I see.
> 
> So am I correct that the xen_domctl_ext_vcpucontext is sent in its
> entirety, including whatever the existing pointer is, which will
> certainly be bogus on the other end. The receiver will naturally
> overwrite it, but perhaps it should be poisoned on the wire just for
> belt-and-braces sake?

Will do.

> In fact -- it seems that 128 bytes are sent regardless of
> sizeof(xen_domctl_ext_vcpucontext) (which doesn't look to be == 128
> yet). The code is already this way, but for a future cleanup perhaps
> pass to 128 with some known value? (I guess this was the stack value
> which Andrew was referring too?)

If s/pass/pad/, then this could also be done (albeit it's not directly
related to the work here). The value Andrew was referring to was
one in the middle of the structure though.

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-27 17:11           ` Andrew Cooper
@ 2014-03-28 10:46             ` Jan Beulich
  0 siblings, 0 replies; 28+ messages in thread
From: Jan Beulich @ 2014-03-28 10:46 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Keir Fraser, Ian Campbell, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz, xen-devel

>>> On 27.03.14 at 18:11, <andrew.cooper3@citrix.com> wrote:
> The hypervisor side looks ok, although I would suggest that the
> activate_debugregs() logic get split out for reviewing purposes as it
> appears to be an orthogonal fix.

And I'll actually split the tools side off, so the hypervisor side could go
in without the tools side in place (making migration fail for PV guests
using the new feature, or any other future one building on this MSR
saving).

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers
  2014-03-28 10:43                   ` Jan Beulich
@ 2014-03-28 10:56                     ` Ian Campbell
  0 siblings, 0 replies; 28+ messages in thread
From: Ian Campbell @ 2014-03-28 10:56 UTC (permalink / raw)
  To: Jan Beulich
  Cc: xen-devel, Keir Fraser, Ian Jackson, suravee.suthikulpanit,
	Sherry Hurwitz

On Fri, 2014-03-28 at 10:43 +0000, Jan Beulich wrote:
> >>> On 28.03.14 at 10:55, <Ian.Campbell@citrix.com> wrote:
> > On Fri, 2014-03-28 at 08:05 +0000, Jan Beulich wrote:
> >> >>> On 27.03.14 at 18:29, <Ian.Campbell@citrix.com> wrote:
> >> > On Thu, 2014-03-27 at 16:44 +0000, Jan Beulich wrote:
> >> >> >>> On 27.03.14 at 17:23, <Ian.Campbell@citrix.com> wrote:
> >> >> > On Thu, 2014-03-27 at 16:03 +0000, Jan Beulich wrote:
> >> >> >> >>> On 27.03.14 at 16:28, <Ian.Campbell@citrix.com> wrote:
> >> >> >> > How does N->N+1 migration work out?
> >> >> >> 
> >> >> >> Since the structure size grows, and that size is embedded in the
> >> >> >> structure, input coming from N will be treated as not having any
> >> >> >> MSRs.
> >> >> > 
> >> >> > This is the size of the "extv" block in the header phase?
> >> >> 
> >> >> No, the "size" member of struct xen_domctl_ext_vcpucontext.
> >> > 
> >> > OK, that makes sense.
> >> > 
> >> > Are you also breaking the 128b limit on the data returned by this
> >> > struct?
> >> 
> >> Indirectly, not directly (by adding this pointer to outside the
> >> structure). The 128-byte limit is the fundamental reason why a
> >> separate array needs to be used here, and why therefore extra
> >> data needs to follow the base structure in the save/restore
> >> stream.
> > 
> > Ah I see.
> > 
> > So am I correct that the xen_domctl_ext_vcpucontext is sent in its
> > entirety, including whatever the existing pointer is, which will
> > certainly be bogus on the other end. The receiver will naturally
> > overwrite it, but perhaps it should be poisoned on the wire just for
> > belt-and-braces sake?
> 
> Will do.

Thanks.
> 
> > In fact -- it seems that 128 bytes are sent regardless of
> > sizeof(xen_domctl_ext_vcpucontext) (which doesn't look to be == 128
> > yet). The code is already this way, but for a future cleanup perhaps
> > pass to 128 with some known value? (I guess this was the stack value
> > which Andrew was referring to?)
> 
> If s/pass/pad/,

Yes, sorry, fingers on autopilot.

>  then this could also be done (albeit it's not directly
> related to the work here).

Agreed, it's a future cleanup if anything.

>  The value Andrew was referring to was
> one in the middle of the structure though.

Which makes me think -- on N->N+1 migration might the offset of the
existing struct hvm_vmce_vcpu change? I think the answer is no because
you've inserted it into a hole in the struct. But that is the
uninitialised value Andy meant. But as you say accesses to it are gated
on the total size of the struct so we know we won't look at it under
N->N+1 circumstances. (one for the docs I'd say!)

I think I may have been retreading a previous conversation there, but I
think I've not got it straight in my head!

Ian.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH RFC 0/4] x86/AMD: support newer hardware features
  2014-03-26 22:35     ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Aravind Gopalakrishnan
@ 2014-04-01 23:10       ` Aravind Gopalakrishnan
  0 siblings, 0 replies; 28+ messages in thread
From: Aravind Gopalakrishnan @ 2014-04-01 23:10 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Ian.Jackson, Keir Fraser, ian.campbell, Suthikulpanit, Suravee,
	Hurwitz, Sherry

On 3/26/2014 5:35 PM, Aravind Gopalakrishnan wrote:
>>
>> 1: SVM: support data breakpoint extension registers
>> 2: PV: support data breakpoint extension registers

Tested-by: Aravind Gopalakrishnan<aravind.gopalakrishnan@amd.com>

>> 3: x86/AMD: support further feature masking MSRs
>> 4: x86/AMD: clean up pre-canned family/revision handling for CPUID 
>> masking
Reviewed-by: Aravind Gopalakrishnan<aravind.gopalakrishnan@amd.com>

>>
>> Patches 3 and 4 are fully tested; for patches 1 and 2 I'm lacking
>> suitable hardware, and hence depend on AMD's help (and patch
>> 1 is also still lacking some tools side adjustments). Patch 3, otoh,
>> has a couple of questions to be answered. Overall this makes the
>> whole series RFC only for now.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>
>

Thanks,
-Aravind

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
       [not found]     ` <CAAAAutDJ3CzvGp9jWCS7=b3rQsG81+w_W7qXLKxDWB7QBU23DQ@mail.gmail.com>
@ 2014-04-01 23:10       ` Aravind Gopalakrishnan
  2014-04-03 22:44         ` Aravind Gopalakrishnan
  0 siblings, 1 reply; 28+ messages in thread
From: Aravind Gopalakrishnan @ 2014-04-01 23:10 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Ian.Jackson, Keir Fraser, ian.campbell, Suthikulpanit, Suravee,
	Hurwitz, Sherry

> Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
> sub-leaf 0 EAX and EBX.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com <mailto:jbeulich@suse.com>>
>
> TBD:
> - Fam15M0x is documented to not have MSR C0011002 despite 
> CPUID[7,0].EBX != 0 from model 02
>   onwards, and contrary to what I see on the system I have access to 
> (which is model 02)
> - Fam12, Fam14, and Fam16 are documented to not have MSR C0011003 
> despite CPUID[6].ECX != 0

Fam10 too has cpuid[6].ecx != 0 but no MSR C0011003

> - Fam11 is documented to not even have MSR C0011004 and C0011005
>

I am still trying to get some clarity on this;
Shall let you know soon :)

-Aravind.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-04-01 23:10       ` Fwd: " Aravind Gopalakrishnan
@ 2014-04-03 22:44         ` Aravind Gopalakrishnan
  2014-04-04  9:37           ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Aravind Gopalakrishnan @ 2014-04-03 22:44 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Ian.Jackson, Keir Fraser, ian.campbell, Suthikulpanit, Suravee,
	Hurwitz, Sherry

On 4/1/2014 6:10 PM, Aravind Gopalakrishnan wrote:
>> Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
>> sub-leaf 0 EAX and EBX.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com 
>> <mailto:jbeulich@suse.com>>
>>
>> TBD:
>> - Fam15M0x is documented to not have MSR C0011002 despite 
>> CPUID[7,0].EBX != 0 from model 02
>>   onwards, and contrary to what I see on the system I have access to 
>> (which is model 02)
>> - Fam12, Fam14, and Fam16 are documented to not have MSR C0011003 
>> despite CPUID[6].ECX != 0
>
> Fam10 too has cpuid[6].ecx != 0 but no MSR C0011003
>
>> - Fam11 is documented to not even have MSR C0011004 and C0011005
>>
>
> I am still trying to get some clarity on this;

Here's more info regarding your questions:
1. This is a documentation error.
2. We cannot mask cpuid[6].ecx on any of these families: 0x10, 0x11, 
0x12, 0x14, 0x16, as the feature is not supported.
3. We cannot mask standard/extended cpuid feature bits on Fam0x11, as the
feature is not supported.

So simple enough additions to your patch to include some family checks:

+    if (c->cpuid_level >= 7)
+        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+    else
+        ebx = eax = 0;
+    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
+         c->x86 == 0x15 && c->x86_model >= 0x2) {
+        if (l7s0_eax > eax)
+            l7s0_eax = eax;
+            l7s0_ebx &= ebx;
+            printk("Writing CPUID leaf 7 subleaf 0 feature mask EAX:EBX -> %08Xh:%08Xh\n",
+                   l7s0_eax, l7s0_ebx);
+    } else
+          skip_l7s0_eax_ebx = 1;
+    ecx = c->cpuid_level >= 6 ? cpuid_ecx(6) : 0;
+    if (ecx && ~thermal_ecx && c->x86 == 0x15) {
+        thermal_ecx &= ecx;
+        printk("Writing CPUID thermal/power feature mask ECX -> %08Xh\n",
+                thermal_ecx);
+    } else
+        skip_thermal_ecx = 1;
   setmask:
-   /* FIXME check if processor supports CPUID masking */
     /* AMD processors prior to family 10h required a 32-bit password */
-   if (c->x86 >= 0x10) {
+   if (c->x86 >= 0x10 && c->x86 != 0x11) {
         wrmsr(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
         wrmsr(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);
+        if (!skip_l7s0_eax_ebx)
+            wrmsr(MSR_AMD_L7S0_FEATURE_MASK, l7s0_ebx, l7s0_eax);
+        if (!skip_thermal_ecx) {
+            rdmsr(MSR_AMD_THRM_FEATURE_MASK, eax, edx);
+            wrmsr(MSR_AMD_THRM_FEATURE_MASK, thermal_ecx, edx);
+        }
     } else {
         wrmsr_amd(MSR_K8_FEATURE_MASK, feat_edx, feat_ecx);
         wrmsr_amd(MSR_K8_EXT_FEATURE_MASK, extfeat_edx, extfeat_ecx);

Tested on a fam16h system with "cpuid_mask_thermal_ecx=0x0 
cpuid_mask_l7s0_ebx=0x0" boot args and works fine.

Thanks,
-Aravind.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-04-03 22:44         ` Aravind Gopalakrishnan
@ 2014-04-04  9:37           ` Jan Beulich
  2014-04-04 16:21             ` Aravind Gopalakrishnan
  0 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-04-04  9:37 UTC (permalink / raw)
  To: Aravind Gopalakrishnan, xen-devel
  Cc: Keir Fraser, Ian.Jackson, ian.campbell, Suravee Suthikulpanit,
	Sherry Hurwitz

>>> On 04.04.14 at 00:44, <aravind.gopalakrishnan@amd.com> wrote:
> On 4/1/2014 6:10 PM, Aravind Gopalakrishnan wrote:
>>> Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
>>> sub-leaf 0 EAX and EBX.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com 
>>> <mailto:jbeulich@suse.com>>
>>>
>>> TBD:
>>> - Fam15M0x is documented to not have MSR C0011002 despite 
>>> CPUID[7,0].EBX != 0 from model 02
>>>   onwards, and contrary to what I see on the system I have access to 
>>> (which is model 02)
>>> - Fam12, Fam14, and Fam16 are documented to not have MSR C0011003 
>>> despite CPUID[6].ECX != 0
>>
>> Fam10 too has cpuid[6].ecx != 0 but no MSR C0011003
>>
>>> - Fam11 is documented to not even have MSR C0011004 and C0011005
>>>
>>
>> I am still trying to get some clarity on this;
> 
> Here's more info regarding your questions:
> 1. This is a documentation error.
> 2. We cannot mask cpuid[6].ecx on any of these families: 0x10, 0x11, 
> 0x12,0x14,0x16 as feature is not supported..
> 3. We cannot mask standard,extended cpuid feature bits on Fam0x11 as 
> feature is not supported.
> 
> So simple enough additions to your patch to include some family checks:
> 
> +    if (c->cpuid_level >= 7)
> +        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
> +    else
> +        ebx = eax = 0;
> +    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
> +         c->x86 == 0x15 && c->x86_model >= 0x2) {

This doesn't match up with any of the items listed above. Am I to
infer that models 0 and 1 have this CPUID leaf non-blank, but can't
mask the features?

Also, is Fam16 indeed not capable of masking features here too?

Assuming "yes" and "no", I'd rather use

    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
         (c->x86 != 0x15 || c->x86_model >= 0x2)) {

But then again models 0 and 1 are documented to have this CPUID
leaf blank, so we shouldn't need a family/model check here at all.

> +        if (l7s0_eax > eax)
> +            l7s0_eax = eax;
> +            l7s0_ebx &= ebx;
> +            printk("Writing CPUID leaf 7 subleaf 0 feature mask EAX:EBX -> %08Xh:%08Xh\n",
> +                   l7s0_eax, l7s0_ebx);
> +    } else
> +          skip_l7s0_eax_ebx = 1;
> +    ecx = c->cpuid_level >= 6 ? cpuid_ecx(6) : 0;
> +    if (ecx && ~thermal_ecx && c->x86 == 0x15) {

Will merge this, albeit I don't like == in checks like this at all.

> +        thermal_ecx &= ecx;
> +        printk("Writing CPUID thermal/power feature mask ECX -> %08Xh\n",
> +                thermal_ecx);
> +    } else
> +        skip_thermal_ecx = 1;
>    setmask:
> -   /* FIXME check if processor supports CPUID masking */
>      /* AMD processors prior to family 10h required a 32-bit password */
> -   if (c->x86 >= 0x10) {
> +   if (c->x86 >= 0x10 && c->x86 != 0x11) {

I'll use this one, but in a separate prefix patch (as I'll want to backport
this), and in a different fashion: I don't think the "else" path to this "if"
applies to Fam11 either based on documentation and your statements
above.

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-04-04  9:37           ` Jan Beulich
@ 2014-04-04 16:21             ` Aravind Gopalakrishnan
  2014-04-07  6:54               ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Aravind Gopalakrishnan @ 2014-04-04 16:21 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Keir Fraser, Ian.Jackson, ian.campbell, Suravee Suthikulpanit,
	Sherry Hurwitz

On 4/4/2014 4:37 AM, Jan Beulich wrote:
>>>> On 04.04.14 at 00:44, <aravind.gopalakrishnan@amd.com> wrote:
>> On 4/1/2014 6:10 PM, Aravind Gopalakrishnan wrote:
>>>> Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
>>>> sub-leaf 0 EAX and EBX.
>>>>
>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com
>>>> <mailto:jbeulich@suse.com>>
>>>>
>>>> TBD:
>>>> - Fam15M0x is documented to not have MSR C0011002 despite
>>>> CPUID[7,0].EBX != 0 from model 02
>>>>    onwards, and contrary to what I see on the system I have access to
>>>> (which is model 02)
>>>> - Fam12, Fam14, and Fam16 are documented to not have MSR C0011003
>>>> despite CPUID[6].ECX != 0
>>> Fam10 too has cpuid[6].ecx != 0 but no MSR C0011003
>>>
>>>> - Fam11 is documented to not even have MSR C0011004 and C0011005
>>>>
>>> I am still trying to get some clarity on this;
>> Here's more info regarding your questions:
>> 1. This is a documentation error.
>> 2. We cannot mask cpuid[6].ecx on any of these families: 0x10, 0x11,
>> 0x12,0x14,0x16 as feature is not supported..
>> 3. We cannot mask standard,extended cpuid feature bits on Fam0x11 as
>> feature is not supported.
>>
>> So simple enough additions to your patch to include some family checks:
>>
>> +    if (c->cpuid_level >= 7)
>> +        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
>> +    else
>> +        ebx = eax = 0;
>> +    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
>> +         c->x86 == 0x15 && c->x86_model >= 0x2) {
> Also, is Fam16 indeed not capable of masking features here too?
>
> Assuming "yes" and "no", I'd rather use
>
>      if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
>           (c->x86 != 0x15 || c->x86_model >= 0x2)) {
>
> But then again models 0 and 1 are documented to have this CPUID
> leaf blank, so we shouldn't need a family/model check here at all.

Yes, you are right. No need for the above family check at all.
But for fam16h, I was not able to access the MSR on my machine.
And here's why:
fam16h can use the MSR to mask the bit only if microcode patch level >= 
0x700010a

While at it, I tried to clean up the issues you pointed out above (and the
== 0x15 check below).
Does this seem better? -
+    if (c->cpuid_level >= 7)
+        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+    else
+        ebx = eax = 0;
+
+    skip_l7s0_eax_ebx = skip_thermal_ecx = 1;
+    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx)) {
+        /* fam16h can mask l7s0 only if microcode lvl >= 0x700010a */
+        if (c->x86 == 0x16) {
+            uint32_t rev;
+            rdmsrl(MSR_AMD_PATCHLEVEL, rev);
+            if (rev < 0x700010a) {
+                printk("Cannot apply l7s0 mask as microcode patch lvl is insufficient\n");
+                /*
+                 * fam16h cannot mask MSR_AMD_THRM_FEATURE_MASK anyway
+                 * so, fall through
+                 */
+                goto setmask;
+            }
+        }
+        if (l7s0_eax > eax)
+            l7s0_eax = eax;
+            l7s0_ebx &= ebx;
+            skip_l7s0_eax_ebx = 0;
+            printk("Writing CPUID leaf 7 subleaf 0 feature mask EAX:EBX -> %08Xh:%08Xh\n",
+                   l7s0_eax, l7s0_ebx);
+    }
+    ecx = c->cpuid_level >= 6 ? cpuid_ecx(6) : 0;
+    /* MSR_AMD_THRM_FEATURE_MASK can be applied only on fam15h */
+    if (c->x86 != 0x15)
+        goto setmask;
+    if (ecx && ~thermal_ecx) {
+        thermal_ecx &= ecx;
+        skip_thermal_ecx = 0;
+        printk("Writing CPUID thermal/power feature mask ECX -> %08Xh\n",
+                thermal_ecx);
+    }
>> +        if (l7s0_eax > eax)
>> +            l7s0_eax = eax;
>> +            l7s0_ebx &= ebx;
>> +            printk("Writing CPUID leaf 7 subleaf 0 feature mask EAX:EBX -> %08Xh:%08Xh\n",
>> +                   l7s0_eax, l7s0_ebx);
>> +    } else
>> +          skip_l7s0_eax_ebx = 1;
>> +    ecx = c->cpuid_level >= 6 ? cpuid_ecx(6) : 0;
>> +    if (ecx && ~thermal_ecx && c->x86 == 0x15) {
> Will merge this, albeit I don't like == in checks like this at all.
>
>> +        thermal_ecx &= ecx;
>> +        printk("Writing CPUID thermal/power feature mask ECX -> %08Xh\n",
>> +                thermal_ecx);
>> +    } else
>> +        skip_thermal_ecx = 1;
>>     setmask:
>> -   /* FIXME check if processor supports CPUID masking */
>>       /* AMD processors prior to family 10h required a 32-bit password */
>> -   if (c->x86 >= 0x10) {
>> +   if (c->x86 >= 0x10 && c->x86 != 0x11) {
> I'll use this one, but in a separate prefix patch (as I'll want to backport
> this), and in a different fashion: I don't think the "else" path to this "if"
> applies to Fam11 either based on documentation and your statements
> above.
>

Yes, missed it. Sorry about that..

Thanks,
-Aravind

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-04-04 16:21             ` Aravind Gopalakrishnan
@ 2014-04-07  6:54               ` Jan Beulich
  2014-04-07  8:52                 ` Andrew Cooper
  0 siblings, 1 reply; 28+ messages in thread
From: Jan Beulich @ 2014-04-07  6:54 UTC (permalink / raw)
  To: Aravind Gopalakrishnan, xen-devel
  Cc: Keir Fraser, ian.campbell, Andrew Cooper, Ian.Jackson,
	Suravee Suthikulpanit, Sherry Hurwitz

>>> On 04.04.14 at 18:21, <aravind.gopalakrishnan@amd.com> wrote:
> On 4/4/2014 4:37 AM, Jan Beulich wrote:
>>>>> On 04.04.14 at 00:44, <aravind.gopalakrishnan@amd.com> wrote:
>>> On 4/1/2014 6:10 PM, Aravind Gopalakrishnan wrote:
>>>>> Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
>>>>> sub-leaf 0 EAX and EBX.
>>>>>
>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com 
>>>>> <mailto:jbeulich@suse.com>>
>>>>>
>>>>> TBD:
>>>>> - Fam15M0x is documented to not have MSR C0011002 despite
>>>>> CPUID[7,0].EBX != 0 from model 02
>>>>>    onwards, and contrary to what I see on the system I have access to
>>>>> (which is model 02)
>>>>> - Fam12, Fam14, and Fam16 are documented to not have MSR C0011003
>>>>> despite CPUID[6].ECX != 0
>>>> Fam10 too has cpuid[6].ecx != 0 but no MSR C0011003
>>>>
>>>>> - Fam11 is documented to not even have MSR C0011004 and C0011005
>>>>>
>>>> I am still trying to get some clarity on this;
>>> Here's more info regarding your questions:
>>> 1. This is a documentation error.
>>> 2. We cannot mask cpuid[6].ecx on any of these families: 0x10, 0x11,
>>> 0x12,0x14,0x16 as feature is not supported..
>>> 3. We cannot mask standard,extended cpuid feature bits on Fam0x11 as
>>> feature is not supported.
>>>
>>> So simple enough additions to your patch to include some family checks:
>>>
>>> +    if (c->cpuid_level >= 7)
>>> +        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
>>> +    else
>>> +        ebx = eax = 0;
>>> +    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
>>> +         c->x86 == 0x15 && c->x86_model >= 0x2) {
>> Also, is Fam16 indeed not capable of masking features here too?
>>
>> Assuming "yes" and "no", I'd rather use
>>
>>      if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
>>           (c->x86 != 0x15 || c->x86_model >= 0x2)) {
>>
>> But then again models 0 and 1 are documented to have this CPUID
>> leaf blank, so we shouldn't need a family/model check here at all.
> 
> Yes, you are right. No need for above family check at all.
> But for fam16h, I was not able to access the MSR on my machine.
> And here's why:
> fam16h can use the MSR to mask the bit only if microcode patch level >= 
> 0x700010a

I think that's going too far - no-one is required to pass these command
line options, and all that would happen if someone did was that the
system would crash during boot. The one thing we may want to check
(and adjust if necessary) is that this code only runs after an eventual
microcode update was already applied, to increase the chances of
things working. After all, if a minimum microcode level is known, I
suppose respective microcode updates are or will be made available
publicly.

Andrew otoh, in his attempt to make the feature masking per-VM,
will clearly need to take this into consideration, or else he'd be
risking runtime crashes.

So I'll repost the patch set with the adjustments done so far, and
we'll go from there.

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-04-07  6:54               ` Jan Beulich
@ 2014-04-07  8:52                 ` Andrew Cooper
  2014-04-07  9:14                   ` Jan Beulich
  0 siblings, 1 reply; 28+ messages in thread
From: Andrew Cooper @ 2014-04-07  8:52 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Keir Fraser, ian.campbell, Ian.Jackson, Aravind Gopalakrishnan,
	Suravee Suthikulpanit, Sherry Hurwitz, xen-devel

On 07/04/14 07:54, Jan Beulich wrote:
>>>> On 04.04.14 at 18:21, <aravind.gopalakrishnan@amd.com> wrote:
>> On 4/4/2014 4:37 AM, Jan Beulich wrote:
>>>>>> On 04.04.14 at 00:44, <aravind.gopalakrishnan@amd.com> wrote:
>>>> On 4/1/2014 6:10 PM, Aravind Gopalakrishnan wrote:
>>>>>> Newer AMD CPUs also allow masking CPUID leaf 6 ECX and CPUID leaf 7
>>>>>> sub-leaf 0 EAX and EBX.
>>>>>>
>>>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com 
>>>>>> <mailto:jbeulich@suse.com>>
>>>>>>
>>>>>> TBD:
>>>>>> - Fam15M0x is documented to not have MSR C0011002 despite
>>>>>> CPUID[7,0].EBX != 0 from model 02
>>>>>>    onwards, and contrary to what I see on the system I have access to
>>>>>> (which is model 02)
>>>>>> - Fam12, Fam14, and Fam16 are documented to not have MSR C0011003
>>>>>> despite CPUID[6].ECX != 0
>>>>> Fam10 too has cpuid[6].ecx != 0 but no MSR C0011003
>>>>>
>>>>>> - Fam11 is documented to not even have MSR C0011004 and C0011005
>>>>>>
>>>>> I am still trying to get some clarity on this;
>>>> Here's more info regarding your questions:
>>>> 1. This is a documentation error.
>>>> 2. We cannot mask cpuid[6].ecx on any of these families: 0x10, 0x11,
>>>> 0x12,0x14,0x16 as feature is not supported..
>>>> 3. We cannot mask standard,extended cpuid feature bits on Fam0x11 as
>>>> feature is not supported.
>>>>
>>>> So simple enough additions to your patch to include some family checks:
>>>>
>>>> +    if (c->cpuid_level >= 7)
>>>> +        cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
>>>> +    else
>>>> +        ebx = eax = 0;
>>>> +    if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
>>>> +         c->x86 == 0x15 && c->x86_model >= 0x2) {
>>> Also, is Fam16 indeed not capable of masking features here too?
>>>
>>> Assuming "yes" and "no", I'd rather use
>>>
>>>      if ((eax | ebx) && ~(l7s0_eax & l7s0_ebx) &&
>>>           (c->x86 != 0x15 || c->x86_model >= 0x2)) {
>>>
>>> But then again models 0 and 1 are documented to have this CPUID
>>> leaf blank, so we shouldn't need a family/model check here at all.
>> Yes, you are right. No need for above family check at all.
>> But for fam16h, I was not able to access the MSR on my machine.
>> And here's why:
>> fam16h can use the MSR to mask the bit only if microcode patch level >= 
>> 0x700010a
> I think that's going too far - no-one is required to pass these command
> line options, and all that would happen if someone did was that the
> system would crash during boot. The one thing we may want to check
> (and adjust if necessary) is that this code only runs after an eventual
> microcode update was already applied, to increase the chances of
> things working. After all, if a minimum microcode level is known, I
> suppose respective microcode updates are or will be made available
> publicly.
>
> Andrew otoh, in his attempt to make the feature masking per-VM,
> will clearly need to take this into consideration, or else he'd be
> risking runtime crashes.

That is a good point which I had not anticipated.  The way my fragments
of per-VM code currently work involves explicitly probing the expected
set of MSRs at boot, disabling if not found.  In a case like this, the
fam16 would not get the masks if the microcode started below 0x700010a
but subsequently got patched higher.

Catching this case at runtime is hard - if the command line option is
given, the BIOS is out of date wrt microcode, and dom0 then boots and
applies new microcode, it is not generally safe to retroactively apply
the boot time mask to a dom0 which has already started running.

~Andrew

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: Fwd: [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs
  2014-04-07  8:52                 ` Andrew Cooper
@ 2014-04-07  9:14                   ` Jan Beulich
  0 siblings, 0 replies; 28+ messages in thread
From: Jan Beulich @ 2014-04-07  9:14 UTC (permalink / raw)
  To: Andrew Cooper
  Cc: Keir Fraser, ian.campbell, Ian.Jackson, Aravind Gopalakrishnan,
	Suravee Suthikulpanit, Sherry Hurwitz, xen-devel

>>> On 07.04.14 at 10:52, <andrew.cooper3@citrix.com> wrote:
> On 07/04/14 07:54, Jan Beulich wrote:
>> Andrew otoh, in his attempt to make the feature masking per-VM,
>> will clearly need to take this into consideration, or else he'd be
>> risking runtime crashes.
> 
> That is a good point which I had not anticipated.  The way my fragments
> of per-VM code currently work involves explicitly probing the expected
> set of MSRs at boot, and disabling the masking if they are not found.
> In a case like this, a Fam16h system would not get the masks if the
> microcode started below 0x700010a but subsequently got patched higher.
> 
> Catching this case at runtime is hard: if the command line option is
> given, the BIOS microcode is out of date, and dom0 boots and applies
> new microcode, then it is not generally safe to retroactively apply the
> boot-time mask to dom0, which has already started running.

I don't think we should go that far - applying microcode post (Xen-)
boot shouldn't affect past choices. But it of course ought to affect
future ones (i.e. for guests yet to be created).
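
Purely as an illustration of that split (nothing like this exists in the
tree yet; the names stand in for the not-yet-posted per-VM masking code),
the decision would simply be latched per domain at creation time rather
than once at boot:

/* Hypothetical per-domain state; invented for this sketch. */
struct domain_cpuid_masking {
    bool_t active;           /* latched at domain creation, never revisited */
};

bool_t host_masks_usable;    /* re-evaluated whenever new microcode is loaded */

void domain_init_cpuid_masking(struct domain_cpuid_masking *m)
{
    /* Snapshot the current host state for this (new) domain only. */
    m->active = host_masks_usable;
}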

Jan

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2014-04-07  9:14 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <532880230200007800125450@nat28.tlf.novell.com>
2014-03-19  9:16 ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Jan Beulich
2014-03-19  9:46   ` [PATCH RFC 1/4] x86/SVM: support data breakpoint extension registers Jan Beulich
2014-03-19  9:47   ` [PATCH RFC 2/4] x86/PV: " Jan Beulich
2014-03-19  9:47   ` [PATCH RFC 3/4] x86/AMD: support further feature masking MSRs Jan Beulich
     [not found]     ` <CAAAAutDJ3CzvGp9jWCS7=b3rQsG81+w_W7qXLKxDWB7QBU23DQ@mail.gmail.com>
2014-04-01 23:10       ` Fwd: " Aravind Gopalakrishnan
2014-04-03 22:44         ` Aravind Gopalakrishnan
2014-04-04  9:37           ` Jan Beulich
2014-04-04 16:21             ` Aravind Gopalakrishnan
2014-04-07  6:54               ` Jan Beulich
2014-04-07  8:52                 ` Andrew Cooper
2014-04-07  9:14                   ` Jan Beulich
2014-03-19  9:48   ` [PATCH RFC 4/4] x86/AMD: clean up pre-canned family/revision handling for CPUID masking Jan Beulich
2014-03-26 14:13   ` [PATCH v2 RFC 2/4] x86/PV: support data breakpoint extension registers Jan Beulich
2014-03-27 15:28     ` Ian Campbell
2014-03-27 16:03       ` Jan Beulich
2014-03-27 16:23         ` Ian Campbell
2014-03-27 16:44           ` Jan Beulich
2014-03-27 17:29             ` Ian Campbell
2014-03-28  8:05               ` Jan Beulich
2014-03-28  9:55                 ` Ian Campbell
2014-03-28 10:43                   ` Jan Beulich
2014-03-28 10:56                     ` Ian Campbell
2014-03-27 16:03       ` Andrew Cooper
2014-03-27 16:37         ` Jan Beulich
2014-03-27 17:11           ` Andrew Cooper
2014-03-28 10:46             ` Jan Beulich
     [not found]   ` <CAAAAutAgdSJqYEjc4dtixuTS2S=PUx-L4Sy=JCQRZ57rTdoPCQ@mail.gmail.com>
2014-03-26 22:35     ` [PATCH RFC 0/4] x86/AMD: support newer hardware features Aravind Gopalakrishnan
2014-04-01 23:10       ` Aravind Gopalakrishnan
