From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
To: JBeulich@suse.com, kevin.tian@intel.com,
	suravee.suthikulpanit@amd.com, Aravind.Gopalakrishnan@amd.com,
	dietmar.hahn@ts.fujitsu.com, dgdegra@tycho.nsa.gov,
	andrew.cooper3@citrix.com
Cc: boris.ostrovsky@oracle.com, tim@xen.org, jun.nakajima@intel.com,
	xen-devel@lists.xen.org
Subject: [PATCH v20 10/13] x86/VPMU: Handle PMU interrupts for PV(H) guests
Date: Thu,  9 Apr 2015 11:44:52 -0400
Message-ID: <1428594295-2024-11-git-send-email-boris.ostrovsky@oracle.com>
In-Reply-To: <1428594295-2024-1-git-send-email-boris.ostrovsky@oracle.com>

Add support for handling PMU interrupts for PV(H) guests.

The VPMU of the interrupted VCPU is unloaded until the guest issues the
XENPMU_flush hypercall. This allows the guest to access PMU MSR values that
are stored in the VPMU context, which is shared between the hypervisor and
the domain, thus avoiding traps to the hypervisor.
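
For illustration, a rough sketch of the guest-side flow this enables
(hypothetical guest code: only struct xen_pmu_data, PMU_CACHED, VIRQ_XENPMU,
xen_pmu_params_t and XENPMU_flush come from the public headers in this
series; the HYPERVISOR_xenpmu_op wrapper, the per-CPU xenpmu_data mapping and
process_sample() are made-up names standing in for whatever the guest
provides):

    /* Guest handler for the VIRQ_XENPMU virtual interrupt. */
    static void pmu_virq_handler(void)
    {
        /* Page registered with XENPMU_init; accessor name is illustrative. */
        struct xen_pmu_data *pd = this_cpu_xenpmu_data();
        xen_pmu_params_t params = { .val = 0 };

        if ( !(pd->pmu.pmu_flags & PMU_CACHED) )
            return;

        /*
         * Xen saved the interrupted context into the shared page, so the
         * sample (registers, domain/vcpu ids) and the PMU MSR values can
         * be read here without trapping into the hypervisor.
         */
        process_sample(&pd->pmu.r.regs, pd->domain_id, pd->vcpu_id);

        /*
         * Write the (possibly updated) cached MSR values back to hardware
         * and re-enable PMU interrupts for this VCPU.
         */
        HYPERVISOR_xenpmu_op(XENPMU_flush, &params);
    }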

Since the interrupt handler may now force a VPMU context save (i.e. set the
VPMU_CONTEXT_SAVE flag), we need to adjust amd_vpmu_save(), which until now
expected this flag to be set only when the counters were already stopped.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Acked-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Acked-by: Jan Beulich <jbeulich@suse.com>
---
* Updated patch title (include PVH guests)
* vpmu_lvtpc_update() initializes curr at definition time
* Drop curr in do_xenpmu_op()'s XENPMU_lvtpc_set case.
* Declared domid as domid_t type

 xen/arch/x86/hvm/svm/vpmu.c       |  11 +-
 xen/arch/x86/hvm/vpmu.c           | 211 +++++++++++++++++++++++++++++++++++---
 xen/include/public/arch-x86/pmu.h |   6 ++
 xen/include/public/pmu.h          |   2 +
 xen/include/xsm/dummy.h           |   4 +-
 xen/xsm/flask/hooks.c             |   2 +
 6 files changed, 215 insertions(+), 21 deletions(-)

diff --git a/xen/arch/x86/hvm/svm/vpmu.c b/xen/arch/x86/hvm/svm/vpmu.c
index 474d0db..0997901 100644
--- a/xen/arch/x86/hvm/svm/vpmu.c
+++ b/xen/arch/x86/hvm/svm/vpmu.c
@@ -228,17 +228,12 @@ static int amd_vpmu_save(struct vcpu *v)
     struct vpmu_struct *vpmu = vcpu_vpmu(v);
     unsigned int i;
 
-    /*
-     * Stop the counters. If we came here via vpmu_save_force (i.e.
-     * when VPMU_CONTEXT_SAVE is set) counters are already stopped.
-     */
+    for ( i = 0; i < num_counters; i++ )
+        wrmsrl(ctrls[i], 0);
+
     if ( !vpmu_is_set(vpmu, VPMU_CONTEXT_SAVE) )
     {
         vpmu_set(vpmu, VPMU_FROZEN);
-
-        for ( i = 0; i < num_counters; i++ )
-            wrmsrl(ctrls[i], 0);
-
         return 0;
     }
 
diff --git a/xen/arch/x86/hvm/vpmu.c b/xen/arch/x86/hvm/vpmu.c
index 5fbb799..37e612a 100644
--- a/xen/arch/x86/hvm/vpmu.c
+++ b/xen/arch/x86/hvm/vpmu.c
@@ -85,31 +85,56 @@ static void __init parse_vpmu_param(char *s)
 void vpmu_lvtpc_update(uint32_t val)
 {
     struct vpmu_struct *vpmu;
+    struct vcpu *curr = current;
 
-    if ( vpmu_mode == XENPMU_MODE_OFF )
+    if ( likely(vpmu_mode == XENPMU_MODE_OFF) )
         return;
 
-    vpmu = vcpu_vpmu(current);
+    vpmu = vcpu_vpmu(curr);
 
     vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | (val & APIC_LVT_MASKED);
-    apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
+
+    /* Postpone APIC updates for PV(H) guests if PMU interrupt is pending */
+    if ( is_hvm_vcpu(curr) || !vpmu->xenpmu_data ||
+         !(vpmu->xenpmu_data->pmu.pmu_flags & PMU_CACHED) )
+        apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
 }
 
 int vpmu_do_wrmsr(unsigned int msr, uint64_t msr_content, uint64_t supported)
 {
-    struct vpmu_struct *vpmu = vcpu_vpmu(current);
+    struct vcpu *curr = current;
+    struct vpmu_struct *vpmu;
 
     if ( vpmu_mode == XENPMU_MODE_OFF )
         return 0;
 
+    vpmu = vcpu_vpmu(curr);
     if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_wrmsr )
-        return vpmu->arch_vpmu_ops->do_wrmsr(msr, msr_content, supported);
+    {
+        int ret = vpmu->arch_vpmu_ops->do_wrmsr(msr, msr_content, supported);
+
+        /*
+         * We may have received a PMU interrupt during WRMSR handling
+         * and since do_wrmsr may load VPMU context we should save
+         * (and unload) it again.
+         */
+        if ( !is_hvm_vcpu(curr) && vpmu->xenpmu_data &&
+             (vpmu->xenpmu_data->pmu.pmu_flags & PMU_CACHED) )
+        {
+            vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
+            vpmu->arch_vpmu_ops->arch_vpmu_save(curr);
+            vpmu_reset(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+        }
+        return ret;
+    }
+
     return 0;
 }
 
 int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
 {
-    struct vpmu_struct *vpmu = vcpu_vpmu(current);
+    struct vcpu *curr = current;
+    struct vpmu_struct *vpmu;
 
     if ( vpmu_mode == XENPMU_MODE_OFF )
     {
@@ -117,24 +142,163 @@ int vpmu_do_rdmsr(unsigned int msr, uint64_t *msr_content)
         return 0;
     }
 
+    vpmu = vcpu_vpmu(curr);
     if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->do_rdmsr )
-        return vpmu->arch_vpmu_ops->do_rdmsr(msr, msr_content);
+    {
+        int ret = vpmu->arch_vpmu_ops->do_rdmsr(msr, msr_content);
+
+        if ( !is_hvm_vcpu(curr) && vpmu->xenpmu_data &&
+             (vpmu->xenpmu_data->pmu.pmu_flags & PMU_CACHED) )
+        {
+            vpmu_set(vpmu, VPMU_CONTEXT_SAVE);
+            vpmu->arch_vpmu_ops->arch_vpmu_save(curr);
+            vpmu_reset(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+        }
+        return ret;
+    }
     else
         *msr_content = 0;
 
     return 0;
 }
 
+static inline struct vcpu *choose_hwdom_vcpu(void)
+{
+    unsigned idx;
+
+    if ( hardware_domain->max_vcpus == 0 )
+        return NULL;
+
+    idx = smp_processor_id() % hardware_domain->max_vcpus;
+
+    return hardware_domain->vcpu[idx];
+}
+
 void vpmu_do_interrupt(struct cpu_user_regs *regs)
 {
-    struct vcpu *v = current;
-    struct vpmu_struct *vpmu = vcpu_vpmu(v);
+    struct vcpu *sampled = current, *sampling;
+    struct vpmu_struct *vpmu;
+
+    /* dom0 will handle interrupt for special domains (e.g. idle domain) */
+    if ( sampled->domain->domain_id >= DOMID_FIRST_RESERVED )
+    {
+        sampling = choose_hwdom_vcpu();
+        if ( !sampling )
+            return;
+    }
+    else
+        sampling = sampled;
+
+    vpmu = vcpu_vpmu(sampling);
+    if ( !is_hvm_vcpu(sampling) )
+    {
+        /* PV(H) guest */
+        const struct cpu_user_regs *cur_regs;
+        uint64_t *flags = &vpmu->xenpmu_data->pmu.pmu_flags;
+        domid_t domid = DOMID_SELF;
+
+        if ( !vpmu->xenpmu_data )
+            return;
+
+        if ( is_pvh_vcpu(sampling) &&
+             !vpmu->arch_vpmu_ops->do_interrupt(regs) )
+            return;
+
+        if ( *flags & PMU_CACHED )
+            return;
+
+        /* PV guest will be reading PMU MSRs from xenpmu_data */
+        vpmu_set(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+        vpmu->arch_vpmu_ops->arch_vpmu_save(sampling);
+        vpmu_reset(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
+
+        if ( has_hvm_container_vcpu(sampled) )
+            *flags = 0;
+        else
+            *flags = PMU_SAMPLE_PV;
+
+        /* Store appropriate registers in xenpmu_data */
+        /* FIXME: 32-bit PVH should go here as well */
+        if ( is_pv_32bit_vcpu(sampling) )
+        {
+            /*
+             * 32-bit dom0 cannot process Xen's addresses (which are 64 bit)
+             * and therefore we treat it the same way as a non-privileged
+             * PV 32-bit domain.
+             */
+            struct compat_pmu_regs *cmp;
+
+            cur_regs = guest_cpu_user_regs();
+
+            cmp = (void *)&vpmu->xenpmu_data->pmu.r.regs;
+            cmp->ip = cur_regs->rip;
+            cmp->sp = cur_regs->rsp;
+            cmp->flags = cur_regs->eflags;
+            cmp->ss = cur_regs->ss;
+            cmp->cs = cur_regs->cs;
+            if ( (cmp->cs & 3) > 1 )
+                *flags |= PMU_SAMPLE_USER;
+        }
+        else
+        {
+            struct xen_pmu_regs *r = &vpmu->xenpmu_data->pmu.r.regs;
+
+            if ( (vpmu_mode & XENPMU_MODE_SELF) )
+                cur_regs = guest_cpu_user_regs();
+            else if ( !guest_mode(regs) && is_hardware_domain(sampling->domain) )
+            {
+                cur_regs = regs;
+                domid = DOMID_XEN;
+            }
+            else
+                cur_regs = guest_cpu_user_regs();
+
+            r->ip = cur_regs->rip;
+            r->sp = cur_regs->rsp;
+            r->flags = cur_regs->eflags;
+
+            if ( !has_hvm_container_vcpu(sampled) )
+            {
+                r->ss = cur_regs->ss;
+                r->cs = cur_regs->cs;
+                if ( !(sampled->arch.flags & TF_kernel_mode) )
+                    *flags |= PMU_SAMPLE_USER;
+            }
+            else
+            {
+                struct segment_register seg;
+
+                hvm_get_segment_register(sampled, x86_seg_cs, &seg);
+                r->cs = seg.sel;
+                hvm_get_segment_register(sampled, x86_seg_ss, &seg);
+                r->ss = seg.sel;
+                r->cpl = seg.attr.fields.dpl;
+                if ( !(sampled->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
+                    *flags |= PMU_SAMPLE_REAL;
+            }
+        }
+
+        vpmu->xenpmu_data->domain_id = domid;
+        vpmu->xenpmu_data->vcpu_id = sampled->vcpu_id;
+        vpmu->xenpmu_data->pcpu_id = smp_processor_id();
+
+        vpmu->hw_lapic_lvtpc |= APIC_LVT_MASKED;
+        apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
+        *flags |= PMU_CACHED;
+
+        send_guest_vcpu_virq(sampling, VIRQ_XENPMU);
+
+        return;
+    }
 
     if ( vpmu->arch_vpmu_ops )
     {
-        struct vlapic *vlapic = vcpu_vlapic(v);
+        struct vlapic *vlapic = vcpu_vlapic(sampling);
         u32 vlapic_lvtpc;
 
+        /* We don't support (yet) HVM dom0 */
+        ASSERT(sampling == sampled);
+
         if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) ||
              !is_vlapic_lvtpc_enabled(vlapic) )
             return;
@@ -147,7 +311,7 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
             vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
             break;
         case APIC_MODE_NMI:
-            v->nmi_pending = 1;
+            sampling->nmi_pending = 1;
             break;
         }
     }
@@ -245,7 +409,9 @@ void vpmu_load(struct vcpu *v)
     local_irq_enable();
 
     /* Only when PMU is counting, we load PMU context immediately. */
-    if ( !vpmu_is_set(vpmu, VPMU_RUNNING) )
+    if ( !vpmu_is_set(vpmu, VPMU_RUNNING) ||
+         (!is_hvm_vcpu(vpmu_vcpu(vpmu)) &&
+          (vpmu->xenpmu_data->pmu.pmu_flags & PMU_CACHED)) )
         return;
 
     if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_load )
@@ -265,6 +431,8 @@ void vpmu_initialise(struct vcpu *v)
 
     BUILD_BUG_ON(sizeof(struct xen_pmu_intel_ctxt) > XENPMU_CTXT_PAD_SZ);
     BUILD_BUG_ON(sizeof(struct xen_pmu_amd_ctxt) > XENPMU_CTXT_PAD_SZ);
+    BUILD_BUG_ON(sizeof(struct xen_pmu_regs) > XENPMU_REGS_PAD_SZ);
+    BUILD_BUG_ON(sizeof(struct compat_pmu_regs) > XENPMU_REGS_PAD_SZ);
 
     ASSERT(!vpmu->flags && !vpmu->context);
 
@@ -449,7 +617,9 @@ void vpmu_dump(struct vcpu *v)
 long do_xenpmu_op(unsigned int op, XEN_GUEST_HANDLE_PARAM(xen_pmu_params_t) arg)
 {
     int ret;
+    struct vcpu *curr;
     struct xen_pmu_params pmu_params = {.val = 0};
+    struct xen_pmu_data *xenpmu_data;
 
     if ( !opt_vpmu_enabled )
         return -EOPNOTSUPP;
@@ -552,6 +722,23 @@ long do_xenpmu_op(unsigned int op, XEN_GUEST_HANDLE_PARAM(xen_pmu_params_t) arg)
         pvpmu_finish(current->domain, &pmu_params);
         break;
 
+    case XENPMU_lvtpc_set:
+        xenpmu_data = current->arch.vpmu.xenpmu_data;
+        if ( xenpmu_data == NULL )
+            return -EINVAL;
+        vpmu_lvtpc_update(xenpmu_data->pmu.l.lapic_lvtpc);
+        break;
+
+    case XENPMU_flush:
+        curr = current;
+        xenpmu_data = curr->arch.vpmu.xenpmu_data;
+        if ( xenpmu_data == NULL )
+            return -EINVAL;
+        xenpmu_data->pmu.pmu_flags &= ~PMU_CACHED;
+        vpmu_lvtpc_update(xenpmu_data->pmu.l.lapic_lvtpc);
+        vpmu_load(curr);
+        break;
+
     default:
         ret = -EINVAL;
     }
diff --git a/xen/include/public/arch-x86/pmu.h b/xen/include/public/arch-x86/pmu.h
index 8bbe341..74a91ac 100644
--- a/xen/include/public/arch-x86/pmu.h
+++ b/xen/include/public/arch-x86/pmu.h
@@ -51,6 +51,12 @@ struct xen_pmu_regs {
 typedef struct xen_pmu_regs xen_pmu_regs_t;
 DEFINE_XEN_GUEST_HANDLE(xen_pmu_regs_t);
 
+/* PMU flags */
+#define PMU_CACHED         (1<<0) /* PMU MSRs are cached in the context */
+#define PMU_SAMPLE_USER    (1<<1) /* Sample is from user or kernel mode */
+#define PMU_SAMPLE_REAL    (1<<2) /* Sample is from realmode */
+#define PMU_SAMPLE_PV      (1<<3) /* Sample from a PV guest */
+
 struct xen_pmu_arch {
     union {
         struct xen_pmu_regs regs;
diff --git a/xen/include/public/pmu.h b/xen/include/public/pmu.h
index 81210d3..548aeeb 100644
--- a/xen/include/public/pmu.h
+++ b/xen/include/public/pmu.h
@@ -27,6 +27,8 @@
 #define XENPMU_feature_set     3
 #define XENPMU_init            4
 #define XENPMU_finish          5
+#define XENPMU_lvtpc_set       6
+#define XENPMU_flush           7 /* Write cached MSR values to HW     */
 /* ` } */
 
 /* Parameters structure for HYPERVISOR_xenpmu_op call */
diff --git a/xen/include/xsm/dummy.h b/xen/include/xsm/dummy.h
index ae47135..1ad4ecc 100644
--- a/xen/include/xsm/dummy.h
+++ b/xen/include/xsm/dummy.h
@@ -666,7 +666,9 @@ static XSM_INLINE int xsm_pmu_op (XSM_DEFAULT_ARG struct domain *d, int op)
     case XENPMU_feature_get:
         return xsm_default_action(XSM_PRIV, d, current->domain);
     case XENPMU_init:
-    case XENPMU_finish: 
+    case XENPMU_finish:
+    case XENPMU_lvtpc_set:
+    case XENPMU_flush:
         return xsm_default_action(XSM_HOOK, d, current->domain);
     default:
         return -EPERM;
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 8e85af3..b5c3e65 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -1517,6 +1517,8 @@ static int flask_pmu_op (struct domain *d, unsigned int op)
                             XEN2__PMU_CTRL, NULL);
     case XENPMU_init:
     case XENPMU_finish:
+    case XENPMU_lvtpc_set:
+    case XENPMU_flush:
         return avc_has_perm(dsid, SECINITSID_XEN, SECCLASS_XEN2,
                             XEN2__PMU_USE, NULL);
     default:
-- 
1.8.1.4
