From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Subject: [PATCH v18 14/16] x86/VPMU: NMI-based VPMU support
Date: Mon, 16 Feb 2015 17:26:57 -0500
Message-ID: <1424125619-10851-15-git-send-email-boris.ostrovsky@oracle.com>
In-Reply-To: <1424125619-10851-1-git-send-email-boris.ostrovsky@oracle.com>
References: <1424125619-10851-1-git-send-email-boris.ostrovsky@oracle.com>
To: JBeulich@suse.com, kevin.tian@intel.com, suravee.suthikulpanit@amd.com,
    Aravind.Gopalakrishnan@amd.com, dietmar.hahn@ts.fujitsu.com,
    dgdegra@tycho.nsa.gov, andrew.cooper3@citrix.com
Cc: boris.ostrovsky@oracle.com, tim@xen.org, jun.nakajima@intel.com,
    xen-devel@lists.xen.org
List-Id: xen-devel@lists.xenproject.org

Add support for using NMIs as PMU interrupts, to allow profiling the
hypervisor while interrupts are disabled.

Most of the processing is still performed by vpmu_do_interrupt(). However,
since certain operations are not NMI-safe we defer them to a softirq that
vpmu_do_interrupt() schedules:
* For PV guests this is send_guest_vcpu_virq().
* For HVM guests it is vlapic accesses and hvm_get_segment_register() (the
  latter can be called in privileged profiling mode when the interrupted
  guest is an HVM one).

With send_guest_vcpu_virq() and hvm_get_segment_register() for PV(H) and
vlapic accesses for HVM moved to the softirq, the only routines/macros that
vpmu_do_interrupt() calls in NMI mode are:
* memcpy()
* querying domain type (is_XX_domain())
* guest_cpu_user_regs()
* XLAT_cpu_user_regs()
* raise_softirq()
* vcpu_vpmu()
* vpmu_ops->arch_vpmu_save()
* vpmu_ops->do_interrupt()

The latter two only access PMU MSRs with {rd,wr}msrl() (not the _safe
versions, which would not be NMI-safe).

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 docs/misc/xen-command-line.markdown |   8 +-
 xen/arch/x86/hvm/svm/vpmu.c         |   3 +-
 xen/arch/x86/hvm/vmx/vpmu_core2.c   |   3 +-
 xen/arch/x86/hvm/vpmu.c             | 229 ++++++++++++++++++++++++++++--------
 xen/include/asm-x86/hvm/vpmu.h      |   4 +-
 xen/include/asm-x86/softirq.h       |   3 +-
 6 files changed, 193 insertions(+), 57 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index bc316be..0ab1188 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1330,11 +1330,11 @@ Use Virtual Processor ID support if available.  This prevents the need for TLB
 flushes on VM entry and exit, increasing performance.
 
 ### vpmu
-> `= ( bts )`
+> `= ( [nmi,][bts] )`
 
 > Default: `off`
 
-Switch on the virtualized performance monitoring unit for HVM guests.
+Switch on the virtualized performance monitoring unit.
 
 If the current cpu isn't supported a message like
 'VPMU: Initialization failed. ...'
@@ -1348,6 +1348,10 @@ feature is switched on on Intel processors supporting this feature.
 
 Note that if **watchdog** option is also specified vpmu will be turned off.
 
+If 'vpmu=nmi' is specified the PMU interrupt will cause an NMI instead of a
+regular vector interrupt (which is the default). This can be useful for
+sampling hypervisor code that is executed with interrupts disabled.
+
 *Warning:*
 As the BTS virtualisation is not 100% safe and because of the nehalem quirk
 don't use the vpmu flag on production systems with Intel cpus!
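(As an aside, not part of the patch: given the option syntax documented above
and the comma-separated parsing added below, the flags can be combined on the
Xen command line, e.g.

    vpmu=nmi,bts

to get NMI-delivered PMU interrupts together with BTS. How that string is
passed to the hypervisor -- boot-loader stanza, xen.cfg, etc. -- depends on
the local setup and is not prescribed by this patch.)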
diff --git a/xen/arch/x86/hvm/svm/vpmu.c b/xen/arch/x86/hvm/svm/vpmu.c
index 68113c7..7ddce33 100644
--- a/xen/arch/x86/hvm/svm/vpmu.c
+++ b/xen/arch/x86/hvm/svm/vpmu.c
@@ -168,7 +168,7 @@ static void amd_vpmu_unset_msr_bitmap(struct vcpu *v)
     msr_bitmap_off(vpmu);
 }
 
-static int amd_vpmu_do_interrupt(struct cpu_user_regs *regs)
+static int amd_vpmu_do_interrupt(const struct cpu_user_regs *regs)
 {
     return 1;
 }
@@ -220,6 +220,7 @@ static inline void context_save(struct vpmu_struct *vpmu)
         rdmsrl(counters[i], counter_regs[i]);
 }
 
+/* Must be NMI-safe */
 static int amd_vpmu_save(struct vpmu_struct *vpmu)
 {
     struct vcpu *v;
diff --git a/xen/arch/x86/hvm/vmx/vpmu_core2.c b/xen/arch/x86/hvm/vmx/vpmu_core2.c
index 8067d83..0c7fd74 100644
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c
@@ -305,6 +305,7 @@ static inline void __core2_vpmu_save(struct vpmu_struct *vpmu)
         rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, core2_vpmu_cxt->global_status);
 }
 
+/* Must be NMI-safe */
 static int core2_vpmu_save(struct vpmu_struct *vpmu)
 {
     struct vcpu *v = vpmu_vcpu(vpmu);
@@ -720,7 +721,7 @@ static void core2_vpmu_dump(const struct vcpu *v)
     }
 }
 
-static int core2_vpmu_do_interrupt(struct cpu_user_regs *regs)
+static int core2_vpmu_do_interrupt(const struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     u64 msr_content;
diff --git a/xen/arch/x86/hvm/vpmu.c b/xen/arch/x86/hvm/vpmu.c
index 651cb00..cdead13 100644
--- a/xen/arch/x86/hvm/vpmu.c
+++ b/xen/arch/x86/hvm/vpmu.c
@@ -56,29 +56,47 @@ static bool_t __read_mostly vpmu_disabled = 1;
 static void parse_vpmu_param(char *s);
 custom_param("vpmu", parse_vpmu_param);
 
+static void pmu_softnmi(void);
+
 static DEFINE_PER_CPU(struct vcpu *, last_vcpu);
+static DEFINE_PER_CPU(struct vcpu *, sampled_vcpu);
+
+static uint32_t __read_mostly vpmu_interrupt_type = PMU_APIC_VECTOR;
 
 static void __init parse_vpmu_param(char *s)
 {
-    switch ( parse_bool(s) )
-    {
-    case 0:
-        break;
-    default:
-        if ( !strcmp(s, "bts") )
-            vpmu_features |= XENPMU_FEATURE_INTEL_BTS;
-        else if ( *s )
+    char *ss;
+
+    vpmu_mode = XENPMU_MODE_SELF;
+    vpmu_disabled = 0;
+    if (*s == '\0')
+        return;
+
+    do {
+        ss = strchr(s, ',');
+        if ( ss )
+            *ss = '\0';
+
+        switch ( parse_bool(s) )
         {
-            printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
-            break;
+        default:
+            if ( !strcmp(s, "nmi") )
+                vpmu_interrupt_type = APIC_DM_NMI;
+            else if ( !strcmp(s, "bts") )
+                vpmu_features |= XENPMU_FEATURE_INTEL_BTS;
+            else
+            {
+                printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
+        case 0:
+                vpmu_mode = XENPMU_MODE_OFF;
+                vpmu_disabled = 1;
+        case 1:
+                return;
+            }
         }
-        /* fall through */
-    case 1:
-        /* Default VPMU mode */
-        vpmu_mode = XENPMU_MODE_SELF;
-        vpmu_disabled = 0;
-        break;
-    }
+
+        s = ss + 1;
+    } while ( ss );
 }
 
 void vpmu_lvtpc_update(uint32_t val)
@@ -92,7 +110,7 @@ void vpmu_lvtpc_update(uint32_t val)
     curr = current;
     vpmu = vcpu_vpmu(curr);
 
-    vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | (val & APIC_LVT_MASKED);
+    vpmu->hw_lapic_lvtpc = vpmu_interrupt_type | (val & APIC_LVT_MASKED);
 
     /* Postpone APIC updates for PV(H) guests if PMU interrupt is pending */
     if ( is_hvm_vcpu(curr) || !vpmu->xenpmu_data ||
@@ -100,6 +118,30 @@ void vpmu_lvtpc_update(uint32_t val)
         apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
 }
 
+static void vpmu_send_interrupt(struct vcpu *v)
+{
+    struct vlapic *vlapic;
+    u32 vlapic_lvtpc;
+
+    ASSERT(is_hvm_vcpu(v));
+
+    vlapic = vcpu_vlapic(v);
+    if ( !is_vlapic_lvtpc_enabled(vlapic) )
+        return;
+
+    vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC);
+
+    switch ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) )
+    {
+    case APIC_MODE_FIXED:
+        vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
+        break;
+    case APIC_MODE_NMI:
+        v->nmi_pending = 1;
+        break;
+    }
+}
+
 int vpmu_do_msr(unsigned int msr, uint64_t *msr_content,
                 uint64_t supported, bool_t is_write)
 {
@@ -157,7 +199,7 @@ static struct vcpu *choose_hwdom_vcpu(void)
     return hardware_domain->vcpu[idx];
 }
 
-void vpmu_do_interrupt(struct cpu_user_regs *regs)
+int vpmu_do_interrupt(const struct cpu_user_regs *regs)
 {
     struct vcpu *sampled = current, *sampling;
     struct vpmu_struct *vpmu;
@@ -171,7 +213,7 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
     {
         sampling = choose_hwdom_vcpu();
         if ( !sampling )
-            return;
+            return 0;
     }
     else
         sampling = sampled;
@@ -185,15 +227,15 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
         uint32_t domid;
 
         if ( !vpmu->xenpmu_data )
-            return;
+            return 0;
 
         if ( is_pvh_vcpu(sampling) &&
              !(vpmu_mode & XENPMU_MODE_ALL) &&
             !vpmu->arch_vpmu_ops->do_interrupt(regs) )
-            return;
+            return 0;
 
         if ( *flags & PMU_CACHED )
-            return;
+            return 1;
 
         /* PV guest will be reading PMU MSRs from xenpmu_data */
         vpmu_set(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
@@ -260,15 +302,20 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
         }
         else
         {
-            struct segment_register seg;
-
-            hvm_get_segment_register(sampled, x86_seg_cs, &seg);
-            r->cs = seg.sel;
-            hvm_get_segment_register(sampled, x86_seg_ss, &seg);
-            r->ss = seg.sel;
-            r->cpl = seg.attr.fields.dpl;
             if ( !(sampled->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
                 *flags |= PMU_SAMPLE_REAL;
+
+            /* Unsafe in NMI context, defer to softint later. */
+            if ( vpmu_interrupt_type != APIC_DM_NMI )
+            {
+                struct segment_register seg;
+
+                hvm_get_segment_register(sampled, x86_seg_cs, &seg);
+                r->cs = seg.sel;
+                hvm_get_segment_register(sampled, x86_seg_ss, &seg);
+                r->ss = seg.sel;
+                r->cpl = seg.attr.fields.dpl;
+            }
         }
     }
 
@@ -280,35 +327,37 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
             apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
 
         *flags |= PMU_CACHED;
 
-        send_guest_vcpu_virq(sampling, VIRQ_XENPMU);
+        if ( vpmu_interrupt_type == APIC_DM_NMI )
+        {
+            this_cpu(sampled_vcpu) = sampled;
+            raise_softirq(PMU_SOFTIRQ);
+        }
+        else
+            send_guest_vcpu_virq(sampling, VIRQ_XENPMU);
 
-        return;
+        return 1;
     }
 
     if ( vpmu->arch_vpmu_ops )
     {
-        struct vlapic *vlapic = vcpu_vlapic(sampling);
-        u32 vlapic_lvtpc;
-
         /* We don't support (yet) HVM dom0 */
         ASSERT(sampling == sampled);
 
-        if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) ||
-             !is_vlapic_lvtpc_enabled(vlapic) )
-            return;
-
-        vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC);
+        if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) )
+            return 0;
 
-        switch ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) )
+        if ( vpmu_interrupt_type == APIC_DM_NMI )
         {
-        case APIC_MODE_FIXED:
-            vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
-            break;
-        case APIC_MODE_NMI:
-            sampling->nmi_pending = 1;
-            break;
+            this_cpu(sampled_vcpu) = sampled;
+            raise_softirq(PMU_SOFTIRQ);
         }
+        else
+            vpmu_send_interrupt(sampling);
+
+        return 1;
     }
+
+    return 0;
 }
 
 void vpmu_do_cpuid(unsigned int input,
@@ -336,6 +385,9 @@ static void vpmu_save_force(void *arg)
     vpmu_reset(vpmu, VPMU_CONTEXT_SAVE);
 
     per_cpu(last_vcpu, smp_processor_id()) = NULL;
+
+    /* Make sure there are no outstanding PMU NMIs */
+    pmu_softnmi();
 }
 
 void vpmu_save(struct vpmu_struct *vpmu)
@@ -352,7 +404,10 @@ void vpmu_save(struct vpmu_struct *vpmu)
     if ( vpmu->arch_vpmu_ops->arch_vpmu_save(vpmu) )
         vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
 
-    apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
+    apic_write(APIC_LVTPC, vpmu_interrupt_type | APIC_LVT_MASKED);
+
+    /* Make sure there are no outstanding PMU NMIs */
+    pmu_softnmi();
 }
 
 void vpmu_load(struct vpmu_struct *vpmu)
@@ -403,6 +458,9 @@ void vpmu_load(struct vpmu_struct *vpmu)
          (vpmu->xenpmu_data->pmu.pmu_flags & PMU_CACHED)) )
         return;
 
+    /* Make sure there are no outstanding PMU NMIs from previous vcpu */
+    pmu_softnmi();
+
     if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_load )
     {
         apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
@@ -426,7 +484,7 @@ void vpmu_initialise(struct vcpu *v)
         vpmu_destroy(v);
     vpmu_clear(vpmu);
     vpmu->context = NULL;
-    vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | APIC_LVT_MASKED;
+    vpmu->hw_lapic_lvtpc = vpmu_interrupt_type | APIC_LVT_MASKED;
 
     switch ( vendor )
     {
@@ -487,6 +545,55 @@ void vpmu_destroy(struct vcpu *v)
     }
 }
 
+/* Process the softirq set by PMU NMI handler */
+static void pmu_softnmi(void)
+{
+    unsigned int cpu = smp_processor_id();
+    struct vcpu *v, *sampled = per_cpu(sampled_vcpu, cpu);
+
+    if ( sampled == NULL )
+        return;
+
+    per_cpu(sampled_vcpu, cpu) = NULL;
+
+    if ( (vpmu_mode & XENPMU_MODE_ALL) ||
+         (sampled->domain->domain_id >= DOMID_FIRST_RESERVED) )
+    {
+        v = choose_hwdom_vcpu();
+        if ( !v )
+            return;
+    }
+    else
+    {
+        if ( is_hvm_vcpu(sampled) )
+        {
+            vpmu_send_interrupt(sampled);
+            return;
+        }
+        v = sampled;
+    }
+
+    if ( has_hvm_container_vcpu(sampled) )
+    {
+        struct segment_register seg;
+        struct xen_pmu_arch *pmu = &v->arch.vpmu.xenpmu_data->pmu;
+        struct xen_pmu_regs *r = &pmu->r.regs;
+
+        hvm_get_segment_register(sampled, x86_seg_cs, &seg);
+        r->cs = seg.sel;
+        hvm_get_segment_register(sampled, x86_seg_ss, &seg);
+        r->ss = seg.sel;
+        r->cpl = seg.attr.fields.dpl;
+    }
+
+    send_guest_vcpu_virq(v, VIRQ_XENPMU);
+}
+
+int pmu_nmi_interrupt(const struct cpu_user_regs *regs, int cpu)
+{
+    return vpmu_do_interrupt(regs);
+}
+
 static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
 {
     struct vcpu *v;
@@ -502,6 +609,7 @@ static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
          (d->vcpu[params->vcpu] == NULL) )
         return -EINVAL;
 
+    v = d->vcpu[params->vcpu];
     if ( v->arch.vpmu.xenpmu_data )
         return -EINVAL;
 
@@ -515,7 +623,6 @@ static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
         return -EINVAL;
     }
 
-    v = d->vcpu[params->vcpu];
     vpmu = vcpu_vpmu(v);
 
     spin_lock(&vpmu->vpmu_lock);
@@ -832,6 +939,21 @@ static int __init vpmu_init(void)
         return 0;
     }
 
+    if ( vpmu_interrupt_type == APIC_DM_NMI )
+    {
+        if ( reserve_lapic_nmi() != 0 )
+        {
+            printk(XENLOG_WARNING "VPMU: Can't reserve NMI, will use"
+                   " APIC vector 0x%x\n", PMU_APIC_VECTOR);
+            vpmu_interrupt_type = PMU_APIC_VECTOR;
+        }
+        else
+        {
+            set_nmi_callback(pmu_nmi_interrupt);
+            open_softirq(PMU_SOFTIRQ, pmu_softnmi);
+        }
+    }
+
     switch ( vendor )
     {
     case X86_VENDOR_AMD:
@@ -853,7 +975,14 @@ static int __init vpmu_init(void)
         printk(XENLOG_INFO "VPMU: version " __stringify(XENPMU_VER_MAJ) "."
                __stringify(XENPMU_VER_MIN) "\n");
     else
+    {
+        if ( vpmu_interrupt_type == APIC_DM_NMI )
+        {
+            unset_nmi_callback();
+            release_lapic_nmi();
+        }
         vpmu_disabled = 1;
+    }
 
     return 0;
 }
diff --git a/xen/include/asm-x86/hvm/vpmu.h b/xen/include/asm-x86/hvm/vpmu.h
index 2c888cc..ed5dc8c 100644
--- a/xen/include/asm-x86/hvm/vpmu.h
+++ b/xen/include/asm-x86/hvm/vpmu.h
@@ -53,7 +53,7 @@ struct arch_vpmu_ops {
     int (*do_wrmsr)(unsigned int msr, uint64_t msr_content,
                     uint64_t supported);
    int (*do_rdmsr)(unsigned int msr, uint64_t *msr_content);
-    int (*do_interrupt)(struct cpu_user_regs *regs);
+    int (*do_interrupt)(const struct cpu_user_regs *regs);
     void (*do_cpuid)(unsigned int input,
                      unsigned int *eax, unsigned int *ebx,
                      unsigned int *ecx, unsigned int *edx);
@@ -102,7 +102,7 @@ static inline bool_t vpmu_are_all_set(const struct vpmu_struct *vpmu,
 void vpmu_lvtpc_update(uint32_t val);
 int vpmu_do_msr(unsigned int msr, uint64_t *msr_content,
                 uint64_t supported, bool_t is_write);
-void vpmu_do_interrupt(struct cpu_user_regs *regs);
+int vpmu_do_interrupt(const struct cpu_user_regs *regs);
 void vpmu_do_cpuid(unsigned int input,
                    unsigned int *eax, unsigned int *ebx,
                    unsigned int *ecx, unsigned int *edx);
 void vpmu_initialise(struct vcpu *v);
diff --git a/xen/include/asm-x86/softirq.h b/xen/include/asm-x86/softirq.h
index ec787d6..fca110f 100644
--- a/xen/include/asm-x86/softirq.h
+++ b/xen/include/asm-x86/softirq.h
@@ -8,7 +8,8 @@
 #define MACHINE_CHECK_SOFTIRQ (NR_COMMON_SOFTIRQS + 3)
 #define PCI_SERR_SOFTIRQ (NR_COMMON_SOFTIRQS + 4)
 #define HVM_DPCI_SOFTIRQ (NR_COMMON_SOFTIRQS + 5)
-#define NR_ARCH_SOFTIRQS 6
+#define PMU_SOFTIRQ (NR_COMMON_SOFTIRQS + 6)
+#define NR_ARCH_SOFTIRQS 7
 
 bool_t arch_skip_send_event_check(unsigned int cpu);
-- 
1.8.1.4
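To make the deferral described in the commit message easier to follow, below is
a minimal, self-contained sketch of the pattern the patch relies on: an
NMI-context "top half" that only touches NMI-safe state and flags that work is
pending, and a "bottom half" that runs later in normal context and performs the
non-NMI-safe delivery. All names here (pending, nmi_top_half, pmu_bottom_half)
are hypothetical stand-ins for vpmu_do_interrupt(), the per-CPU sampled_vcpu
pointer, PMU_SOFTIRQ and pmu_softnmi(); this is an illustration, not Xen code.

/*
 * Stand-alone sketch of the NMI -> softirq deferral pattern (illustrative
 * only; names are hypothetical).
 */
#include <stdbool.h>
#include <stdio.h>

struct sample {
    int vcpu_id;          /* which vcpu the PMU interrupt hit */
    unsigned long ip;     /* instruction pointer at the time of the sample */
};

static struct sample pending;          /* written by the NMI "top half" */
static volatile bool sample_pending;   /* stands in for the PMU softirq bit */

/* Top half: runs in NMI context, so it may only touch NMI-safe state. */
static void nmi_top_half(int vcpu_id, unsigned long ip)
{
    pending.vcpu_id = vcpu_id;
    pending.ip = ip;
    sample_pending = true;   /* conceptually raise_softirq(PMU_SOFTIRQ) */
}

/* Bottom half: runs later in normal context, may call non-NMI-safe code. */
static void pmu_bottom_half(void)
{
    if ( !sample_pending )
        return;
    sample_pending = false;

    /*
     * Here the real code reads segment registers, pokes the vlapic or sends
     * VIRQ_XENPMU; the sketch just reports the sample.
     */
    printf("deliver sample: vcpu %d, ip %#lx\n", pending.vcpu_id, pending.ip);
}

int main(void)
{
    nmi_top_half(3, 0xffff82d080200000UL);  /* pretend a PMU NMI fired */
    pmu_bottom_half();                      /* later, softirq processing runs */
    return 0;
}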