From mboxrd@z Thu Jan 1 00:00:00 1970 From: Ross Lagerwall Subject: [PATCH v3] x86/NMI: Allow processing unknown NMIs when watchdog is enabled Date: Wed, 27 Aug 2014 17:25:28 +0100 Message-ID: <1409156728-4995-1-git-send-email-ross.lagerwall@citrix.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Return-path: List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Xen-devel Cc: Ross Lagerwall , Keir Fraser , Jan Beulich List-Id: xen-devel@lists.xenproject.org Change NMI processing so that if watchdog=force is passed on the command-line and the NMI is not caused by a perf counter overflow (i.e. likely not a watchdog "tick"), the NMI is handled by the unknown NMI handler. This allows injection of NMIs from IPMI controllers that don't set the IOCK/SERR bits to trigger the unknown NMI handler rather than be ignored. Signed-off-by: Ross Lagerwall CC: Keir Fraser CC: Jan Beulich --- Changes since V2: Fix processing when watchdog is turned off. Changes since V1: Most of the original behavior is now restored. Now, only if watchdog=force is given on the command-line and the NMI is not a watchdog tick (i.e. the perf counter has not overflowed) will the unknown NMI handler be invoked. docs/misc/xen-command-line.markdown | 6 ++++-- xen/arch/x86/nmi.c | 39 +++++++++++++++++++++++++++++++++++-- xen/arch/x86/traps.c | 7 ++++--- xen/include/asm-x86/apic.h | 2 +- xen/include/asm-x86/nmi.h | 3 +++ 5 files changed, 49 insertions(+), 8 deletions(-) diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown index a8cab59..1cbcca4 100644 --- a/docs/misc/xen-command-line.markdown +++ b/docs/misc/xen-command-line.markdown @@ -1245,12 +1245,14 @@ As the BTS virtualisation is not 100% safe and because of the nehalem quirk don't use the vpmu flag on production systems with Intel cpus! ### watchdog -> `= ` +> `= force | ` > Default: `false` Run an NMI watchdog on each processor. If a processor is stuck for -longer than the **watchdog\_timeout**, a panic occurs. +longer than the **watchdog\_timeout**, a panic occurs. When `force` is +specified, in addition to running an NMI watchdog on each processor, +unknown NMIs will still be processed. ### watchdog\_timeout > `= ` diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index c4427a6..873427f 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -43,7 +43,18 @@ static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks); /* opt_watchdog: If true, run a watchdog NMI on each processor. */ bool_t __initdata opt_watchdog = 0; -boolean_param("watchdog", opt_watchdog); + +/* watchdog_force: If true, process unknown NMIs when running the watchdog. */ +bool_t watchdog_force = 0; + +static void __init parse_watchdog(char * s) +{ + opt_watchdog = !!parse_bool(s); + + if ( !strcmp(s, "force") ) + watchdog_force = 1; +} +custom_param("watchdog", parse_watchdog); /* opt_watchdog_timeout: Number of seconds to wait before panic. */ static unsigned int opt_watchdog_timeout = 5; @@ -82,6 +93,7 @@ int nmi_active; #define K7_EVNTSEL_USR (1 << 16) #define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 #define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING +#define K7_EVENT_WIDTH 32 #define P6_EVNTSEL0_ENABLE (1 << 22) #define P6_EVNTSEL_INT (1 << 20) @@ -89,10 +101,12 @@ int nmi_active; #define P6_EVNTSEL_USR (1 << 16) #define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 #define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c +#define P6_EVENT_WIDTH 32 #define P4_ESCR_EVENT_SELECT(N) ((N)<<25) #define P4_CCCR_OVF_PMI0 (1<<26) #define P4_CCCR_OVF_PMI1 (1<<27) +#define P4_CCCR_OVF (1<<31) #define P4_CCCR_THRESHOLD(N) ((N)<<20) #define P4_CCCR_COMPLEMENT (1<<19) #define P4_CCCR_COMPARE (1<<18) @@ -432,8 +446,10 @@ int __init watchdog_setup(void) return 0; } -void nmi_watchdog_tick(const struct cpu_user_regs *regs) +/* Returns false if this was not a watchdog NMI, true otherwise */ +bool_t nmi_watchdog_tick(const struct cpu_user_regs *regs) { + bool_t watchdog_tick = 1; unsigned int sum = this_cpu(nmi_timer_ticks); if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() ) @@ -459,8 +475,15 @@ void nmi_watchdog_tick(const struct cpu_user_regs *regs) if ( nmi_perfctr_msr ) { + uint64_t msr_content; + + /* Work out if this is a watchdog tick by checking for overflow. */ if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 ) { + rdmsrl(MSR_P4_IQ_CCCR0, msr_content); + if ( !(msr_content & P4_CCCR_OVF) ) + watchdog_tick = 0; + /* * P4 quirks: * - An overflown perfctr will assert its interrupt @@ -473,14 +496,26 @@ void nmi_watchdog_tick(const struct cpu_user_regs *regs) } else if ( nmi_perfctr_msr == MSR_P6_PERFCTR0 ) { + rdmsrl(MSR_P6_PERFCTR0, msr_content); + if ( msr_content & (1ULL << P6_EVENT_WIDTH) ) + watchdog_tick = 0; + /* * Only P6 based Pentium M need to re-unmask the apic vector but * it doesn't hurt other P6 variants. */ apic_write(APIC_LVTPC, APIC_DM_NMI); } + else if ( nmi_perfctr_msr == MSR_K7_PERFCTR0 ) + { + rdmsrl(MSR_K7_PERFCTR0, msr_content); + if ( msr_content & (1ULL << K7_EVENT_WIDTH) ) + watchdog_tick = 0; + } write_watchdog_counter(NULL); } + + return watchdog_tick; } /* diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index 71be2ae..7f5306f 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -3306,14 +3306,15 @@ void do_nmi(const struct cpu_user_regs *regs) { unsigned int cpu = smp_processor_id(); unsigned char reason; + bool_t handle_unknown = 0; ++nmi_count(cpu); if ( nmi_callback(regs, cpu) ) return; - if ( nmi_watchdog ) - nmi_watchdog_tick(regs); + if ( !nmi_watchdog || (!nmi_watchdog_tick(regs) && watchdog_force) ) + handle_unknown = 1; /* Only the BSP gets external NMIs from the system. */ if ( cpu == 0 ) @@ -3323,7 +3324,7 @@ void do_nmi(const struct cpu_user_regs *regs) pci_serr_error(regs); if ( reason & 0x40 ) io_check_error(regs); - if ( !(reason & 0xc0) && !nmi_watchdog ) + if ( !(reason & 0xc0) && handle_unknown ) unknown_nmi_error(regs, reason); } } diff --git a/xen/include/asm-x86/apic.h b/xen/include/asm-x86/apic.h index 5d7623f..6697245 100644 --- a/xen/include/asm-x86/apic.h +++ b/xen/include/asm-x86/apic.h @@ -206,7 +206,7 @@ extern void release_lapic_nmi(void); extern void self_nmi(void); extern void disable_timer_nmi_watchdog(void); extern void enable_timer_nmi_watchdog(void); -extern void nmi_watchdog_tick (const struct cpu_user_regs *regs); +extern bool_t nmi_watchdog_tick (const struct cpu_user_regs *regs); extern int APIC_init_uniprocessor (void); extern void disable_APIC_timer(void); extern void enable_APIC_timer(void); diff --git a/xen/include/asm-x86/nmi.h b/xen/include/asm-x86/nmi.h index 4504fe1..2c92db9 100644 --- a/xen/include/asm-x86/nmi.h +++ b/xen/include/asm-x86/nmi.h @@ -8,6 +8,9 @@ struct cpu_user_regs; /* Watchdog boolean from the command line */ extern bool_t opt_watchdog; + +/* Watchdog force parameter from the command line */ +extern bool_t watchdog_force; typedef int (*nmi_callback_t)(const struct cpu_user_regs *regs, int cpu); -- 1.9.3