All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do.
@ 2013-09-24 19:56 Andrew Cooper
  2013-09-24 19:56 ` [PATCH 2/2] DO NOT APPLY - debugging code to lock a pcpu in an NMI loop Andrew Cooper
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Andrew Cooper @ 2013-09-24 19:56 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper, Keir Fraser, Jan Beulich, Tim Deegan

Having nmi_shootdown_cpus() report which pcpus failed to be shot down is a
useful debugging hint as to what possibly went wrong (especially when the
crash logs seem to indicate that an NMI timeout occurred while waiting for one
of the problematic pcpus to perform an action).

This is achieved by replacing the atomic_t count of unreported pcpus with a
cpumask.  If the 1 second timeout expires, the cpumask is used to identify
the problematic pcpus.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Keir Fraser <keir@xen.org>
CC: Jan Beulich <JBeulich@suse.com>
CC: Tim Deegan <tim@xen.org>

---

We in XenServer have seen a few crashes like this recently, and having an
extra bit of debugging on the serial console or in the conring is
substantially more helpful than trying to piece the crash together after the
fact based on what information is missing.
---
 xen/arch/x86/crash.c |   20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 0a807d1..5f0f07c 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -22,6 +22,7 @@
 #include <xen/perfc.h>
 #include <xen/kexec.h>
 #include <xen/sched.h>
+#include <xen/keyhandler.h>
 #include <public/xen.h>
 #include <asm/shared.h>
 #include <asm/hvm/support.h>
@@ -30,7 +31,7 @@
 #include <xen/iommu.h>
 #include <asm/hpet.h>
 
-static atomic_t waiting_for_crash_ipi;
+static cpumask_t waiting_to_crash;
 static unsigned int crashing_cpu;
 static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
 
@@ -65,7 +66,7 @@ void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
         __stop_this_cpu();
 
         this_cpu(crash_save_done) = 1;
-        atomic_dec(&waiting_for_crash_ipi);
+        cpumask_clear_cpu(cpu, &waiting_to_crash);
     }
 
     /* Poor mans self_nmi().  __stop_this_cpu() has reverted the LAPIC
@@ -122,7 +123,8 @@ static void nmi_shootdown_cpus(void)
     crashing_cpu = cpu;
     local_irq_count(crashing_cpu) = 0;
 
-    atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+    cpumask_copy(&waiting_to_crash, &cpu_online_map);
+    cpumask_clear_cpu(cpu, &waiting_to_crash);
 
     /* Change NMI trap handlers.  Non-crashing pcpus get nmi_crash which
      * invokes do_nmi_crash (above), which cause them to write state and
@@ -162,12 +164,22 @@ static void nmi_shootdown_cpus(void)
     smp_send_nmi_allbutself();
 
     msecs = 1000; /* Wait at most a second for the other cpus to stop */
-    while ( (atomic_read(&waiting_for_crash_ipi) > 0) && msecs )
+    while ( (cpumask_weight(&waiting_to_crash) > 0) && msecs )
     {
         mdelay(1);
         msecs--;
     }
 
+    /* Leave a hint of how well we did trying to shoot down the other cpus */
+    if ( msecs )
+        printk("Shot down all cpus\n");
+    else
+    {
+        cpulist_scnprintf(keyhandler_scratch, sizeof keyhandler_scratch,
+                          &waiting_to_crash);
+        printk("Failed to shoot down cpus {%s}\n", keyhandler_scratch);
+    }
+
     /* Crash shutdown any IOMMU functionality as the crashdump kernel is not
      * happy when booting if interrupt/dma remapping is still enabled */
     iommu_crash_shutdown();
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/2] DO NOT APPLY - debugging code to lock a pcpu in an NMI loop
  2013-09-24 19:56 [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Andrew Cooper
@ 2013-09-24 19:56 ` Andrew Cooper
  2013-09-25  5:56 ` [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Keir Fraser
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Andrew Cooper @ 2013-09-24 19:56 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper

Should be used in combination with the regular NMI watchdog
---
 xen/arch/x86/crash.c        |   32 ++++++++++++++++++++++++++++++++
 xen/arch/x86/x86_64/entry.S |    8 ++++++++
 2 files changed, 40 insertions(+)

diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 5f0f07c..724fc17 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -207,6 +207,38 @@ void machine_crash_shutdown(void)
         arch_get_pfn_to_mfn_frame_list_list(dom0);
 }
 
+void __attribute__((noreturn)) do_nmi_loop(struct cpu_user_regs *regs)
+{
+    unsigned int cpu = smp_processor_id();
+    printk("In NMI wedge on cpu%d\n  Spinning forever...", cpu);
+    for ( ; ; )
+        halt();
+}
+
+void nmi_loop(void);
+static void nmi_wedge(unsigned char key)
+{
+    unsigned int cpu = smp_processor_id();
+
+    printk("'%c' pressed -> Wedging cpu%d in NMI loop\n", key, cpu);
+
+    _update_gate_addr_lower(&idt_tables[cpu][TRAP_nmi], &nmi_loop);
+}
+
+static struct keyhandler nmi_wedge_keyhandler = {
+    .diagnostic = 1,
+    .u.fn = nmi_wedge,
+    .desc = "Wedge cpu in NMI loop"
+};
+
+static int __init nmi_wedge_key_init(void)
+{
+    printk("Installing NMI wedge keyhandler\n");
+    register_keyhandler('1', &nmi_wedge_keyhandler);
+    return 0;
+}
+__initcall(nmi_wedge_key_init);
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
index f64e871..18b77f4 100644
--- a/xen/arch/x86/x86_64/entry.S
+++ b/xen/arch/x86/x86_64/entry.S
@@ -666,6 +666,14 @@ ENTRY(nmi_crash)
         callq do_nmi_crash /* Does not return */
         ud2
 
+ENTRY(nmi_loop)
+        pushq $0
+        movl $TRAP_nmi,4(%rsp)
+        SAVE_ALL
+        movq %rsp,%rdi
+        callq do_nmi_loop /* Does not return */
+        ud2
+
 ENTRY(machine_check)
         pushq $0
         movl  $TRAP_machine_check,4(%rsp)
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do.
  2013-09-24 19:56 [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Andrew Cooper
  2013-09-24 19:56 ` [PATCH 2/2] DO NOT APPLY - debugging code to lock a pcpu in an NMI loop Andrew Cooper
@ 2013-09-25  5:56 ` Keir Fraser
  2013-09-25  7:35 ` Jan Beulich
  2013-09-25 10:22 ` [Patch v2] " Andrew Cooper
  3 siblings, 0 replies; 6+ messages in thread
From: Keir Fraser @ 2013-09-25  5:56 UTC (permalink / raw)
  To: Andrew Cooper, Xen-devel; +Cc: Tim Deegan, Jan Beulich

On 24/09/2013 20:56, "Andrew Cooper" <andrew.cooper3@citrix.com> wrote:

> Having nmi_shootdown_cpus() report which pcpus failed to be shot down is a
> useful debugging hint as to what possibly went wrong (especially when the
> crash logs seem to indicate that an NMI timeout occurred while waiting for one
> of the problematic pcpus to perform an action).
> 
> This is achieved by swapping an atomic_t count of unreported pcpus with a
> cpumask.  In the case that the 1 second timeout occurs, use the cpumask to
> identify the problematic pcpus.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> CC: Keir Fraser <keir@xen.org>
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Tim Deegan <tim@xen.org>
> 
> ---
> 

> @@ -162,12 +164,22 @@ static void nmi_shootdown_cpus(void)
>      smp_send_nmi_allbutself();
>  
>      msecs = 1000; /* Wait at most a second for the other cpus to stop */
> -    while ( (atomic_read(&waiting_for_crash_ipi) > 0) && msecs )
> +    while ( (cpumask_weight(&waiting_to_crash) > 0) && msecs )
>      {
>          mdelay(1);
>          msecs--;
>      }
>  
> +    /* Leave a hint of how well we did trying to shoot down the other cpus */
> +    if ( msecs )

if (cpumask_empty(&waiting_to_crash))
Would be more obvious I think.

Apart from that
Acked-by: Keir Fraser <keir@xen.org>

> +        printk("Shot down all cpus\n");
> +    else
> +    {
> +        cpulist_scnprintf(keyhandler_scratch, sizeof keyhandler_scratch,
> +                          &waiting_to_crash);
> +        printk("Failed to shoot down cpus {%s}\n", keyhandler_scratch);
> +    }
> +
>      /* Crash shutdown any IOMMU functionality as the crashdump kernel is not
>       * happy when booting if interrupt/dma remapping is still enabled */
>      iommu_crash_shutdown();

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do.
  2013-09-24 19:56 [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Andrew Cooper
  2013-09-24 19:56 ` [PATCH 2/2] DO NOT APPLY - debugging code to lock a pcpu in an NMI loop Andrew Cooper
  2013-09-25  5:56 ` [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Keir Fraser
@ 2013-09-25  7:35 ` Jan Beulich
  2013-09-25 10:22 ` [Patch v2] " Andrew Cooper
  3 siblings, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2013-09-25  7:35 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: xen-devel, Keir Fraser, Tim Deegan

>>> On 24.09.13 at 21:56, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> Having nmi_shootdown_cpus() report which pcpus failed to be shot down is a
> useful debugging hint as to what possibly went wrong (especially when the
> crash logs seem to indicate that an NMI timeout occurred while waiting for 
> one
> of the problematic pcpus to perform an action).
> 
> This is achieved by swapping an atomic_t count of unreported pcpus with a
> cpumask.  In the case that the 1 second timeout occurs, use the cpumask to
> identify the problematic pcpus.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> CC: Keir Fraser <keir@xen.org>
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Tim Deegan <tim@xen.org>
> 
> ---
> 
> We in XenServer have seen a few crashes like this recently, and having an
> extra bit of debugging on the serial console or in the conring is
> substantially more helpful than trying to piece the crash together after-the-
> fact based on what information is missing.
> ---
>  xen/arch/x86/crash.c |   20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
> index 0a807d1..5f0f07c 100644
> --- a/xen/arch/x86/crash.c
> +++ b/xen/arch/x86/crash.c
> @@ -22,6 +22,7 @@
>  #include <xen/perfc.h>
>  #include <xen/kexec.h>
>  #include <xen/sched.h>
> +#include <xen/keyhandler.h>
>  #include <public/xen.h>
>  #include <asm/shared.h>
>  #include <asm/hvm/support.h>
> @@ -30,7 +31,7 @@
>  #include <xen/iommu.h>
>  #include <asm/hpet.h>
>  
> -static atomic_t waiting_for_crash_ipi;
> +static cpumask_t waiting_to_crash;
>  static unsigned int crashing_cpu;
>  static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
>  
> @@ -65,7 +66,7 @@ void __attribute__((noreturn)) do_nmi_crash(struct 
> cpu_user_regs *regs)
>          __stop_this_cpu();
>  
>          this_cpu(crash_save_done) = 1;
> -        atomic_dec(&waiting_for_crash_ipi);
> +        cpumask_clear_cpu(cpu, &waiting_to_crash);
>      }
>  
>      /* Poor mans self_nmi().  __stop_this_cpu() has reverted the LAPIC
> @@ -122,7 +123,8 @@ static void nmi_shootdown_cpus(void)
>      crashing_cpu = cpu;
>      local_irq_count(crashing_cpu) = 0;
>  
> -    atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
> +    cpumask_copy(&waiting_to_crash, &cpu_online_map);
> +    cpumask_clear_cpu(cpu, &waiting_to_crash);

cpumask_andnot(&waiting_to_crash, &cpu_online_map, cpumask_of(cpu));

Jan

>  
>      /* Change NMI trap handlers.  Non-crashing pcpus get nmi_crash which
>       * invokes do_nmi_crash (above), which cause them to write state and
> @@ -162,12 +164,22 @@ static void nmi_shootdown_cpus(void)
>      smp_send_nmi_allbutself();
>  
>      msecs = 1000; /* Wait at most a second for the other cpus to stop */
> -    while ( (atomic_read(&waiting_for_crash_ipi) > 0) && msecs )
> +    while ( (cpumask_weight(&waiting_to_crash) > 0) && msecs )
>      {
>          mdelay(1);
>          msecs--;
>      }
>  
> +    /* Leave a hint of how well we did trying to shoot down the other cpus 
> */
> +    if ( msecs )
> +        printk("Shot down all cpus\n");
> +    else
> +    {
> +        cpulist_scnprintf(keyhandler_scratch, sizeof keyhandler_scratch,
> +                          &waiting_to_crash);
> +        printk("Failed to shoot down cpus {%s}\n", keyhandler_scratch);
> +    }
> +
>      /* Crash shutdown any IOMMU functionality as the crashdump kernel is 
> not
>       * happy when booting if interrupt/dma remapping is still enabled */
>      iommu_crash_shutdown();
> -- 
> 1.7.10.4

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [Patch v2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do.
  2013-09-24 19:56 [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Andrew Cooper
                   ` (2 preceding siblings ...)
  2013-09-25  7:35 ` Jan Beulich
@ 2013-09-25 10:22 ` Andrew Cooper
  2013-09-25 11:41   ` Keir Fraser
  3 siblings, 1 reply; 6+ messages in thread
From: Andrew Cooper @ 2013-09-25 10:22 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper, Keir Fraser, Jan Beulich, Tim Deegan

Having nmi_shootdown_cpus() report which pcpus failed to be shot down is a
useful debugging hint as to what possibly went wrong (especially when the
crash logs seem to indicate that an NMI timeout occurred while waiting for one
of the problematic pcpus to perform an action).

This is achieved by replacing the atomic_t count of unreported pcpus with a
cpumask.  If the 1 second timeout expires, the cpumask is used to identify
the problematic pcpus.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Keir Fraser <keir@xen.org>
CC: Jan Beulich <JBeulich@suse.com>
CC: Tim Deegan <tim@xen.org>

---

Changes in v2:
 * Use cpumask_andnot() in preference to copy() followed by clear_cpu()
 * Use cpumask_empty() in preference to "if ( msecs )"
 * Use !cpumask_empty() in preference to "cpumask_weight(&waiting_to_crash) > 0"

We in XenServer have seen a few crashes like this recently, and having an
extra bit of debugging on the serial console or in the conring is
substantially more helpful than trying to piece the crash together after the
fact based on what information is missing.
---
 xen/arch/x86/crash.c |   19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/xen/arch/x86/crash.c b/xen/arch/x86/crash.c
index 0a807d1..4495451 100644
--- a/xen/arch/x86/crash.c
+++ b/xen/arch/x86/crash.c
@@ -22,6 +22,7 @@
 #include <xen/perfc.h>
 #include <xen/kexec.h>
 #include <xen/sched.h>
+#include <xen/keyhandler.h>
 #include <public/xen.h>
 #include <asm/shared.h>
 #include <asm/hvm/support.h>
@@ -30,7 +31,7 @@
 #include <xen/iommu.h>
 #include <asm/hpet.h>
 
-static atomic_t waiting_for_crash_ipi;
+static cpumask_t waiting_to_crash;
 static unsigned int crashing_cpu;
 static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
 
@@ -65,7 +66,7 @@ void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
         __stop_this_cpu();
 
         this_cpu(crash_save_done) = 1;
-        atomic_dec(&waiting_for_crash_ipi);
+        cpumask_clear_cpu(cpu, &waiting_to_crash);
     }
 
     /* Poor mans self_nmi().  __stop_this_cpu() has reverted the LAPIC
@@ -122,7 +123,7 @@ static void nmi_shootdown_cpus(void)
     crashing_cpu = cpu;
     local_irq_count(crashing_cpu) = 0;
 
-    atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+    cpumask_andnot(&waiting_to_crash, &cpu_online_map, cpumask_of(cpu));
 
     /* Change NMI trap handlers.  Non-crashing pcpus get nmi_crash which
      * invokes do_nmi_crash (above), which cause them to write state and
@@ -162,12 +163,22 @@ static void nmi_shootdown_cpus(void)
     smp_send_nmi_allbutself();
 
     msecs = 1000; /* Wait at most a second for the other cpus to stop */
-    while ( (atomic_read(&waiting_for_crash_ipi) > 0) && msecs )
+    while ( !cpumask_empty(&waiting_to_crash) && msecs )
     {
         mdelay(1);
         msecs--;
     }
 
+    /* Leave a hint of how well we did trying to shoot down the other cpus */
+    if ( cpumask_empty(&waiting_to_crash) )
+        printk("Shot down all cpus\n");
+    else
+    {
+        cpulist_scnprintf(keyhandler_scratch, sizeof keyhandler_scratch,
+                          &waiting_to_crash);
+        printk("Failed to shoot down cpus {%s}\n", keyhandler_scratch);
+    }
+
     /* Crash shutdown any IOMMU functionality as the crashdump kernel is not
      * happy when booting if interrupt/dma remapping is still enabled */
     iommu_crash_shutdown();
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [Patch v2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do.
  2013-09-25 10:22 ` [Patch v2] " Andrew Cooper
@ 2013-09-25 11:41   ` Keir Fraser
  0 siblings, 0 replies; 6+ messages in thread
From: Keir Fraser @ 2013-09-25 11:41 UTC (permalink / raw)
  To: Andrew Cooper, Xen-devel; +Cc: Tim Deegan, Jan Beulich

On 25/09/2013 11:22, "Andrew Cooper" <andrew.cooper3@citrix.com> wrote:

> Having nmi_shootdown_cpus() report which pcpus failed to be shot down is a
> useful debugging hint as to what possibly went wrong (especially when the
> crash logs seem to indicate that an NMI timeout occurred while waiting for one
> of the problematic pcpus to perform an action).
> 
> This is achieved by swapping an atomic_t count of unreported pcpus with a
> cpumask.  In the case that the 1 second timeout occurs, use the cpumask to
> identify the problematic pcpus.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> CC: Keir Fraser <keir@xen.org>
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Tim Deegan <tim@xen.org>

Acked-by: Keir Fraser <keir@xen.org>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2013-09-25 11:41 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-09-24 19:56 [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Andrew Cooper
2013-09-24 19:56 ` [PATCH 2/2] DO NOT APPLY - debugging code to lock a pcpu in an NMI loop Andrew Cooper
2013-09-25  5:56 ` [PATCH 1/2] x86/crash: Indicate how well nmi_shootdown_cpus() managed to do Keir Fraser
2013-09-25  7:35 ` Jan Beulich
2013-09-25 10:22 ` [Patch v2] " Andrew Cooper
2013-09-25 11:41   ` Keir Fraser

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.