All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs
@ 2019-10-04 23:53 Mario Smarduch
  2019-10-06 16:11 ` Paolo Bonzini
  2019-10-07 10:27 ` Philippe Mathieu-Daudé
  0 siblings, 2 replies; 5+ messages in thread
From: Mario Smarduch @ 2019-10-04 23:53 UTC (permalink / raw)
  To: mtosatti, armbru, pbonzini; +Cc: qemu-devel, rth, ehabkost, qemu-trivial

In a large VPC environment we want to log memory error occurrences
together with the guest name and the error type. There are a few use cases:


- if the VM crashes on an AR MCE, inform the user about the reason and
  resolve the case
- if the VM hangs, notify the user to reboot and resume processing
- if the VM continues to run, let the user know; he/she may be able to
  correlate it to a VM-internal outage
- Rowhammer attacks - isolate/determine the attacker, possibly
  migrating it off the hypervisor
- In general, track memory errors on a hypervisor over time to determine
  trends

Monitoring our fleet, we have come across quite a few of these and have been
able to take action where before there were no clues to the causes.

When a memory error occurs we get a log entry in the QEMU log:

Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
addr of type BUS_MCEERR_AR injected

With an enterprise logging environment we can then take further actions.

Signed-off-by: Mario Smarduch <msmarduch@digitalocean.com>
---
 target/i386/kvm.c | 27 ++++++++++++++++++++++-----
 util/qemu-error.c | 24 ++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 92069099ab..79ebccc684 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr
paddr, int code)
                        (MCM_ADDR_PHYS << 6) | 0xc, flags);
 }

-static void hardware_memory_error(void)
+static void hardware_memory_error(void *addr)
 {
-    fprintf(stderr, "Hardware memory error!\n");
+    error_report("QEMU got Hardware memory error at addr %p", addr);
     exit(1);
 }

@@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int
code, void *addr)
             kvm_physical_memory_addr_from_host(c->kvm_state, addr,
&paddr)) {
             kvm_hwpoison_page_add(ram_addr);
             kvm_mce_inject(cpu, paddr, code);
+            /*
+             * Use different logging severity based on error type.
+             * If mcelog is running qemu va addr will help debug via
mcelog.
+             */
+            if (code == BUS_MCEERR_AR) {
+                error_report("Guest MCE Memory Error at qemu addr %p and "
+                    "guest %lx addr of type %s injected", addr, paddr,
+                     "BUS_MCEERR_AR");
+            } else {
+                 warn_report("Guest MCE Memory Error at qemu addr %p and "
+                     "guest %lx addr of type %s injected", addr,
+                     paddr, "BUS_MCEERR_AO");
+            }
+
             return;
         }

-        fprintf(stderr, "Hardware memory error for memory used by "
-                "QEMU itself instead of guest system!\n");
+        if (code == BUS_MCEERR_AO) {
+            warn_report("Hardware memory error at addr %p of type %s "
+                "for memory used by QEMU itself instead of guest system!",
+                addr, "BUS_MCEERR_AO");
+        }
     }

     if (code == BUS_MCEERR_AR) {
-        hardware_memory_error();
+        hardware_memory_error(addr);
     }

     /* Hope we are lucky for AO MCE */
diff --git a/util/qemu-error.c b/util/qemu-error.c
index f373f3b3b0..2ebafd4405 100644
--- a/util/qemu-error.c
+++ b/util/qemu-error.c
@@ -11,6 +11,8 @@
  */

 #include "qemu/osdep.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
 #include "monitor/monitor.h"
 #include "qemu/error-report.h"

@@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...)
     return ret;
 }

+static const char *error_get_guestname(void)
+{
+    QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL);
+    return qemu_opt_get(opts, "guest");
+}
+
+/*
+ * Print guest name associated with error, to aid debugging errors from
+ * multiple guests in centralized logging environment.
+ */
+static void error_print_guestname(void)
+{
+    const char *name;
+    name = error_get_guestname();
+    if (name != NULL && !cur_mon) {
+        error_printf("Guest [%s] ", name);
+    }
+}
+
 int error_printf_unless_qmp(const char *fmt, ...)
 {
     va_list ap;
     int ret;

+    error_print_guestname();
     va_start(ap, fmt);
     ret = error_vprintf_unless_qmp(fmt, ap);
     va_end(ap);
@@ -274,6 +296,7 @@ void error_report(const char *fmt, ...)
 {
     va_list ap;

+    error_print_guestname();
     va_start(ap, fmt);
     vreport(REPORT_TYPE_ERROR, fmt, ap);
     va_end(ap);
@@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...)
 {
     va_list ap;

+    error_print_guestname();
     va_start(ap, fmt);
     vreport(REPORT_TYPE_WARNING, fmt, ap);
     va_end(ap);
--
2.17.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs
  2019-10-04 23:53 [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs Mario Smarduch
@ 2019-10-06 16:11 ` Paolo Bonzini
  2019-10-06 22:01   ` Mario Smarduch
  2019-10-07 10:27 ` Philippe Mathieu-Daudé
  1 sibling, 1 reply; 5+ messages in thread
From: Paolo Bonzini @ 2019-10-06 16:11 UTC (permalink / raw)
  To: Mario Smarduch, mtosatti, armbru; +Cc: qemu-trivial, qemu-devel, ehabkost, rth

On 05/10/19 01:53, Mario Smarduch wrote:
> Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
> Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
> addr of type BUS_MCEERR_AR injected
> 
> with enterprise logging environment we can to take further actions.
> 
> Signed-off-by: Mario Smarduch <msmarduch@digitalocean.com>

The guest name part should be a separate patch, controlled by "-msg
name=on" or something like that.  The MCE parts look okay.  Can you
split the patch in two?

Paolo


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs
  2019-10-06 16:11 ` Paolo Bonzini
@ 2019-10-06 22:01   ` Mario Smarduch
  0 siblings, 0 replies; 5+ messages in thread
From: Mario Smarduch @ 2019-10-06 22:01 UTC (permalink / raw)
  To: Paolo Bonzini, mtosatti, armbru; +Cc: qemu-devel, rth, ehabkost, qemu-trivial



On 10/06/2019 09:11 AM, Paolo Bonzini wrote:
> On 05/10/19 01:53, Mario Smarduch wrote:
>> Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
>> Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
>> addr of type BUS_MCEERR_AR injected
>>
>> with enterprise logging environment we can to take further actions.
>>
>> Signed-off-by: Mario Smarduch <msmarduch@digitalocean.com>
> 
> The guest name part should be a separate patch, controlled by "-msg
> name=on" or something like that.  The MCE parts look good okay.  Can you
> split the patch in two?
> 
> Paolo
> 
Yes will do.

thanks.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs
  2019-10-04 23:53 [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs Mario Smarduch
  2019-10-06 16:11 ` Paolo Bonzini
@ 2019-10-07 10:27 ` Philippe Mathieu-Daudé
  2019-10-07 17:52   ` Mario Smarduch
  1 sibling, 1 reply; 5+ messages in thread
From: Philippe Mathieu-Daudé @ 2019-10-07 10:27 UTC (permalink / raw)
  To: Mario Smarduch, mtosatti, armbru, pbonzini
  Cc: qemu-trivial, qemu-devel, ehabkost, rth

Hi Mario,

On 10/5/19 1:53 AM, Mario Smarduch wrote:
> In a large VPC environment we want to log memory error occurrences
> and log them with guest name and type - there are few use cases
> 
> 
> - if VM crashes on AR mce inform the user about the reason and
>    resolve the case
> - if VM hangs notify the user to reboot and resume processing
> - if VM continues to run let the user know, he/she maybe able to
>    correlate to vm internal outage
> - Rawhammer attacks - isolate/determine the attacker possible
>    migrating it off the hypervisor
> - In general track memory errors on a hyperviosr over time to determine
>    trends
> 
> Monitoring our fleet we come across quite a few of these and been
> able to take action where before there were no clues to the causes.
> 
> When memory error occurs we get a log entry in qemu log:
> 
> Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
> Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
> addr of type BUS_MCEERR_AR injected
> 
> with enterprise logging environment we can to take further actions.
> 
> Signed-off-by: Mario Smarduch <msmarduch@digitalocean.com>
> ---
>   target/i386/kvm.c | 27 ++++++++++++++++++++++-----
>   util/qemu-error.c | 24 ++++++++++++++++++++++++
>   2 files changed, 46 insertions(+), 5 deletions(-)
> 
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 92069099ab..79ebccc684 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr
> paddr, int code)
>                          (MCM_ADDR_PHYS << 6) | 0xc, flags);
>   }
> 
> -static void hardware_memory_error(void)
> +static void hardware_memory_error(void *addr)

Maybe rename addr -> host_addr.

>   {
> -    fprintf(stderr, "Hardware memory error!\n");
> +    error_report("QEMU got Hardware memory error at addr %p", addr);
>       exit(1);
>   }
> 
> @@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int
> code, void *addr)
>               kvm_physical_memory_addr_from_host(c->kvm_state, addr,
> &paddr)) {
>               kvm_hwpoison_page_add(ram_addr);
>               kvm_mce_inject(cpu, paddr, code);
> +            /*
> +             * Use different logging severity based on error type.
> +             * If mcelog is running qemu va addr will help debug via
> mcelog.
> +             */
> +            if (code == BUS_MCEERR_AR) {
> +                error_report("Guest MCE Memory Error at qemu addr %p and "
> +                    "guest %lx addr of type %s injected", addr, paddr,

"qemu addr" is not clear IMO, 'addr' is in the host (and is virtual... 
how does this help you?).

For the guest paddr you should use "0x%"HWADDR_PRIx format.

> +                     "BUS_MCEERR_AR");
> +            } else {
> +                 warn_report("Guest MCE Memory Error at qemu addr %p and "
> +                     "guest %lx addr of type %s injected", addr,
> +                     paddr, "BUS_MCEERR_AO");
> +            }
> +
>               return;
>           }
> 
> -        fprintf(stderr, "Hardware memory error for memory used by "
> -                "QEMU itself instead of guest system!\n");
> +        if (code == BUS_MCEERR_AO) {
> +            warn_report("Hardware memory error at addr %p of type %s "
> +                "for memory used by QEMU itself instead of guest system!",
> +                addr, "BUS_MCEERR_AO");
> +        }
>       }
> 
>       if (code == BUS_MCEERR_AR) {
> -        hardware_memory_error();
> +        hardware_memory_error(addr);
>       }
> 
>       /* Hope we are lucky for AO MCE */
> diff --git a/util/qemu-error.c b/util/qemu-error.c
> index f373f3b3b0..2ebafd4405 100644
> --- a/util/qemu-error.c
> +++ b/util/qemu-error.c
> @@ -11,6 +11,8 @@
>    */
> 
>   #include "qemu/osdep.h"
> +#include "qemu/option.h"
> +#include "qemu/config-file.h"
>   #include "monitor/monitor.h"
>   #include "qemu/error-report.h"
> 
> @@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...)
>       return ret;
>   }
> 
> +static const char *error_get_guestname(void)
> +{
> +    QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL);
> +    return qemu_opt_get(opts, "guest");
> +}
> +
> +/*
> + * Print guest name associated with error, to aid debugging errors from
> + * multiple guests in centralized logging environment.
> + */
> +static void error_print_guestname(void)
> +{
> +    const char *name;
> +    name = error_get_guestname();
> +    if (name != NULL && !cur_mon) {
> +        error_printf("Guest [%s] ", name);
> +    }
> +}
> +
>   int error_printf_unless_qmp(const char *fmt, ...)
>   {
>       va_list ap;
>       int ret;
> 
> +    error_print_guestname();
>       va_start(ap, fmt);
>       ret = error_vprintf_unless_qmp(fmt, ap);
>       va_end(ap);
> @@ -274,6 +296,7 @@ void error_report(const char *fmt, ...)
>   {
>       va_list ap;
> 
> +    error_print_guestname();
>       va_start(ap, fmt);
>       vreport(REPORT_TYPE_ERROR, fmt, ap);
>       va_end(ap);
> @@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...)
>   {
>       va_list ap;
> 
> +    error_print_guestname();
>       va_start(ap, fmt);
>       vreport(REPORT_TYPE_WARNING, fmt, ap);
>       va_end(ap);
> --
> 2.17.1
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs
  2019-10-07 10:27 ` Philippe Mathieu-Daudé
@ 2019-10-07 17:52   ` Mario Smarduch
  0 siblings, 0 replies; 5+ messages in thread
From: Mario Smarduch @ 2019-10-07 17:52 UTC (permalink / raw)
  To: Philippe Mathieu-Daudé, mtosatti, armbru, pbonzini
  Cc: qemu-devel, rth, ehabkost, qemu-trivial



On 10/07/2019 03:27 AM, Philippe Mathieu-Daudé wrote:
> Hi Mario,
> 
> On 10/5/19 1:53 AM, Mario Smarduch wrote:
>> In a large VPC environment we want to log memory error occurrences
>> and log them with guest name and type - there are few use cases
>>
>>
>> - if VM crashes on AR mce inform the user about the reason and
>>    resolve the case
>> - if VM hangs notify the user to reboot and resume processing
>> - if VM continues to run let the user know, he/she maybe able to
>>    correlate to vm internal outage
>> - Rawhammer attacks - isolate/determine the attacker possible
>>    migrating it off the hypervisor
>> - In general track memory errors on a hyperviosr over time to determine
>>    trends
>>
>> Monitoring our fleet we come across quite a few of these and been
>> able to take action where before there were no clues to the causes.
>>
>> When memory error occurs we get a log entry in qemu log:
>>
>> Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
>> Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
>> addr of type BUS_MCEERR_AR injected
>>
>> with enterprise logging environment we can to take further actions.
>>
>> Signed-off-by: Mario Smarduch <msmarduch@digitalocean.com>
>> ---
>>   target/i386/kvm.c | 27 ++++++++++++++++++++++-----
>>   util/qemu-error.c | 24 ++++++++++++++++++++++++
>>   2 files changed, 46 insertions(+), 5 deletions(-)
>>
>> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
>> index 92069099ab..79ebccc684 100644
>> --- a/target/i386/kvm.c
>> +++ b/target/i386/kvm.c
>> @@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr
>> paddr, int code)
>>                          (MCM_ADDR_PHYS << 6) | 0xc, flags);
>>   }
>>
>> -static void hardware_memory_error(void)
>> +static void hardware_memory_error(void *addr)
> 
> Maybe rename addr -> host_addr.
yep makes it more clear.

> 
>>   {
>> -    fprintf(stderr, "Hardware memory error!\n");
>> +    error_report("QEMU got Hardware memory error at addr %p", addr);
>>       exit(1);
>>   }
>>
>> @@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int
>> code, void *addr)
>>               kvm_physical_memory_addr_from_host(c->kvm_state, addr,
>> &paddr)) {
>>               kvm_hwpoison_page_add(ram_addr);
>>               kvm_mce_inject(cpu, paddr, code);
>> +            /*
>> +             * Use different logging severity based on error type.
>> +             * If mcelog is running qemu va addr will help debug via
>> mcelog.
>> +             */
>> +            if (code == BUS_MCEERR_AR) {
>> +                error_report("Guest MCE Memory Error at qemu addr %p
>> and "
>> +                    "guest %lx addr of type %s injected", addr, paddr,
> 
> "qemu addr" is not clear IMO, 'addr' is in the host (and is virtual...
> how does this help you?).

Our mcelog entries are logged globally, as are QEMU memory errors. If
log entries from mcelog go missing, we can use the VA from QEMU to
more or less figure out that it's the same memory, provided we have some
prior relationship (based on the timestamps of the entries). For some cases
every bit helps.

> 
> For the guest paddr you should use "0x%"HWADDR_PRIx format.

Yes, I missed that; it should be similar to x86_cpu_dump_state(). A bare
pointer looks bad in QEMU.

> 
>> +                     "BUS_MCEERR_AR");
>> +            } else {
>> +                 warn_report("Guest MCE Memory Error at qemu addr %p
>> and "
>> +                     "guest %lx addr of type %s injected", addr,
>> +                     paddr, "BUS_MCEERR_AO");
>> +            }
>> +
>>               return;
>>           }
>>
>> -        fprintf(stderr, "Hardware memory error for memory used by "
>> -                "QEMU itself instead of guest system!\n");
>> +        if (code == BUS_MCEERR_AO) {
>> +            warn_report("Hardware memory error at addr %p of type %s "
>> +                "for memory used by QEMU itself instead of guest
>> system!",
>> +                addr, "BUS_MCEERR_AO");
>> +        }
>>       }
>>
>>       if (code == BUS_MCEERR_AR) {
>> -        hardware_memory_error();
>> +        hardware_memory_error(addr);
>>       }
>>
>>       /* Hope we are lucky for AO MCE */
>> diff --git a/util/qemu-error.c b/util/qemu-error.c
>> index f373f3b3b0..2ebafd4405 100644
>> --- a/util/qemu-error.c
>> +++ b/util/qemu-error.c
>> @@ -11,6 +11,8 @@
>>    */
>>
>>   #include "qemu/osdep.h"
>> +#include "qemu/option.h"
>> +#include "qemu/config-file.h"
>>   #include "monitor/monitor.h"
>>   #include "qemu/error-report.h"
>>
>> @@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...)
>>       return ret;
>>   }
>>
>> +static const char *error_get_guestname(void)
>> +{
>> +    QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL);
>> +    return qemu_opt_get(opts, "guest");
>> +}
>> +
>> +/*
>> + * Print guest name associated with error, to aid debugging errors from
>> + * multiple guests in centralized logging environment.
>> + */
>> +static void error_print_guestname(void)
>> +{
>> +    const char *name;
>> +    name = error_get_guestname();
>> +    if (name != NULL && !cur_mon) {
>> +        error_printf("Guest [%s] ", name);
>> +    }
>> +}
>> +
>>   int error_printf_unless_qmp(const char *fmt, ...)
>>   {
>>       va_list ap;
>>       int ret;
>>
>> +    error_print_guestname();
>>       va_start(ap, fmt);
>>       ret = error_vprintf_unless_qmp(fmt, ap);
>>       va_end(ap);
>> @@ -274,6 +296,7 @@ void error_report(const char *fmt, ...)
>>   {
>>       va_list ap;
>>
>> +    error_print_guestname();
>>       va_start(ap, fmt);
>>       vreport(REPORT_TYPE_ERROR, fmt, ap);
>>       va_end(ap);
>> @@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...)
>>   {
>>       va_list ap;
>>
>> +    error_print_guestname();
>>       va_start(ap, fmt);
>>       vreport(REPORT_TYPE_WARNING, fmt, ap);
>>       va_end(ap);
>> -- 
>> 2.17.1
>>


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2019-10-07 17:53 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-04 23:53 [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs Mario Smarduch
2019-10-06 16:11 ` Paolo Bonzini
2019-10-06 22:01   ` Mario Smarduch
2019-10-07 10:27 ` Philippe Mathieu-Daudé
2019-10-07 17:52   ` Mario Smarduch

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.