All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Fix xen hang on intel westmere-EP
@ 2011-08-22 13:55 Zhang, Yang Z
  2011-08-22 14:19 ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Zhang, Yang Z @ 2011-08-22 13:55 UTC (permalink / raw)
  To: xen-devel
  Cc: 'Keir Fraser (keir.xen@gmail.com)',
	'Jan Beulich (JBeulich@novell.com)'

On Intel westmere-ep(w/ ICH10 chipset), the legacy USB logic will generate SMI consistently. And this will cause system hang under some specified conditions. So we need to disable it during early boot.

Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>

diff -r 0f36c2eec2e1 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Thu Jul 28 15:40:54 2011 +0100
+++ b/xen/arch/x86/Makefile     Mon Aug 22 16:23:16 2011 +0800
@@ -57,6 +57,7 @@
 obj-y += tboot.o
 obj-y += hpet.o
 obj-y += xstate.o
+obj-y += early_quirks.o

 obj-$(crash_debug) += gdbstub.o

diff -r 0f36c2eec2e1 xen/arch/x86/early_quirks.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/early_quirks.c       Mon Aug 22 16:23:16 2011 +0800
@@ -0,0 +1,72 @@
+/*
+ *  arch/x86/early_quirks.c
+ *
+ *  This code used to workaround the chipset bugs.
+ *
+ */
+
+#include <xen/init.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
+#include <asm/io.h>
+
+/*
+ * For intel ich10 chipset, it has bug on deliver legacy usb
+ * SMI to CPU and will cause system hang. So we need to disable
+ * it when booting.
+ */
+void intel_smi_quirk(
+        unsigned int bus, unsigned int dev, unsigned int func) {
+    unsigned int pmbase;
+    unsigned int smi_ctrl_addr, smi_ctrl_reg;
+
+    /*
+     * Smi control register reside at pmbase +30h. We need to find
+     * the pmbase address firstly which located at offset 0x40.
+     */
+    pmbase = pci_conf_read32(bus, dev, func, 0x40);
+
+    smi_ctrl_addr = (pmbase & 0xff80) + 0x30;
+    smi_ctrl_reg = inl(smi_ctrl_addr);
+
+    /*
+     *mask bit 3 and bit 17 to disable legacy usb to generate SMI.
+     */
+    smi_ctrl_reg &= ~0x20008;
+    outl(smi_ctrl_reg, smi_ctrl_addr);
+}
+
+struct chipset {
+    unsigned int vendor;
+    unsigned int device;
+    void (*f)(unsigned int bus, unsigned int dev, unsigned int func); 
+};
+
+static struct chipset early_quirk[] = {
+    {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_ICH10_LPC, intel_smi_quirk},
+    {}
+};
+
+static void check_quirk(unsigned int bus, unsigned int dev, unsigned 
+int func) {
+    unsigned int vendor_id, device_id;
+    int i;
+
+    vendor_id = pci_conf_read16(bus, dev, func, PCI_VENDOR_ID);
+    device_id = pci_conf_read16(bus, dev, func, PCI_DEVICE_ID);
+
+    for (i = 0; early_quirk[i].f != NULL; i++)
+        if (early_quirk[i].vendor == vendor_id &&
+            early_quirk[i].device == device_id)
+                early_quirk[i].f(bus, dev, func); }
+
+void  __init early_quirks(void)
+{
+    unsigned int dev, func;
+
+    for (dev = 0; dev < 32; dev++)
+        for (func = 0; func < 8; func++)
+            check_quirk(0, dev, func);
+}
diff -r 0f36c2eec2e1 xen/arch/x86/setup.c
--- a/xen/arch/x86/setup.c      Thu Jul 28 15:40:54 2011 +0100
+++ b/xen/arch/x86/setup.c      Mon Aug 22 16:23:16 2011 +0800
@@ -1255,6 +1255,8 @@
     if ( opt_nosmp )
         max_cpus = 0;

+    early_quirks();
+
     iommu_setup();    /* setup iommu if available */

     smp_prepare_cpus(max_cpus);
diff -r 0f36c2eec2e1 xen/include/asm-x86/setup.h
--- a/xen/include/asm-x86/setup.h       Thu Jul 28 15:40:54 2011 +0100
+++ b/xen/include/asm-x86/setup.h       Mon Aug 22 16:23:16 2011 +0800
@@ -12,6 +12,7 @@
 void early_cpu_init(void);
 void early_time_init(void);
 void early_page_fault(void);
+void early_quirks(void);

 int intel_cpu_init(void);
 int amd_init_cpu(void);
diff -r 0f36c2eec2e1 xen/include/xen/pci_regs.h
--- a/xen/include/xen/pci_regs.h        Thu Jul 28 15:40:54 2011 +0100
+++ b/xen/include/xen/pci_regs.h        Mon Aug 22 16:23:16 2011 +0800
@@ -22,6 +22,9 @@
 #ifndef LINUX_PCI_REGS_H
 #define LINUX_PCI_REGS_H

+#define PCI_VENDOR_ID_INTEL         0x8086
+#define PCI_DEVICE_ID_ICH10_LPC     0x3a16
+
 /*
  * Under PCI, each device has 256 bytes of configuration address space,
  * of which the first 64 bytes are standardized as follows:

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-22 13:55 [PATCH] Fix xen hang on intel westmere-EP Zhang, Yang Z
@ 2011-08-22 14:19 ` Jan Beulich
  2011-08-23  3:52   ` Zhang, Yang Z
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2011-08-22 14:19 UTC (permalink / raw)
  To: Yang Z Zhang; +Cc: xen-devel, Keir Fraser

>>> On 22.08.11 at 15:55, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> On Intel westmere-ep(w/ ICH10 chipset), the legacy USB logic will generate SMI 
> consistently. And this will cause system hang under some specified 
> conditions. So we need to disable it during early boot.

NAK. See below.

> Signed-off-by: Yang Zhang <yang.z.zhang@intel.com>
> 
> diff -r 0f36c2eec2e1 xen/arch/x86/Makefile
> --- a/xen/arch/x86/Makefile     Thu Jul 28 15:40:54 2011 +0100
> +++ b/xen/arch/x86/Makefile     Mon Aug 22 16:23:16 2011 +0800
> @@ -57,6 +57,7 @@
>  obj-y += tboot.o
>  obj-y += hpet.o
>  obj-y += xstate.o
> +obj-y += early_quirks.o
> 
>  obj-$(crash_debug) += gdbstub.o
> 
> diff -r 0f36c2eec2e1 xen/arch/x86/early_quirks.c
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/x86/early_quirks.c       Mon Aug 22 16:23:16 2011 +0800
> @@ -0,0 +1,72 @@
> +/*
> + *  arch/x86/early_quirks.c
> + *
> + *  This code used to workaround the chipset bugs.
> + *
> + */
> +
> +#include <xen/init.h>
> +#include <xen/pci.h>
> +#include <xen/pci_regs.h>
> +#include <asm/io.h>
> +
> +/*
> + * For intel ich10 chipset, it has bug on deliver legacy usb
> + * SMI to CPU and will cause system hang. So we need to disable
> + * it when booting.
> + */
> +void intel_smi_quirk(
> +        unsigned int bus, unsigned int dev, unsigned int func) {
> +    unsigned int pmbase;
> +    unsigned int smi_ctrl_addr, smi_ctrl_reg;
> +
> +    /*
> +     * Smi control register reside at pmbase +30h. We need to find
> +     * the pmbase address firstly which located at offset 0x40.
> +     */
> +    pmbase = pci_conf_read32(bus, dev, func, 0x40);
> +
> +    smi_ctrl_addr = (pmbase & 0xff80) + 0x30;
> +    smi_ctrl_reg = inl(smi_ctrl_addr);
> +
> +    /*
> +     *mask bit 3 and bit 17 to disable legacy usb to generate SMI.
> +     */
> +    smi_ctrl_reg &= ~0x20008;
> +    outl(smi_ctrl_reg, smi_ctrl_addr);
> +}
> +
> +struct chipset {
> +    unsigned int vendor;
> +    unsigned int device;
> +    void (*f)(unsigned int bus, unsigned int dev, unsigned int func); 
> +};
> +
> +static struct chipset early_quirk[] = {
> +    {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_ICH10_LPC, intel_smi_quirk},

I think we discussed (off-list) quite extensively that this should not be
keyed to the PCI device IDs. It ought to hang off the BIOS and/or
board manufacturer. I actually have a patch that does just so, but
am waiting for possibly more fine grained identification information
from you (Intel).

Also, are you certain this problem exists only with this single ICH
variant?

> +    {}
> +};
> +
> +static void check_quirk(unsigned int bus, unsigned int dev, unsigned 
> +int func) {
> +    unsigned int vendor_id, device_id;
> +    int i;
> +
> +    vendor_id = pci_conf_read16(bus, dev, func, PCI_VENDOR_ID);
> +    device_id = pci_conf_read16(bus, dev, func, PCI_DEVICE_ID);
> +
> +    for (i = 0; early_quirk[i].f != NULL; i++)
> +        if (early_quirk[i].vendor == vendor_id &&
> +            early_quirk[i].device == device_id)
> +                early_quirk[i].f(bus, dev, func); }
> +
> +void  __init early_quirks(void)
> +{
> +    unsigned int dev, func;
> +
> +    for (dev = 0; dev < 32; dev++)
> +        for (func = 0; func < 8; func++)
> +            check_quirk(0, dev, func);

Further I'm opposed to introducing further instances of legacy brute-
force PCI bus scans.

And I don't think you got something along these lines accepted into
Linux, did you? It ought to be DMI based there, too.

Jan

> +}
> diff -r 0f36c2eec2e1 xen/arch/x86/setup.c
> --- a/xen/arch/x86/setup.c      Thu Jul 28 15:40:54 2011 +0100
> +++ b/xen/arch/x86/setup.c      Mon Aug 22 16:23:16 2011 +0800
> @@ -1255,6 +1255,8 @@
>      if ( opt_nosmp )
>          max_cpus = 0;
> 
> +    early_quirks();
> +
>      iommu_setup();    /* setup iommu if available */
> 
>      smp_prepare_cpus(max_cpus);
> diff -r 0f36c2eec2e1 xen/include/asm-x86/setup.h
> --- a/xen/include/asm-x86/setup.h       Thu Jul 28 15:40:54 2011 +0100
> +++ b/xen/include/asm-x86/setup.h       Mon Aug 22 16:23:16 2011 +0800
> @@ -12,6 +12,7 @@
>  void early_cpu_init(void);
>  void early_time_init(void);
>  void early_page_fault(void);
> +void early_quirks(void);
> 
>  int intel_cpu_init(void);
>  int amd_init_cpu(void);
> diff -r 0f36c2eec2e1 xen/include/xen/pci_regs.h
> --- a/xen/include/xen/pci_regs.h        Thu Jul 28 15:40:54 2011 +0100
> +++ b/xen/include/xen/pci_regs.h        Mon Aug 22 16:23:16 2011 +0800
> @@ -22,6 +22,9 @@
>  #ifndef LINUX_PCI_REGS_H
>  #define LINUX_PCI_REGS_H
> 
> +#define PCI_VENDOR_ID_INTEL         0x8086
> +#define PCI_DEVICE_ID_ICH10_LPC     0x3a16
> +
>  /*
>   * Under PCI, each device has 256 bytes of configuration address space,
>   * of which the first 64 bytes are standardized as follows:

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-22 14:19 ` Jan Beulich
@ 2011-08-23  3:52   ` Zhang, Yang Z
  2011-08-23  7:32     ` Keir Fraser
  2011-08-23  8:01     ` Jan Beulich
  0 siblings, 2 replies; 10+ messages in thread
From: Zhang, Yang Z @ 2011-08-23  3:52 UTC (permalink / raw)
  To: Jan Beulich; +Cc: Keir, xen-devel, Fraser

> From: Jan Beulich [mailto:JBeulich@novell.com]
> Sent: Monday, August 22, 2011 10:20 PM
> 
> I think we discussed (off-list) quite extensively that this should not be keyed to
> the PCI device IDs. It ought to hang off the BIOS and/or board manufacturer. I
> actually have a patch that does just so, but am waiting for possibly more fine
> grained identification information from you (Intel).
> 
> Also, are you certain this problem exists only with this single ICH variant?
So far, we only observed it on ICH10 based chipset. Did you see another platform have the problems?

> > +    {}
> > +};
> > +
> > +static void check_quirk(unsigned int bus, unsigned int dev, unsigned
> > +int func) {
> > +    unsigned int vendor_id, device_id;
> > +    int i;
> > +
> > +    vendor_id = pci_conf_read16(bus, dev, func, PCI_VENDOR_ID);
> > +    device_id = pci_conf_read16(bus, dev, func, PCI_DEVICE_ID);
> > +
> > +    for (i = 0; early_quirk[i].f != NULL; i++)
> > +        if (early_quirk[i].vendor == vendor_id &&
> > +            early_quirk[i].device == device_id)
> > +                early_quirk[i].f(bus, dev, func); }
> > +
> > +void  __init early_quirks(void)
> > +{
> > +    unsigned int dev, func;
> > +
> > +    for (dev = 0; dev < 32; dev++)
> > +        for (func = 0; func < 8; func++)
> > +            check_quirk(0, dev, func);
> 
> Further I'm opposed to introducing further instances of legacy brute- force PCI
> bus scans.
> 
> And I don't think you got something along these lines accepted into Linux, did
> you? It ought to be DMI based there, too.
I don't know why you think using DMI is a better way? For BDF based way, we only need to know the device ID. But for DMI base way, I don't know which condition should be matched.

Actually, the best way to solve it is to enable the ACPI mode in Xen instead of in dom0. For enable ACPI, we need to write the value from FADT.ACPI_ENABLE to SMI_CMD. After writing the value, the SMI ownership will be disable by ACPI hardware and it also will disable some logic which is able to cause SMI. For example, the legacy USB circuit will be masked too. Because at this point, there have no need to use legacy usb emulation. This is also what linux upstream did. But I think it is too complicated to port this logic to xen. Anyway, if you have interesting, you can add this logic to xen and there have no need for this patch again.



best regards
yang

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-23  3:52   ` Zhang, Yang Z
@ 2011-08-23  7:32     ` Keir Fraser
  2011-08-23  7:47       ` Tian, Kevin
  2011-08-23  8:01     ` Jan Beulich
  1 sibling, 1 reply; 10+ messages in thread
From: Keir Fraser @ 2011-08-23  7:32 UTC (permalink / raw)
  To: Zhang, Yang Z, Jan Beulich; +Cc: xen-devel

On 23/08/2011 04:52, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:

>> Further I'm opposed to introducing further instances of legacy brute- force
>> PCI
>> bus scans.
>> 
>> And I don't think you got something along these lines accepted into Linux,
>> did
>> you? It ought to be DMI based there, too.
> I don't know why you think using DMI is a better way? For BDF based way, we
> only need to know the device ID. But for DMI base way, I don't know which
> condition should be matched.
> 
> Actually, the best way to solve it is to enable the ACPI mode in Xen instead
> of in dom0. For enable ACPI, we need to write the value from FADT.ACPI_ENABLE
> to SMI_CMD. After writing the value, the SMI ownership will be disable by ACPI
> hardware and it also will disable some logic which is able to cause SMI. For
> example, the legacy USB circuit will be masked too. Because at this point,
> there have no need to use legacy usb emulation. This is also what linux
> upstream did. But I think it is too complicated to port this logic to xen.
> Anyway, if you have interesting, you can add this logic to xen and there have
> no need for this patch again.

It sounds like quite a good idea, and not very complicated at all. The main
concern would be potential other fall out from making the change.

 -- Keir

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-23  7:32     ` Keir Fraser
@ 2011-08-23  7:47       ` Tian, Kevin
  2011-08-23  8:04         ` Keir Fraser
  0 siblings, 1 reply; 10+ messages in thread
From: Tian, Kevin @ 2011-08-23  7:47 UTC (permalink / raw)
  To: Keir Fraser, Zhang, Yang Z, Jan Beulich; +Cc: xen-devel

> From: Keir Fraser
> Sent: Tuesday, August 23, 2011 3:32 PM
> 
> On 23/08/2011 04:52, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> 
> >> Further I'm opposed to introducing further instances of legacy brute- force
> >> PCI
> >> bus scans.
> >>
> >> And I don't think you got something along these lines accepted into Linux,
> >> did
> >> you? It ought to be DMI based there, too.
> > I don't know why you think using DMI is a better way? For BDF based way, we
> > only need to know the device ID. But for DMI base way, I don't know which
> > condition should be matched.
> >
> > Actually, the best way to solve it is to enable the ACPI mode in Xen instead
> > of in dom0. For enable ACPI, we need to write the value from
> FADT.ACPI_ENABLE
> > to SMI_CMD. After writing the value, the SMI ownership will be disable by
> ACPI
> > hardware and it also will disable some logic which is able to cause SMI. For
> > example, the legacy USB circuit will be masked too. Because at this point,
> > there have no need to use legacy usb emulation. This is also what linux
> > upstream did. But I think it is too complicated to port this logic to xen.
> > Anyway, if you have interesting, you can add this logic to xen and there have
> > no need for this patch again.
> 
> It sounds like quite a good idea, and not very complicated at all. The main
> concern would be potential other fall out from making the change.
> 

This is indeed tricky though. This ACPI mode enable basically tells that Xen is
ready to own ACPI hardware registers through ACPI SCI mode, instead of
legacy SMI mode. However SCI itself requires some encoded ACPI information, 
such as routing and overlapping info, which depends on dom0. So Xen is not
ready to handle SCI before dom0 boots.

On the other hand, Linux does this ACPI mode enable late until APs are
ready to boot. Before that point there're some extra preparations completed
already, such as DMI system check (which may disable ACPI), root namespace
initialization, etc. I'm not sure whether those extra preparations can be all
moved down to Xen (some of them access encoded content), or can be safely
skipped in Xen (e.g. DMI check...)

Thanks
Kevin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-23  3:52   ` Zhang, Yang Z
  2011-08-23  7:32     ` Keir Fraser
@ 2011-08-23  8:01     ` Jan Beulich
  2011-08-24  2:16       ` Zhang, Yang Z
  1 sibling, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2011-08-23  8:01 UTC (permalink / raw)
  To: Yang Z Zhang; +Cc: xen-devel, KeirFraser

>>> On 23.08.11 at 05:52, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>>  From: Jan Beulich [mailto:JBeulich@novell.com] 
>> Sent: Monday, August 22, 2011 10:20 PM
>> 
>> I think we discussed (off-list) quite extensively that this should not be 
> keyed to
>> the PCI device IDs. It ought to hang off the BIOS and/or board manufacturer. 
> I
>> actually have a patch that does just so, but am waiting for possibly more 
> fine
>> grained identification information from you (Intel).
>> 
>> Also, are you certain this problem exists only with this single ICH variant?
> So far, we only observed it on ICH10 based chipset. Did you see another 
> platform have the problems?

No, I haven't observed it on other platforms but
- the problematic bits exist in earlier ICH versions too, and
- there are ICH10 based systems that aren't showing any such bad
  behavior.

>> > +    {}
>> > +};
>> > +
>> > +static void check_quirk(unsigned int bus, unsigned int dev, unsigned
>> > +int func) {
>> > +    unsigned int vendor_id, device_id;
>> > +    int i;
>> > +
>> > +    vendor_id = pci_conf_read16(bus, dev, func, PCI_VENDOR_ID);
>> > +    device_id = pci_conf_read16(bus, dev, func, PCI_DEVICE_ID);
>> > +
>> > +    for (i = 0; early_quirk[i].f != NULL; i++)
>> > +        if (early_quirk[i].vendor == vendor_id &&
>> > +            early_quirk[i].device == device_id)
>> > +                early_quirk[i].f(bus, dev, func); }
>> > +
>> > +void  __init early_quirks(void)
>> > +{
>> > +    unsigned int dev, func;
>> > +
>> > +    for (dev = 0; dev < 32; dev++)
>> > +        for (func = 0; func < 8; func++)
>> > +            check_quirk(0, dev, func);
>> 
>> Further I'm opposed to introducing further instances of legacy brute- force 
> PCI
>> bus scans.
>> 
>> And I don't think you got something along these lines accepted into Linux, 
> did
>> you? It ought to be DMI based there, too.
> I don't know why you think using DMI is a better way? For BDF based way, we 
> only need to know the device ID. But for DMI base way, I don't know which 
> condition should be matched.

The keys I'm currently using (successfully tested on customer's and my
affected machines) are BIOS and board vendor being "Intel". Only on
those I'm then looking for the ICH10 (and then all known variants). See
below for the patch (still containing some debugging bits).

> Actually, the best way to solve it is to enable the ACPI mode in Xen instead 
> of in dom0. For enable ACPI, we need to write the value from FADT.ACPI_ENABLE 
> to SMI_CMD. After writing the value, the SMI ownership will be disable by 
> ACPI hardware and it also will disable some logic which is able to cause SMI. 
> For example, the legacy USB circuit will be masked too. Because at this 
> point, there have no need to use legacy usb emulation. This is also what 
> linux upstream did. But I think it is too complicated to port this logic to 
> xen. Anyway, if you have interesting, you can add this logic to xen and there 
> have no need for this patch again.

Was this a pretty recent change in Linux? Otherwise, how do you
explain that, with apparently lower probability, I'm observing the very
same hang with native Linux? My assumption is that ACPI mode
enabling is happening just too late for masking this problem.

Jan

--- a/xen/arch/x86/dmi_scan.c
+++ b/xen/arch/x86/dmi_scan.c
@@ -10,6 +10,8 @@
 #include <asm/system.h>
 #include <xen/dmi.h>
 #include <xen/efi.h>
+#include <xen/pci.h>
+#include <xen/pci_regs.h>
 
 #define bt_ioremap(b,l)  ((void *)__acpi_map_table(b,l))
 #define bt_iounmap(b,l)  ((void)0)
@@ -278,6 +280,31 @@ static __init int broken_toshiba_keyboar
 	return 0;
 }
 
+static int __init ich10_bios_quirk(struct dmi_system_id *d)
+{
+    u32 port, smictl;
+
+    if ( pci_conf_read16(0, 0x1f, 0, PCI_VENDOR_ID) != 0x8086 )
+        return 0;
+
+    switch ( pci_conf_read16(0, 0x1f, 0, PCI_DEVICE_ID) ) {
+    case 0x3a14:
+    case 0x3a16:
+    case 0x3a18:
+    case 0x3a1a:
+printk("ACPI base=%04x\n", port = pci_conf_read16(0, 0x1f, 0, 0x40));//temp
+        port = (port & 0xff80) + 0x30;
+        smictl = inl(port);
+printk("smictl=%08x\n", smictl);//temp
+        /* turn off LEGACY_USB{,2}_EN if enabled */
+        if ( smictl & 0x20008 )
+            outl(smictl & ~0x20008, port);
+printk("smictl:%08x\n", inl(port));//temp
+        break;
+    }
+
+    return 0;
+}
 
 #ifdef CONFIG_ACPI_SLEEP
 static __init int reset_videomode_after_s3(struct dmi_blacklist *d)
@@ -342,6 +369,18 @@ static __initdata struct dmi_blacklist d
 			} },
 #endif
 
+	{ ich10_bios_quirk, "Intel board & BIOS",
+		/*
+		 * BIOS leaves legacy USB emulation enabled while
+		 * SMM can't properly handle it.
+		 */
+		{
+			MATCH(DMI_BOARD_VENDOR, "Intel Corp"),
+			MATCH(DMI_BIOS_VENDOR, "Intel Corp"),
+			NO_MATCH, NO_MATCH
+		}
+	},
+
 #ifdef	CONFIG_ACPI_BOOT
 	/*
 	 * If your system is blacklisted here, but you find that acpi=force

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-23  7:47       ` Tian, Kevin
@ 2011-08-23  8:04         ` Keir Fraser
  0 siblings, 0 replies; 10+ messages in thread
From: Keir Fraser @ 2011-08-23  8:04 UTC (permalink / raw)
  To: Tian, Kevin, Zhang, Yang Z, Jan Beulich; +Cc: xen-devel

On 23/08/2011 08:47, "Tian, Kevin" <kevin.tian@intel.com> wrote:

>> From: Keir Fraser
>> Sent: Tuesday, August 23, 2011 3:32 PM
>> 
>> On 23/08/2011 04:52, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>> 
>>>> Further I'm opposed to introducing further instances of legacy brute- force
>>>> PCI
>>>> bus scans.
>>>> 
>>>> And I don't think you got something along these lines accepted into Linux,
>>>> did
>>>> you? It ought to be DMI based there, too.
>>> I don't know why you think using DMI is a better way? For BDF based way, we
>>> only need to know the device ID. But for DMI base way, I don't know which
>>> condition should be matched.
>>> 
>>> Actually, the best way to solve it is to enable the ACPI mode in Xen instead
>>> of in dom0. For enable ACPI, we need to write the value from
>> FADT.ACPI_ENABLE
>>> to SMI_CMD. After writing the value, the SMI ownership will be disable by
>> ACPI
>>> hardware and it also will disable some logic which is able to cause SMI. For
>>> example, the legacy USB circuit will be masked too. Because at this point,
>>> there have no need to use legacy usb emulation. This is also what linux
>>> upstream did. But I think it is too complicated to port this logic to xen.
>>> Anyway, if you have interesting, you can add this logic to xen and there
>>> have
>>> no need for this patch again.
>> 
>> It sounds like quite a good idea, and not very complicated at all. The main
>> concern would be potential other fall out from making the change.
>> 
> 
> This is indeed tricky though. This ACPI mode enable basically tells that Xen
> is
> ready to own ACPI hardware registers through ACPI SCI mode, instead of
> legacy SMI mode. However SCI itself requires some encoded ACPI information,
> such as routing and overlapping info, which depends on dom0. So Xen is not
> ready to handle SCI before dom0 boots.
> 
> On the other hand, Linux does this ACPI mode enable late until APs are
> ready to boot. Before that point there're some extra preparations completed
> already, such as DMI system check (which may disable ACPI), root namespace
> initialization, etc. I'm not sure whether those extra preparations can be all
> moved down to Xen (some of them access encoded content), or can be safely
> skipped in Xen (e.g. DMI check...)

That's a bit of a shame. Yes, sounds like quite a lot of pain.

 -- Keir

> Thanks
> Kevin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-23  8:01     ` Jan Beulich
@ 2011-08-24  2:16       ` Zhang, Yang Z
  2011-08-24  7:28         ` Jan Beulich
  0 siblings, 1 reply; 10+ messages in thread
From: Zhang, Yang Z @ 2011-08-24  2:16 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, KeirFraser

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@novell.com]
> Sent: Tuesday, August 23, 2011 4:02 PM
> >> Also, are you certain this problem exists only with this single ICH variant?
> > So far, we only observed it on ICH10 based chipset. Did you see
> > another platform have the problems?
> 
> No, I haven't observed it on other platforms but
> - the problematic bits exist in earlier ICH versions too, and
> - there are ICH10 based systems that aren't showing any such bad
>   behavior.

It depends on the BIOS. Some already disable the legacy USB emulation and obviously you cannot see this problem on those platforms.

> 
> The keys I'm currently using (successfully tested on customer's and my affected
> machines) are BIOS and board vendor being "Intel". Only on those I'm then
> looking for the ICH10 (and then all known variants). See below for the patch
> (still containing some debugging bits).
> 
> > Actually, the best way to solve it is to enable the ACPI mode in Xen
> > instead of in dom0. For enable ACPI, we need to write the value from
> > FADT.ACPI_ENABLE to SMI_CMD. After writing the value, the SMI
> > ownership will be disable by ACPI hardware and it also will disable some logic
> which is able to cause SMI.
> > For example, the legacy USB circuit will be masked too. Because at
> > this point, there have no need to use legacy usb emulation. This is
> > also what linux upstream did. But I think it is too complicated to
> > port this logic to xen. Anyway, if you have interesting, you can add
> > this logic to xen and there have no need for this patch again.
> 
> Was this a pretty recent change in Linux? Otherwise, how do you explain that,
> with apparently lower probability, I'm observing the very same hang with native
> Linux? My assumption is that ACPI mode enabling is happening just too late for
> masking this problem.

Our bios team and me also had a try with native linux. But we cannot reproduce it. So this maybe another potential issue which different from what we are discussing now.


> --- a/xen/arch/x86/dmi_scan.c
> +++ b/xen/arch/x86/dmi_scan.c
> @@ -10,6 +10,8 @@
>  #include <asm/system.h>
>  #include <xen/dmi.h>
>  #include <xen/efi.h>
> +#include <xen/pci.h>
> +#include <xen/pci_regs.h>
> 
>  #define bt_ioremap(b,l)  ((void *)__acpi_map_table(b,l))  #define
> bt_iounmap(b,l)  ((void)0) @@ -278,6 +280,31 @@ static __init int
> broken_toshiba_keyboar
>  	return 0;
>  }
> 
> +static int __init ich10_bios_quirk(struct dmi_system_id *d) {
> +    u32 port, smictl;
> +
> +    if ( pci_conf_read16(0, 0x1f, 0, PCI_VENDOR_ID) != 0x8086 )
> +        return 0;
> +
> +    switch ( pci_conf_read16(0, 0x1f, 0, PCI_DEVICE_ID) ) {
> +    case 0x3a14:
> +    case 0x3a16:
> +    case 0x3a18:
> +    case 0x3a1a:
> +printk("ACPI base=%04x\n", port = pci_conf_read16(0, 0x1f, 0, 0x40));//temp
> +        port = (port & 0xff80) + 0x30;
> +        smictl = inl(port);
> +printk("smictl=%08x\n", smictl);//temp
> +        /* turn off LEGACY_USB{,2}_EN if enabled */
> +        if ( smictl & 0x20008 )
> +            outl(smictl & ~0x20008, port); printk("smictl:%08x\n",
> +inl(port));//temp
> +        break;
> +    }
> +
> +    return 0;
> +}
> 
>  #ifdef CONFIG_ACPI_SLEEP
>  static __init int reset_videomode_after_s3(struct dmi_blacklist *d) @@
> -342,6 +369,18 @@ static __initdata struct dmi_blacklist d
>  			} },
>  #endif
> 
> +	{ ich10_bios_quirk, "Intel board & BIOS",
> +		/*
> +		 * BIOS leaves legacy USB emulation enabled while
> +		 * SMM can't properly handle it.
> +		 */
> +		{
> +			MATCH(DMI_BOARD_VENDOR, "Intel Corp"),
> +			MATCH(DMI_BIOS_VENDOR, "Intel Corp"),
> +			NO_MATCH, NO_MATCH
> +		}
> +	},
> +
>  #ifdef	CONFIG_ACPI_BOOT
>  	/*
>  	 * If your system is blacklisted here, but you find that acpi=force

This patch is better. But do we really need to disable it on other ICH? If you do seen this issue with other ICH, then you can do it. Otherwise, there may cause some potential issue. Have you test your patch on those platform?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-24  2:16       ` Zhang, Yang Z
@ 2011-08-24  7:28         ` Jan Beulich
  2011-08-24  8:02           ` Zhang, Yang Z
  0 siblings, 1 reply; 10+ messages in thread
From: Jan Beulich @ 2011-08-24  7:28 UTC (permalink / raw)
  To: Yang Z Zhang; +Cc: xen-devel, KeirFraser

>>> On 24.08.11 at 04:16, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
>>  -----Original Message-----
>> From: Jan Beulich [mailto:JBeulich@novell.com] 
>> Sent: Tuesday, August 23, 2011 4:02 PM
>> >> Also, are you certain this problem exists only with this single ICH 
> variant?
>> > So far, we only observed it on ICH10 based chipset. Did you see
>> > another platform have the problems?
>> 
>> No, I haven't observed it on other platforms but
>> - the problematic bits exist in earlier ICH versions too, and
>> - there are ICH10 based systems that aren't showing any such bad
>>   behavior.
> 
> It depends on the BIOS. Some already disable the legacy USB emulation and 
> obviously you cannot see this problem on those platforms.

With the help of the debug printk()-s in the patch I was able to see that
on at least one unaffected platform, at least one of the USB emulation
bits was set nevertheless.

>> 
>> The keys I'm currently using (successfully tested on customer's and my 
> affected
>> machines) are BIOS and board vendor being "Intel". Only on those I'm then
>> looking for the ICH10 (and then all known variants). See below for the patch
>> (still containing some debugging bits).
>> 
>> > Actually, the best way to solve it is to enable the ACPI mode in Xen
>> > instead of in dom0. For enable ACPI, we need to write the value from
>> > FADT.ACPI_ENABLE to SMI_CMD. After writing the value, the SMI
>> > ownership will be disable by ACPI hardware and it also will disable some 
> logic
>> which is able to cause SMI.
>> > For example, the legacy USB circuit will be masked too. Because at
>> > this point, there have no need to use legacy usb emulation. This is
>> > also what linux upstream did. But I think it is too complicated to
>> > port this logic to xen. Anyway, if you have interesting, you can add
>> > this logic to xen and there have no need for this patch again.
>> 
>> Was this a pretty recent change in Linux? Otherwise, how do you explain 
> that,
>> with apparently lower probability, I'm observing the very same hang with 
> native
>> Linux? My assumption is that ACPI mode enabling is happening just too late 
> for
>> masking this problem.
> 
> Our bios team and me also had a try with native linux. But we cannot 
> reproduce it. So this maybe another potential issue which different from what 
> we are discussing now.

Pretty unlikely. Remember that for months you (as a company) claimed
not to be able to reproduce the issue on Xen. As it's even more
sporadic on native Linux, I'm not really surprised there's a problem
reproducing it there.

>> --- a/xen/arch/x86/dmi_scan.c
>> +++ b/xen/arch/x86/dmi_scan.c
>> @@ -10,6 +10,8 @@
>>  #include <asm/system.h>
>>  #include <xen/dmi.h>
>>  #include <xen/efi.h>
>> +#include <xen/pci.h>
>> +#include <xen/pci_regs.h>
>> 
>>  #define bt_ioremap(b,l)  ((void *)__acpi_map_table(b,l))  #define
>> bt_iounmap(b,l)  ((void)0) @@ -278,6 +280,31 @@ static __init int
>> broken_toshiba_keyboar
>>  	return 0;
>>  }
>> 
>> +static int __init ich10_bios_quirk(struct dmi_system_id *d) {
>> +    u32 port, smictl;
>> +
>> +    if ( pci_conf_read16(0, 0x1f, 0, PCI_VENDOR_ID) != 0x8086 )
>> +        return 0;
>> +
>> +    switch ( pci_conf_read16(0, 0x1f, 0, PCI_DEVICE_ID) ) {
>> +    case 0x3a14:
>> +    case 0x3a16:
>> +    case 0x3a18:
>> +    case 0x3a1a:
>> +printk("ACPI base=%04x\n", port = pci_conf_read16(0, 0x1f, 0, 0x40));//temp
>> +        port = (port & 0xff80) + 0x30;
>> +        smictl = inl(port);
>> +printk("smictl=%08x\n", smictl);//temp
>> +        /* turn off LEGACY_USB{,2}_EN if enabled */
>> +        if ( smictl & 0x20008 )
>> +            outl(smictl & ~0x20008, port); printk("smictl:%08x\n",
>> +inl(port));//temp
>> +        break;
>> +    }
>> +
>> +    return 0;
>> +}
>> 
>>  #ifdef CONFIG_ACPI_SLEEP
>>  static __init int reset_videomode_after_s3(struct dmi_blacklist *d) @@
>> -342,6 +369,18 @@ static __initdata struct dmi_blacklist d
>>  			} },
>>  #endif
>> 
>> +	{ ich10_bios_quirk, "Intel board & BIOS",
>> +		/*
>> +		 * BIOS leaves legacy USB emulation enabled while
>> +		 * SMM can't properly handle it.
>> +		 */
>> +		{
>> +			MATCH(DMI_BOARD_VENDOR, "Intel Corp"),
>> +			MATCH(DMI_BIOS_VENDOR, "Intel Corp"),
>> +			NO_MATCH, NO_MATCH
>> +		}
>> +	},
>> +
>>  #ifdef	CONFIG_ACPI_BOOT
>>  	/*
>>  	 * If your system is blacklisted here, but you find that acpi=force
> 
> This patch is better. But do we really need to disable it on other ICH? If 
> you do seen this issue with other ICH, then you can do it. Otherwise, there 
> may cause some potential issue. Have you test your patch on those platform?

Obviously not, given the very limited set of systems on which we
observe the problem. But I really just listed the various ICH10 PCI IDs
- are you saying that the problematic BIOSes is reasonably certain to
have got used with only one of them? After all, not affecting other
platforms is what is being aimed at with the DMI match strings.

Jan

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH] Fix xen hang on intel westmere-EP
  2011-08-24  7:28         ` Jan Beulich
@ 2011-08-24  8:02           ` Zhang, Yang Z
  0 siblings, 0 replies; 10+ messages in thread
From: Zhang, Yang Z @ 2011-08-24  8:02 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel, KeirFraser

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@novell.com]
> Sent: Wednesday, August 24, 2011 3:28 PM
> >>> On 24.08.11 at 04:16, "Zhang, Yang Z" <yang.z.zhang@intel.com> wrote:
> >>  -----Original Message-----
> >> From: Jan Beulich [mailto:JBeulich@novell.com]
> >> Sent: Tuesday, August 23, 2011 4:02 PM
> >> >> Also, are you certain this problem exists only with this single
> >> >> ICH
> > variant?
> >> > So far, we only observed it on ICH10 based chipset. Did you see
> >> > another platform have the problems?
> >>
> >> No, I haven't observed it on other platforms but
> >> - the problematic bits exist in earlier ICH versions too, and
> >> - there are ICH10 based systems that aren't showing any such bad
> >>   behavior.
> >
> > It depends on the BIOS. Some already disable the legacy USB emulation
> > and obviously you cannot see this problem on those platforms.
> 
> With the help of the debug printk()-s in the patch I was able to see that on at
> least one unaffected platform, at least one of the USB emulation bits was set
> nevertheless.
> 
> >>
> >> The keys I'm currently using (successfully tested on customer's and
> >> my
> > affected
> >> machines) are BIOS and board vendor being "Intel". Only on those I'm
> >> then looking for the ICH10 (and then all known variants). See below
> >> for the patch (still containing some debugging bits).
> >>
> >> > Actually, the best way to solve it is to enable the ACPI mode in
> >> > Xen instead of in dom0. For enable ACPI, we need to write the value
> >> > from FADT.ACPI_ENABLE to SMI_CMD. After writing the value, the SMI
> >> > ownership will be disable by ACPI hardware and it also will disable
> >> > some
> > logic
> >> which is able to cause SMI.
> >> > For example, the legacy USB circuit will be masked too. Because at
> >> > this point, there have no need to use legacy usb emulation. This is
> >> > also what linux upstream did. But I think it is too complicated to
> >> > port this logic to xen. Anyway, if you have interesting, you can
> >> > add this logic to xen and there have no need for this patch again.
> >>
> >> Was this a pretty recent change in Linux? Otherwise, how do you
> >> explain
> > that,
> >> with apparently lower probability, I'm observing the very same hang
> >> with
> > native
> >> Linux? My assumption is that ACPI mode enabling is happening just too
> >> late
> > for
> >> masking this problem.
> >
> > Our bios team and me also had a try with native linux. But we cannot
> > reproduce it. So this maybe another potential issue which different
> > from what we are discussing now.
> 
> Pretty unlikely. Remember that for months you (as a company) claimed not to
> be able to reproduce the issue on Xen. As it's even more sporadic on native
> Linux, I'm not really surprised there's a problem reproducing it there.

To identify whether native linux have the same problem, you can disable the SMI to see whether the hang happen again. 

> >> --- a/xen/arch/x86/dmi_scan.c
> >> +++ b/xen/arch/x86/dmi_scan.c
> >> @@ -10,6 +10,8 @@
> >>  #include <asm/system.h>
> >>  #include <xen/dmi.h>
> >>  #include <xen/efi.h>
> >> +#include <xen/pci.h>
> >> +#include <xen/pci_regs.h>
> >>
> >>  #define bt_ioremap(b,l)  ((void *)__acpi_map_table(b,l))  #define
> >> bt_iounmap(b,l)  ((void)0) @@ -278,6 +280,31 @@ static __init int
> >> broken_toshiba_keyboar
> >>  	return 0;
> >>  }
> >>
> >> +static int __init ich10_bios_quirk(struct dmi_system_id *d) {
> >> +    u32 port, smictl;
> >> +
> >> +    if ( pci_conf_read16(0, 0x1f, 0, PCI_VENDOR_ID) != 0x8086 )
> >> +        return 0;
> >> +
> >> +    switch ( pci_conf_read16(0, 0x1f, 0, PCI_DEVICE_ID) ) {
> >> +    case 0x3a14:
> >> +    case 0x3a16:
> >> +    case 0x3a18:
> >> +    case 0x3a1a:
> >> +printk("ACPI base=%04x\n", port = pci_conf_read16(0, 0x1f, 0,
> 0x40));//temp
> >> +        port = (port & 0xff80) + 0x30;
> >> +        smictl = inl(port);
> >> +printk("smictl=%08x\n", smictl);//temp
> >> +        /* turn off LEGACY_USB{,2}_EN if enabled */
> >> +        if ( smictl & 0x20008 )
> >> +            outl(smictl & ~0x20008, port); printk("smictl:%08x\n",
> >> +inl(port));//temp
> >> +        break;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> >>
> >>  #ifdef CONFIG_ACPI_SLEEP
> >>  static __init int reset_videomode_after_s3(struct dmi_blacklist *d)
> >> @@
> >> -342,6 +369,18 @@ static __initdata struct dmi_blacklist d
> >>  			} },
> >>  #endif
> >>
> >> +	{ ich10_bios_quirk, "Intel board & BIOS",
> >> +		/*
> >> +		 * BIOS leaves legacy USB emulation enabled while
> >> +		 * SMM can't properly handle it.
> >> +		 */
> >> +		{
> >> +			MATCH(DMI_BOARD_VENDOR, "Intel Corp"),
> >> +			MATCH(DMI_BIOS_VENDOR, "Intel Corp"),
> >> +			NO_MATCH, NO_MATCH
> >> +		}
> >> +	},
> >> +
> >>  #ifdef	CONFIG_ACPI_BOOT
> >>  	/*
> >>  	 * If your system is blacklisted here, but you find that acpi=force
> >
> > This patch is better. But do we really need to disable it on other
> > ICH? If you do seen this issue with other ICH, then you can do it.
> > Otherwise, there may cause some potential issue. Have you test your patch
> on those platform?
> 
> Obviously not, given the very limited set of systems on which we observe the
> problem. But I really just listed the various ICH10 PCI IDs
> - are you saying that the problematic BIOSes is reasonably certain to have got
> used with only one of them? After all, not affecting other platforms is what is
> being aimed at with the DMI match strings.

I am not sure whether other BIOS have the same issue. But it's ok to turn off it even in the good platform.

best regards
yang

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-08-24  8:02 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-08-22 13:55 [PATCH] Fix xen hang on intel westmere-EP Zhang, Yang Z
2011-08-22 14:19 ` Jan Beulich
2011-08-23  3:52   ` Zhang, Yang Z
2011-08-23  7:32     ` Keir Fraser
2011-08-23  7:47       ` Tian, Kevin
2011-08-23  8:04         ` Keir Fraser
2011-08-23  8:01     ` Jan Beulich
2011-08-24  2:16       ` Zhang, Yang Z
2011-08-24  7:28         ` Jan Beulich
2011-08-24  8:02           ` Zhang, Yang Z

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.