All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-12 18:22 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-12 18:22 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin
  Cc: Eric W. Biederman, the arch/x86 maintainers,
	Linux Kernel Mailing List, Xen-devel

Parse the ACPI MADT for I/O APIC information, even if the cpu has no
(apparent) local APIC (ie, the CPU's APIC feature flag is clear).

In principle, the local APIC and the I/O APIC are distinct (but related)
components, which can be independently present.

In practice this can happen in a Xen system, where the hypervisor has
full control over the local APICs, and delivers interrupts initiated by
the I/O APICs via Xen's event channel mechanism.

(This eliminates the need for any explicit if (xen...) tests in
acpi/boot.c)

Signed-off-by: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2410469..19d13e5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -193,9 +193,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
  {
  	struct acpi_table_madt *madt = NULL;

-	if (!cpu_has_apic)
-		return -EINVAL;
-
  	madt = (struct acpi_table_madt *)table;
  	if (!madt) {
  		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
@@ -1252,9 +1249,6 @@ static int __init acpi_parse_madt_ioapic_entries(void)
  	if (acpi_disabled || acpi_noirq)
  		return -ENODEV;

-	if (!cpu_has_apic)
-		return -ENODEV;
-
  	/*
  	 * if "noapic" boot option, don't look for IO-APICs
  	 */
@@ -1357,6 +1351,16 @@ static void __init acpi_process_madt(void)
  #ifdef CONFIG_X86_BIGSMP
  			generic_bigsmp_probe();
  #endif
+		}
+		if (error == -EINVAL) {
+			/*
+			 * The ACPI tables themselves were malformed.
+			 * Dell Precision Workstation 410, 610 come here.
+			 */
+			printk(KERN_ERR PREFIX
+			       "Invalid BIOS MADT, disabling ACPI\n");
+			disable_acpi();
+		} else {
  			/*
  			 * Parse MADT IO-APIC entries
  			 */
@@ -1370,14 +1374,6 @@ static void __init acpi_process_madt(void)
  					apic->setup_apic_routing();
  			}
  		}
-		if (error == -EINVAL) {
-			/*
-			 * Dell Precision Workstation 410, 610 come here.
-			 */
-			printk(KERN_ERR PREFIX
-			       "Invalid BIOS MADT, disabling ACPI\n");
-			disable_acpi();
-		}
  	} else {
  		/*
   		 * ACPI found no MADT, and so ACPI wants UP PIC mode.
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c6acce2..d5e3f03 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1807,6 +1807,10 @@ __apicdebuginit(void) print_all_local_APICs(void)
  {
  	int cpu;

+	/* don't print out if apic is not there */
+	if (!cpu_has_apic)
+		return;
+
  	preempt_disable();
  	for_each_online_cpu(cpu)
  		smp_call_function_single(cpu, print_local_APIC, NULL, 1);
@@ -1849,8 +1853,7 @@ __apicdebuginit(int) print_all_ICs(void)
  {
  	print_PIC();

-	/* don't print out if apic is not there */
-	if (!cpu_has_apic || disable_apic)
+	if (disable_apic)
  		return 0;

  	print_all_local_APICs();



^ permalink raw reply related	[flat|nested] 79+ messages in thread

* [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-12 18:22 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-12 18:22 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin
  Cc: Xen-devel, the arch/x86 maintainers, Eric W. Biederman,
	Linux Kernel Mailing List

Parse the ACPI MADT for I/O APIC information, even if the cpu has no
(apparent) local APIC (ie, the CPU's APIC feature flag is clear).

In principle, the local APIC and the I/O APIC are distinct (but related)
components, which can be independently present.

In practice this can happen in a Xen system, where the hypervisor has
full control over the local APICs, and delivers interrupts initiated by
the I/O APICs via Xen's event channel mechanism.

(This eliminates the need for any explicit if (xen...) tests in
acpi/boot.c)

Signed-off-by: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2410469..19d13e5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -193,9 +193,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
  {
  	struct acpi_table_madt *madt = NULL;

-	if (!cpu_has_apic)
-		return -EINVAL;
-
  	madt = (struct acpi_table_madt *)table;
  	if (!madt) {
  		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
@@ -1252,9 +1249,6 @@ static int __init acpi_parse_madt_ioapic_entries(void)
  	if (acpi_disabled || acpi_noirq)
  		return -ENODEV;

-	if (!cpu_has_apic)
-		return -ENODEV;
-
  	/*
  	 * if "noapic" boot option, don't look for IO-APICs
  	 */
@@ -1357,6 +1351,16 @@ static void __init acpi_process_madt(void)
  #ifdef CONFIG_X86_BIGSMP
  			generic_bigsmp_probe();
  #endif
+		}
+		if (error == -EINVAL) {
+			/*
+			 * The ACPI tables themselves were malformed.
+			 * Dell Precision Workstation 410, 610 come here.
+			 */
+			printk(KERN_ERR PREFIX
+			       "Invalid BIOS MADT, disabling ACPI\n");
+			disable_acpi();
+		} else {
  			/*
  			 * Parse MADT IO-APIC entries
  			 */
@@ -1370,14 +1374,6 @@ static void __init acpi_process_madt(void)
  					apic->setup_apic_routing();
  			}
  		}
-		if (error == -EINVAL) {
-			/*
-			 * Dell Precision Workstation 410, 610 come here.
-			 */
-			printk(KERN_ERR PREFIX
-			       "Invalid BIOS MADT, disabling ACPI\n");
-			disable_acpi();
-		}
  	} else {
  		/*
   		 * ACPI found no MADT, and so ACPI wants UP PIC mode.
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index c6acce2..d5e3f03 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1807,6 +1807,10 @@ __apicdebuginit(void) print_all_local_APICs(void)
  {
  	int cpu;

+	/* don't print out if apic is not there */
+	if (!cpu_has_apic)
+		return;
+
  	preempt_disable();
  	for_each_online_cpu(cpu)
  		smp_call_function_single(cpu, print_local_APIC, NULL, 1);
@@ -1849,8 +1853,7 @@ __apicdebuginit(int) print_all_ICs(void)
  {
  	print_PIC();

-	/* don't print out if apic is not there */
-	if (!cpu_has_apic || disable_apic)
+	if (disable_apic)
  		return 0;

  	print_all_local_APICs();

^ permalink raw reply related	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 18:22 ` Jeremy Fitzhardinge
@ 2009-06-12 18:28   ` Alan Cox
  -1 siblings, 0 replies; 79+ messages in thread
From: Alan Cox @ 2009-06-12 18:28 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Eric W. Biederman,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

> +		if (error == -EINVAL) {
> +			/*
> +			 * The ACPI tables themselves were malformed.
> +			 * Dell Precision Workstation 410, 610 come here.
> +			 */
> +			printk(KERN_ERR PREFIX
> +			       "Invalid BIOS MADT, disabling ACPI\n");
> +			disable_acpi();
> +		} else {

This seems a very bad model. On today's systems turning off ACPI renders
them basically useless. If the MADT is bogus it's far better to pray that
the rest of the ACPI is mostly sound and continue by ignoring the only
dodgy table.

Alan

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-12 18:28   ` Alan Cox
  0 siblings, 0 replies; 79+ messages in thread
From: Alan Cox @ 2009-06-12 18:28 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, H. Peter Anvin, the arch/x86 maintainers,
	Kernel Mailing List, Ingo Molnar, Eric W. Biederman, Linux,
	Thomas Gleixner

> +		if (error == -EINVAL) {
> +			/*
> +			 * The ACPI tables themselves were malformed.
> +			 * Dell Precision Workstation 410, 610 come here.
> +			 */
> +			printk(KERN_ERR PREFIX
> +			       "Invalid BIOS MADT, disabling ACPI\n");
> +			disable_acpi();
> +		} else {

This seems a very bad model. On today's systems turning off ACPI renders
them basically useless. If the MADT is bogus it's far better to pray that
the rest of the ACPI is mostly sound and continue by ignoring the only
dodgy table.

Alan

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 18:28   ` Alan Cox
@ 2009-06-12 18:33     ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-12 18:33 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Eric W. Biederman,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/12/09 11:28, Alan Cox wrote:
>> +		if (error == -EINVAL) {
>> +			/*
>> +			 * The ACPI tables themselves were malformed.
>> +			 * Dell Precision Workstation 410, 610 come here.
>> +			 */
>> +			printk(KERN_ERR PREFIX
>> +			       "Invalid BIOS MADT, disabling ACPI\n");
>> +			disable_acpi();
>> +		} else {
>>      
>
> This seems a very bad model. On todays systems turning off ACPI renders
> them basically useless. If the MADT is bogus its far better to pray that
> they rest of the ACPI is mostly sound and continue by ignoring the only
> dodgy table.
>    

I didn't make any change there; I just preserved the original behaviour 
of stopping all ACPI parsing when it returned -EINVAL (with a slightly 
more helpful comment).  My change was to make it continue to the I/O 
APICs if it returned something else (ie, -ENODEV).

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-12 18:33     ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-12 18:33 UTC (permalink / raw)
  To: Alan Cox
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Eric W. Biederman, H. Peter Anvin, Thomas Gleixner

On 06/12/09 11:28, Alan Cox wrote:
>> +		if (error == -EINVAL) {
>> +			/*
>> +			 * The ACPI tables themselves were malformed.
>> +			 * Dell Precision Workstation 410, 610 come here.
>> +			 */
>> +			printk(KERN_ERR PREFIX
>> +			       "Invalid BIOS MADT, disabling ACPI\n");
>> +			disable_acpi();
>> +		} else {
>>      
>
> This seems a very bad model. On todays systems turning off ACPI renders
> them basically useless. If the MADT is bogus its far better to pray that
> they rest of the ACPI is mostly sound and continue by ignoring the only
> dodgy table.
>    

I didn't make any change there; I just preserved the original behaviour 
of stopping all ACPI parsing when it returned -EINVAL (with a slightly 
more helpful comment).  My change was to make it continue to the I/O 
APICs if it returned something else (ie, -ENODEV).

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 18:22 ` Jeremy Fitzhardinge
  (?)
  (?)
@ 2009-06-12 20:11 ` Cyrill Gorcunov
  2009-06-15  2:01   ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 79+ messages in thread
From: Cyrill Gorcunov @ 2009-06-12 20:11 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Eric W. Biederman,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

[Jeremy Fitzhardinge - Fri, Jun 12, 2009 at 11:22:48AM -0700]
> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>
> In principle, the local APIC and the I/O APIC are distinct (but related)
> components, which can be independently present.
>
> In practice this can happen in a Xen system, where the hypervisor has
> full control over the local APICs, and delivers interrupts initiated by
> the I/O APICs via Xen's event channel mechanism.
>
> (This eliminates the need for any explicit if (xen...) tests in
> acpi/boot.c)
>
> Signed-off-by: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>
>
> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index 2410469..19d13e5 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -193,9 +193,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
>  {
>  	struct acpi_table_madt *madt = NULL;
>
> -	if (!cpu_has_apic)
> -		return -EINVAL;
> -

Hi Jeremy,

just for the record -- this removal has a side effect.
Imagine I've passed "disableapic", so I expect that as much
apic-related code as possible would not be executed.
Now we would have (say for IBM Summit)

acpi_parse_madt
  default_acpi_madt_oem_check
    summit_acpi_madt_oem_check
      mark_tsc_unstable
      setup_summit

Dunno if it is harmless or not, but it changes kernel behaviour.
cpu_has_apic is cleared if the disableapic option (which is an early
param) is passed to the kernel.

Just a note. Didn't walk thru all ways.

	-- Cyrill

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 18:22 ` Jeremy Fitzhardinge
@ 2009-06-12 20:35   ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-12 20:35 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>
> In principle, the local APIC and the I/O APIC are distinct (but related)
> components, which can be independently present.
>
> In practice this can happen in a Xen system, where the hypervisor has
> full control over the local APICs, and delivers interrupts initiated by
> the I/O APICs via Xen's event channel mechanism.

Xen  is giving us a semi bogus acpi table?

What is the paravirt configuration model with Xen?  Is it documented
somewhere?

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-12 20:35   ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-12 20:35 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>
> In principle, the local APIC and the I/O APIC are distinct (but related)
> components, which can be independently present.
>
> In practice this can happen in a Xen system, where the hypervisor has
> full control over the local APICs, and delivers interrupts initiated by
> the I/O APICs via Xen's event channel mechanism.

Xen  is giving us a semi bogus acpi table?

What is the paravirt configuration model with Xen?  Is it documented
somewhere?

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 20:11 ` Cyrill Gorcunov
@ 2009-06-15  2:01   ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-15  2:01 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Eric W. Biederman,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/12/09 13:11, Cyrill Gorcunov wrote:
> just for the record -- this removement has a side effect.
> Imagine I've passed "disableapic" so I expect as many as
> possible apic-related code would not pass thru execution.
> Now we would have (say for IBM Summit)
>
> acpi_parse_madt
>    default_acpi_madt_oem_check
>      summit_acpi_madt_oem_check
>        mark_tsc_unstable
>        setup_summit
>
> Dunno if it harmless or no but it changes kernel behaviour.
> cpu_has_apic cleared if disableapic option (which is early
> param) passed to kernel.
>
> Just a note. Didn't walk thru all ways.
>    

Yeah, that occurred to me over the weekend.  Some of those cpu_has_apic 
tests should probably be replaced with explicit tests to see if noapic 
was specified.  I'll go over it with that in mind.

Thanks,
     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 20:35   ` Eric W. Biederman
  (?)
@ 2009-06-15  2:06   ` Jeremy Fitzhardinge
  2009-06-15 10:47       ` Eric W. Biederman
  -1 siblings, 1 reply; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-15  2:06 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/12/09 13:35, Eric W. Biederman wrote:
> Jeremy Fitzhardinge<jeremy@goop.org>  writes:
>
>    
>> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
>> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>>
>> In principle, the local APIC and the I/O APIC are distinct (but related)
>> components, which can be independently present.
>>
>> In practice this can happen in a Xen system, where the hypervisor has
>> full control over the local APICs, and delivers interrupts initiated by
>> the I/O APICs via Xen's event channel mechanism.
>>      
>
> Xen  is giving us a semi bogus acpi table?
>    

No, not really.  The guest is reading the real BIOS-provided ACPI 
tables, but Xen is clobbering the APIC feature in CPUID so the virtual 
CPU doesn't appear to have a usable local APIC.  Xen itself doesn't care 
very much about interrupt routing or ACPI, and doesn't make any attempt 
to read or parse the ACPI data itself (except for very basic things like 
the APIC addresses).
> What is the paravirt configuration model with Xen?  Is it documented
> somewhere?
>    

Not very well.  The basic idea is that Xen owns the local apics, and 
does things like vector allocation.  The guest kernel is responsible for 
asking for a vector, and doing the appropriate IO APIC programming, and 
binding that vector to an event channel.  The interrupt is then 
delivered via the normal event channel mechanism already in place to 
deal with all the other event types an unprivileged domain can get.

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-15  2:06   ` Jeremy Fitzhardinge
@ 2009-06-15 10:47       ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-15 10:47 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/12/09 13:35, Eric W. Biederman wrote:
>> Jeremy Fitzhardinge<jeremy@goop.org>  writes:
>>
>>    
>>> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
>>> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>>>
>>> In principle, the local APIC and the I/O APIC are distinct (but related)
>>> components, which can be independently present.
>>>
>>> In practice this can happen in a Xen system, where the hypervisor has
>>> full control over the local APICs, and delivers interrupts initiated by
>>> the I/O APICs via Xen's event channel mechanism.
>>>      
>>
>> Xen  is giving us a semi bogus acpi table?
>>    
>
> No, not really.  The guest is reading the real BIOS-provided ACPI tables, but
> Xen is clobbering the APIC feature in CPUID so the virtual CPU doesn't appear to
> have a usable local APIC.  Xen itself doesn't care very much about interrupt
> routing or ACPI, and doesn't make any attempt to read or parse the ACPI data
> itself (except for very basic things like the APIC addresses).
>
>> What is the paravirt configuration model with Xen?  Is it documented
>> somewhere?
>>    
>
> Not very well.  The basic idea is that Xen owns the local apics, and does things
> like vector allocation.  The guest kernel is responsible for asking for a
> vector, and doing the appropriate IO APIC programming, and binding that vector
> to an event channel.  The interrupt is then delivered via the normal event
> channel mechanism already in place to deal with all the other event types an
> unprivileged domain can get.

For code reuse and maintainability that is a horrible separation of
responsibility.  Things look similar to the existing cases until you
get up close and you discover all of the fundamental assumptions are
different so none of the existing code actually works unmodified.

The only clean way I can see to handle this is to make xen dom0 its own
weird separate subarch that does all of the table parsing of the
firmware tables in completely separate code.  Then, once we have something
that works, factor out the commonalities into a helper library for
better long-term maintenance.

As it stands right now what Xen wants and what we need to do for normal
hardware are radically different, to the point of painful.  Things like
irq migration, and cpu hotplug require completely different algorithms.

I think Xen dom0 has picked the wrong abstraction for this one.  There
seems to be no gain and a lot of pain asking the slave kernel to
program the ioapics for it, when Xen presents a wildly different
abstraction at the cpu level.

If what Xen provided looked like an ioapic semantically, I would
suggest setting cpu_has_apic in a different fashion.  We already have
two local apic variants after all, so a 3rd should not be too nasty.
Except that Xen appears to have totally moved the responsibility around
in ways that over-constrain the problem, making the
existing code useless.

Please put the Xen dom0 insanity somewhere off in a corner where the rest
of x86 can ignore it.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-15 10:47       ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-15 10:47 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/12/09 13:35, Eric W. Biederman wrote:
>> Jeremy Fitzhardinge<jeremy@goop.org>  writes:
>>
>>    
>>> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
>>> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>>>
>>> In principle, the local APIC and the I/O APIC are distinct (but related)
>>> components, which can be independently present.
>>>
>>> In practice this can happen in a Xen system, where the hypervisor has
>>> full control over the local APICs, and delivers interrupts initiated by
>>> the I/O APICs via Xen's event channel mechanism.
>>>      
>>
>> Xen  is giving us a semi bogus acpi table?
>>    
>
> No, not really.  The guest is reading the real BIOS-provided ACPI tables, but
> Xen is clobbering the APIC feature in CPUID so the virtual CPU doesn't appear to
> have a usable local APIC.  Xen itself doesn't care very much about interrupt
> routing or ACPI, and doesn't make any attempt to read or parse the ACPI data
> itself (except for very basic things like the APIC addresses).
>
>> What is the paravirt configuration model with Xen?  Is it documented
>> somewhere?
>>    
>
> Not very well.  The basic idea is that Xen owns the local apics, and does things
> like vector allocation.  The guest kernel is responsible for asking for a
> vector, and doing the appropriate IO APIC programming, and binding that vector
> to an event channel.  The interrupt is then delivered via the normal event
> channel mechanism already in place to deal with all the other event types an
> unprivileged domain can get.

For code reuse and maintainability that is a horrible separation of
responsibility.  Things look similar to the existing cases until you
get up close and you discover all of the fundamental assumptions are
different so none of the existing code actually works unmodified.

The only clean way I can see to handle this is to make xen dom0 its own
weird separate subarch that does all of the table parsing of the
firmware tables in completely separate code.  Then, once we have something
that works, factor out the commonalities into a helper library for
better long-term maintenance.

As it stands right now what Xen wants and what we need to do for normal
hardware are radically different, to the point of painful.  Things like
irq migration, and cpu hotplug require completely different algorithms.

I think Xen dom0 has picked the wrong abstraction for this one.  There
seems to be no gain and a lot of pain asking the slave kernel to
program the ioapics for it, when Xen presents a wildly different
abstraction at the cpu level.

If what Xen provided looked like an ioapic semantically, I would
suggest setting cpu_has_apic in a different fashion.  We already have
two local apic variants after all, so a 3rd should not be too nasty.
Except that Xen appears to have totally moved the responsibility around
in ways that over-constrain the problem, making the
existing code useless.

Please put the Xen dom0 insanity somewhere off in a corner where the rest
of x86 can ignore it.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 18:22 ` Jeremy Fitzhardinge
@ 2009-06-15 10:51   ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-15 10:51 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>
> In principle, the local APIC and the I/O APIC are distinct (but related)
> components, which can be independently present.
>
> In practice this can happen in a Xen system, where the hypervisor has
> full control over the local APICs, and delivers interrupts initiated by
> the I/O APICs via Xen's event channel mechanism.
>
> (This eliminates the need for any explicit if (xen...) tests in
> acpi/boot.c)
>
> Signed-off-by: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>

To be very clear.  We have mechanism and policy mixed in the mptable
and related code today.  While we continue to have that mixed I think
even attempting to reuse it for Xen dom0 is a horrifically bad move.

Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index 2410469..19d13e5 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -193,9 +193,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
>  {
>  	struct acpi_table_madt *madt = NULL;
>
> -	if (!cpu_has_apic)
> -		return -EINVAL;
> -
>  	madt = (struct acpi_table_madt *)table;
>  	if (!madt) {
>  		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
> @@ -1252,9 +1249,6 @@ static int __init acpi_parse_madt_ioapic_entries(void)
>  	if (acpi_disabled || acpi_noirq)
>  		return -ENODEV;
>
> -	if (!cpu_has_apic)
> -		return -ENODEV;
> -
>  	/*
>  	 * if "noapic" boot option, don't look for IO-APICs
>  	 */
> @@ -1357,6 +1351,16 @@ static void __init acpi_process_madt(void)
>  #ifdef CONFIG_X86_BIGSMP
>  			generic_bigsmp_probe();
>  #endif
> +		}
> +		if (error == -EINVAL) {
> +			/*
> +			 * The ACPI tables themselves were malformed.
> +			 * Dell Precision Workstation 410, 610 come here.
> +			 */
> +			printk(KERN_ERR PREFIX
> +			       "Invalid BIOS MADT, disabling ACPI\n");
> +			disable_acpi();
> +		} else {
>  			/*
>  			 * Parse MADT IO-APIC entries
>  			 */
> @@ -1370,14 +1374,6 @@ static void __init acpi_process_madt(void)
>  					apic->setup_apic_routing();
>  			}
>  		}
> -		if (error == -EINVAL) {
> -			/*
> -			 * Dell Precision Workstation 410, 610 come here.
> -			 */
> -			printk(KERN_ERR PREFIX
> -			       "Invalid BIOS MADT, disabling ACPI\n");
> -			disable_acpi();
> -		}
>  	} else {
>  		/*
>   		 * ACPI found no MADT, and so ACPI wants UP PIC mode.
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index c6acce2..d5e3f03 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1807,6 +1807,10 @@ __apicdebuginit(void) print_all_local_APICs(void)
>  {
>  	int cpu;
>
> +	/* don't print out if apic is not there */
> +	if (!cpu_has_apic)
> +		return;
> +
>  	preempt_disable();
>  	for_each_online_cpu(cpu)
>  		smp_call_function_single(cpu, print_local_APIC, NULL, 1);
> @@ -1849,8 +1853,7 @@ __apicdebuginit(int) print_all_ICs(void)
>  {
>  	print_PIC();
>
> -	/* don't print out if apic is not there */
> -	if (!cpu_has_apic || disable_apic)
> +	if (disable_apic)
>  		return 0;
>
>  	print_all_local_APICs();

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-15 10:51   ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-15 10:51 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Parse the ACPI MADT for I/O APIC information, even if the cpu has no
> (apparent) local APIC (ie, the CPU's APIC feature flag is clear).
>
> In principle, the local APIC and the I/O APIC are distinct (but related)
> components, which can be independently present.
>
> In practice this can happen in a Xen system, where the hypervisor has
> full control over the local APICs, and delivers interrupts initiated by
> the I/O APICs via Xen's event channel mechanism.
>
> (This eliminates the need for any explicit if (xen...) tests in
> acpi/boot.c)
>
> Signed-off-by: Jeremy Fitzhardinge<jeremy.fitzhardinge@citrix.com>

To be very clear.  We have mechanism and policy mixed in the mptable
and related code today.  While we continue to have that mixed I think
even attempting to reuse it for Xen dom0 is a horrifically bad move.

Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>

> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index 2410469..19d13e5 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -193,9 +193,6 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
>  {
>  	struct acpi_table_madt *madt = NULL;
>
> -	if (!cpu_has_apic)
> -		return -EINVAL;
> -
>  	madt = (struct acpi_table_madt *)table;
>  	if (!madt) {
>  		printk(KERN_WARNING PREFIX "Unable to map MADT\n");
> @@ -1252,9 +1249,6 @@ static int __init acpi_parse_madt_ioapic_entries(void)
>  	if (acpi_disabled || acpi_noirq)
>  		return -ENODEV;
>
> -	if (!cpu_has_apic)
> -		return -ENODEV;
> -
>  	/*
>  	 * if "noapic" boot option, don't look for IO-APICs
>  	 */
> @@ -1357,6 +1351,16 @@ static void __init acpi_process_madt(void)
>  #ifdef CONFIG_X86_BIGSMP
>  			generic_bigsmp_probe();
>  #endif
> +		}
> +		if (error == -EINVAL) {
> +			/*
> +			 * The ACPI tables themselves were malformed.
> +			 * Dell Precision Workstation 410, 610 come here.
> +			 */
> +			printk(KERN_ERR PREFIX
> +			       "Invalid BIOS MADT, disabling ACPI\n");
> +			disable_acpi();
> +		} else {
>  			/*
>  			 * Parse MADT IO-APIC entries
>  			 */
> @@ -1370,14 +1374,6 @@ static void __init acpi_process_madt(void)
>  					apic->setup_apic_routing();
>  			}
>  		}
> -		if (error == -EINVAL) {
> -			/*
> -			 * Dell Precision Workstation 410, 610 come here.
> -			 */
> -			printk(KERN_ERR PREFIX
> -			       "Invalid BIOS MADT, disabling ACPI\n");
> -			disable_acpi();
> -		}
>  	} else {
>  		/*
>   		 * ACPI found no MADT, and so ACPI wants UP PIC mode.
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index c6acce2..d5e3f03 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -1807,6 +1807,10 @@ __apicdebuginit(void) print_all_local_APICs(void)
>  {
>  	int cpu;
>
> +	/* don't print out if apic is not there */
> +	if (!cpu_has_apic)
> +		return;
> +
>  	preempt_disable();
>  	for_each_online_cpu(cpu)
>  		smp_call_function_single(cpu, print_local_APIC, NULL, 1);
> @@ -1849,8 +1853,7 @@ __apicdebuginit(int) print_all_ICs(void)
>  {
>  	print_PIC();
>
> -	/* don't print out if apic is not there */
> -	if (!cpu_has_apic || disable_apic)
> +	if (disable_apic)
>  		return 0;
>
>  	print_all_local_APICs();

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-15 10:47       ` Eric W. Biederman
@ 2009-06-15 20:49         ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-15 20:49 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/15/09 03:47, Eric W. Biederman wrote:
> For code reuse and maintainability that is a horrible separation of
> responsibility.  Things looks similar to the existing cases until you
> get up close and you discover all of the fundamental assumptions are
> different so none of the existing code actually works unmodified.
>    

The I/O APIC code is used exactly as normal, routing from 
device->pin->vector; the whole interrupt emission path is unchanged.

The local APIC code doesn't get used at all, because we have a different 
interrupt catcher operating at the irq_chip level.

In terms of system architecture it's a reasonable place to make the 
split; the local APICs and I/O APICs are distinct entities which 
communicate via a fairly well-defined path.  Xen puts the 
hypervisor/control domain split at the same place.  This is mainly 
because Xen itself cares about managing CPUs (and memory), but doesn't 
really care about the rest of the system hardware much - it leaves that 
up to the control domain.

> The only clean way I can see to handle this is to make xen dom0 it's own
> weird separate subarch that does all of the table parsing of the
> firmware tables in completely separate code.  Then once we have something
> that works factoring out the commonalities into a helper library for
> better long term maintenance.
>    
That seems like overkill.  We can get things working under Xen with 3 
changes:

   1. make sure I/O APICs are discovered via ACPI properly (or MPTABLE
      if ACPI isn't present)
   2. get Xen to allocate a vector and bind that vector to an event channel
   3. make sure I/O APIC register writes get to the appropriate I/O APIC
      in hardware (the normal pin->vector routing)

These points already have fairly well-defined interfaces; there are no 
subtle interactions with the core of the APIC code.

This patch achieves the first of these, in a fairly minimal way.  I'm 
still investigating better ways of achieving 2 & 3.

> As it stands right now what Xen wants and what we need to do for normal
> hardware are radically different, to the point of painful.  Things like
> irq migration, and cpu hotplug require completely different algorithms.
>    

The control domain, being a virtual machine, has no access or visibility 
of physical CPUs in the system; all its CPUs are virtual (this is why a 
"local APIC" doesn't make much sense for it, since they're an inherent 
property of a physical CPU, and are not virtualized).

The hypervisor is responsible for all management of physical CPUs, and 
is therefore responsible for physical-CPU things like hotplug and 
interrupt migration.  The kernel doesn't need new algorithms to handle 
these because it simply doesn't know or care about them.

As far as the kernel is concerned, the interrupts look like events on 
event channels, like IPIs, timers, etc, and can be handled accordingly.  
The irq_chip machinery is already in place for them.

> I think Xen dom0 has picked the wrong abstraction for this one.  There
> seems to be no gain and a lot of pain asking the slave kernel to
> program the ioapics for it, when Xen presents a wildly different
> abstraction at the cpu level.
>    

Well, the bulk of the code is already present.  We avoid the local APIC 
part of the kernel completely, by installing a new irq_chip to handle 
incoming interrupts and deliver them into the core interrupt handling 
accordingly.  The control domain patches simply add the ability to bind 
a hardware-originated interrupt to an event channel to be delivered via 
this mechanism.

And, as Xen contains no device drivers or real hardware knowledge of 
busses, interrupt routing, etc, it falls to the control domain to work 
out those aspects.  The I/O APIC side of the setup is the same as it 
would be in the native case (program a vector corresponding to a pin on 
an I/O APIC).

> If what xen was provided looked like an ioapic semantically I would
> suggest setting cpu_has_apic in a different fashion.

cpu_has_apic has the specific meaning of "this CPU has a local APIC".  
It doesn't say anything about the presence or absence of I/O APICs; 
conflating the two notions doesn't seem like a good idea.  I'm clearing 
cpu_has_apic to indicate this specific fact: the CPU has no usable local 
APIC, and there's no point pretending it does - but that doesn't mean 
the I/O APICs aren't functional.

>    We already have two local apic variants after all so a 3rd should not be too nasty.
>    
We currently avoid any need to have, or pretend to have, a local APIC by 
taking control of the interrupt delivery subsystem at the irq_chip 
level.  I don't think there's much to be gained by adding a Xen-specific 
lapic abstraction for this case.

> Except the Xen appears to have totally moved the responsibility around
> in ways that over constrain the problem by taking, making the
> existing code useless.
>    
I don't think that's true at all.  The split is along hardware lines, 
and so puts the same constraints on kernel development that the hardware 
does.

> Please put the Xen dom0 insanity somewhere off in a corner where the rest
> of x86 can ignore it.
>    

Yep, trying to.

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-15 20:49         ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-15 20:49 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

On 06/15/09 03:47, Eric W. Biederman wrote:
> For code reuse and maintainability that is a horrible separation of
> responsibility.  Things looks similar to the existing cases until you
> get up close and you discover all of the fundamental assumptions are
> different so none of the existing code actually works unmodified.
>    

The I/O APIC code is used exactly as normal, routing from 
device->pin->vector; the whole interrupt emission path is unchanged.

The local APIC code doesn't get used at all, because we have a different 
interrupt catcher operating at the irq_chip level.

In terms of system architecture it's a reasonable place to make the 
split; the local APICs and I/O APICs are distinct entities which 
communicate via a fairly well-defined path.  Xen puts the 
hypervisor/control domain split at the same place.  This is mainly 
because Xen itself cares about managing CPUs (and memory), but doesn't 
really care about the rest of the system hardware much - it leaves that 
up to the control domain.

> The only clean way I can see to handle this is to make xen dom0 it's own
> weird separate subarch that does all of the table parsing of the
> firmware tables in completely separate code.  Then once we have something
> that works factoring out the commonalities into a helper library for
> better long term maintenance.
>    
That seems like overkill.  We can get things working under Xen with 3 
changes:

   1. make sure I/O APICs are discovered via ACPI properly (or MPTABLE
      if ACPI isn't present)
   2. get Xen to allocate a vector and bind that vector to an event channel
   3. make sure I/O APIC register writes get to the appropriate I/O APIC
      in hardware (the normal pin->vector routing)

These points already have fairly well-defined interfaces; there are no 
subtle interactions with the core of the APIC code.

This patch achieves the first of these, in a fairly minimal way.  I'm 
still investigating better ways of achieving 2 & 3.

> As it stands right now what Xen wants and what we need to do for normal
> hardware are radically different, to the point of painful.  Things like
> irq migration, and cpu hotplug require completely different algorithms.
>    

The control domain, being a virtual machine, has no access or visibility 
of physical CPUs in the system; all its CPUs are virtual (this is why a 
"local APIC" doesn't make much sense for it, since they're an inherent 
property of a physical CPU, and are not virtualized).

The hypervisor is responsible for all management of physical CPUs, and 
is therefore responsible for physical-CPU things like hotplug and 
interrupt migration.  The kernel doesn't need new algorithms to handle 
these because it simply doesn't know or care about them.

As far as the kernel is concerned, the interrupts look like events on 
event channels, like IPIs, timers, etc, and can be handled accordingly.  
The irq_chip machinery is already in place for them.

> I think Xen dom0 has picked the wrong abstraction for this one.  There
> seems to be no gain and a lot of pain asking the slave kernel to
> program the ioapics for it, when Xen presents a wildly different
> abstraction at the cpu level.
>    

Well, the bulk of the code is already present.  We avoid the local APIC 
part of the kernel completely, by installing a new irq_chip to handle 
incoming interrupts and deliver them into the core interrupt handling 
accordingly.  The control domain patches simply add the ability to bind 
a hardware-originated interrupt to an event channel to be delivered via 
this mechanism.

And, as Xen contains no device drivers or real hardware knowledge of 
busses, interrupt routing, etc, it falls to the control domain to work 
out those aspects.  The I/O APIC side of the setup is the same as it 
would be in the native case (program a vector corresponding to a pin on 
an I/O APIC).

> If what xen was provided looked like an ioapic semantically I would
> suggest setting cpu_has_apic in a different fashion.

cpu_has_apic has the specific meaning of "this CPU has a local APIC".  
It doesn't say anything about the presence or absence of I/O APICs; 
conflating the two notions doesn't seem like a good idea.  I'm clearing 
cpu_has_apic to indicate this specific fact: the CPU has no usable local 
APIC, and there's no point pretending it does - but that doesn't mean 
the I/O APICs aren't functional.

>    We already have two local apic variants after all so a 3rd should not be too nasty.
>    
We currently avoid any need to have, or pretend to have, a local APIC by 
taking control of the interrupt delivery subsystem at the irq_chip 
level.  I don't think there's much to be gained by adding a Xen-specific 
lapic abstraction for this case.

> Except the Xen appears to have totally moved the responsibility around
> in ways that over constrain the problem by taking, making the
> existing code useless.
>    
I don't think that's true at all.  The split is along hardware lines, 
and so puts the same constraints on kernel development that the hardware 
does.

> Please put the Xen dom0 insanity somewhere off in a corner where the rest
> of x86 can ignore it.
>    

Yep, trying to.

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-15 20:49         ` Jeremy Fitzhardinge
@ 2009-06-15 21:58           ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-15 21:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/15/09 03:47, Eric W. Biederman wrote:
>> For code reuse and maintainability that is a horrible separation of
>> responsibility.  Things looks similar to the existing cases until you
>> get up close and you discover all of the fundamental assumptions are
>> different so none of the existing code actually works unmodified.
>>    
>
> The I/O APIC code is used exactly as normal, routing from device->pin->vector;
> the whole interrupt emission path is unchanged.

But the algorithms that set it up are now useless.  Which implies most of the
current control logic is now useless.

> The local APIC code doesn't get used at all, because we have a different
> interrupt catcher operating at the irq_chip level.

Sure.  Which pretty much implies that all of io_apic.c and the setting up
of the interrupt controllers has to be replaced as well.

> In terms of system architecture its a reasonable place to make the split; the
> local APICs and I/O APICs are distinct entities which communicate via fairly
> well-defined path.  Xen puts the hypervisor/control domain split at the same
> place.  This is mainly because Xen itself cares about managing CPUs (and
> memory), but doesn't really care about the rest of the system hardware much - it
> leaves that up to the control domain.

Xen doesn't want to deal so it leaves the rest for us.  Making it so neither
kernel has full control of the irqs.  Neither has enough control to flexibly
set things up.

>> The only clean way I can see to handle this is to make xen dom0 it's own
>> weird separate subarch that does all of the table parsing of the
>> firmware tables in completely separate code.  Then once we have something
>> that works factoring out the commonalities into a helper library for
>> better long term maintenance.
>>    
> That seems like overkill.  We can get things working under Xen with 3 changes:

All of the subtle assumptions sound like they come out differently.  Which means
you can very easily start down the road of just reusing small bits and then
you find so many assumptions are different you have to scrap/replace or gunk
up with if (xen) tests.

>   1. make sure I/O APICs are discovered via ACPI properly (or MPTABLE
>      if ACPI isn't present)
>   2. get Xen to allocate a vector and bind that vector to an event channel

The x86 code doesn't allocate vectors.  It allocates a vector that is valid
on a single cpu, or a vector that is valid on all cpus.  Vectors are managed
as a per cpu resource and that is ultimately required to scale the number
of interrupts supported.

If Xen does not manage vectors as a per cpu resource, that fundamentally
breaks one of the assumptions in the code.

>   3. make sure I/O APIC register writes get to the appropriate I/O APIC
>      in hardware (the normal pin->vector routing)
>
> These points already have fairly well-defined interfaces; there are no subtle
> interactions with the core of the APIC code.
>
> This patch achieves the first of these, in a fairly minimal way.  I'm still
> investigating better ways of achieving 2 & 3.

As I understand Xen so far I still disagree.

>> As it stands right now what Xen wants and what we need to do for normal
>> hardware are radically different, to the point of painful.  Things like
>> irq migration, and cpu hotplug require completely different algorithms.
>>    
>
> The control domain, being a virtual machine, has no access or visibility of
> physical CPUs in the system; all its CPUs are virtual (this is why a "local
> APIC" doesn't make much sense for it, since they're an inherent property of a
> physical CPU, and are not virtualized).
>
> The hypervisor is responsible for all management of physical CPUs, and is
> therefore responsible for physical-CPU things like hotplug and interrupt
> migration.  The kernel doesn't need new algorithms to handle these because it
> simply doesn't know or care about them.

So Xen calls the linux kernel when it wants to migrate an irq from one cpu
to another?

If Xen is calling us back when it wants to migrate an irq from one cpu to another
that implies we need knowledge of irqs.

> As far as the kernel is concerned, the interrupts look like events on event
> channels, like IPIs, timers, etc, and can be handled accordingly.  The irq_chip
> machinery is already in place for them.

If the kernel is responsible for programming the ioapics it definitely cares
about migration, and cpu hotplug.  All of those require reprogramming the
ioapics.

>> I think Xen dom0 has picked the wrong abstraction for this one.  There
>> seems to be no gain and a lot of pain asking the slave kernel to
>> program the ioapics for it, when Xen presents a wildly different
>> abstraction at the cpu level.
>>    
>
> Well, the bulk of the code is already present.  We avoid the local APIC part of
> the kernel completely, by installing a new irq_chip to handle incoming
> interrupts and deliver them into the core interrupt handling accordingly.  The
> control domain patches simply add the ability to bind a hardware-originated
> interrupt to an event channel to be delivered via this mechanism.
>
> And, as Xen contains no device drivers or real hardware knowledge of busses,
> interrupt routing, etc, it falls to the control domain to work out those
> aspects.  The I/O APIC side of the setup is the same as it would be in the
> native case (program a vector corresponding to a pin on an I/O APIC).

There is a lot more going on than that.  You need to know logical vs physical.
How many bits of cpu id you have.  

Which mode you program the ioapics in, logical vs physical (and the variants
of those), is directly tied to how many cpus you have, and how that box
is arranged.

>> If what xen was provided looked like an ioapic semantically I would
>> suggest setting cpu_has_apic in a different fashion.
>
> cpu_has_apic has the specific meaning of "this CPU has a local APIC".  It
> doesn't say anything about the presence or absence of I/O APICs; conflating the
> two notions doesn't seem like a good idea.  I'm clearing cpu_has_apic to
> indicate this specific fact: the CPU has no usable local APIC, and there's no
> point pretending it does - but that doesn't mean the I/O APICs aren't
> functional.

If you don't have a local apic you can't receive apic messages.

The Xen ``whacky apic'' model is different.  

>>    We already have two local apic variants after all so a 3rd should not be too nasty.
>>    
> We currently avoid any need to have, or pretend to have, a local APIC by taking
> control of the interrupt delivery subsystem at the irq_chip level.  I don't
> think there's much to be gained by adding a Xen-specific lapic abstraction for
> this case.

To work with Xen dom0 we need to double the number of ioapic routing models.  This
is code that is tricky enough we are still struggling to get clean and sane without
Xen complications that violate some of the most basic assumptions in the code.

>> Except the Xen appears to have totally moved the responsibility around
>> in ways that over constrain the problem by taking, making the
>> existing code useless.
>>    
> I don't think that's true at at all.  The split is along hardware lines, and so
> puts the same constraints on kernel development that the hardware does.

The split is right in the middle of the irq handling.  It is happening in the middle
of deep architecture code.  So far voyager sounds easier to fit into the current
model than Xen.

>> Please put the Xen dom0 insanity somewhere off in a corner where the rest
>> of x86 can ignore it.
>>    
>
> Yep, trying to.

Then please don't hack up the normal x86 irq handling code.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-15 21:58           ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-15 21:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/15/09 03:47, Eric W. Biederman wrote:
>> For code reuse and maintainability that is a horrible separation of
>> responsibility.  Things looks similar to the existing cases until you
>> get up close and you discover all of the fundamental assumptions are
>> different so none of the existing code actually works unmodified.
>>    
>
> The I/O APIC code is used exactly as normal, routing from device->pin->vector;
> the whole interrupt emission path is unchanged.

But the algorithms that set it up are now useless.  Which implies most of the
current control logic is now useless.

> The local APIC code doesn't get used at all, because we have a different
> interrupt catcher operating at the irq_chip level.

Sure.  Which pretty much implies that all of io_apic.c and the setting up
of the interrupt controllers has to be replaced as well.

> In terms of system architecture its a reasonable place to make the split; the
> local APICs and I/O APICs are distinct entities which communicate via fairly
> well-defined path.  Xen puts the hypervisor/control domain split at the same
> place.  This is mainly because Xen itself cares about managing CPUs (and
> memory), but doesn't really care about the rest of the system hardware much - it
> leaves that up to the control domain.

Xen doesn't want to deal so it leaves the rest for us.  Making it so neither
kernel has full control of the irqs.  Neither has enough control to flexibly
set things up.

>> The only clean way I can see to handle this is to make xen dom0 it's own
>> weird separate subarch that does all of the table parsing of the
>> firmware tables in completely separate code.  Then once we have something
>> that works factoring out the commonalities into a helper library for
>> better long term maintenance.
>>    
> That seems like overkill.  We can get things working under Xen with 3 changes:

All of the subtle assumptions sound like they come out differently.  Which means
you can very easily start down the road of just reusing small bits and then
you find so many assumptions are different you have to scrap/replace or gunk
up with if (xen) tests.

>   1. make sure I/O APICs are discovered via ACPI properly (or MPTABLE
>      if ACPI isn't present)
>   2. get Xen to allocate a vector and bind that vector to an event channel

The x86 code doesn't allocate vectors.  It allocates a vector that is valid
on a single cpu, or a vector that is valid on all cpus.  Vectors are managed
as a per cpu resource and that is ultimately required to scale the number
of interrupts supported.

If Xen does not manage vectors as a per cpu resource, that fundamentally
breaks one of the assumptions in the code.

>   3. make sure I/O APIC register writes get to the appropriate I/O APIC
>      in hardware (the normal pin->vector routing)
>
> These points already have fairly well-defined interfaces; there are no subtle
> interactions with the core of the APIC code.
>
> This patch achieves the first of these, in a fairly minimal way.  I'm still
> investigating better ways of achieving 2 & 3.

As I understand Xen so far I still disagree.

>> As it stands right now what Xen wants and what we need to do for normal
>> hardware are radically different, to the point of painful.  Things like
>> irq migration, and cpu hotplug require completely different algorithms.
>>    
>
> The control domain, being a virtual machine, has no access or visibility of
> physical CPUs in the system; all its CPUs are virtual (this is why a "local
> APIC" doesn't make much sense for it, since they're an inherent property of a
> physical CPU, and are not virtualized).
>
> The hypervisor is responsible for all management of physical CPUs, and is
> therefore responsible for physical-CPU things like hotplug and interrupt
> migration.  The kernel doesn't need new algorithms to handle these because it
> simply doesn't know or care about them.

So Xen calls the linux kernel when it wants to migrate an irq from one cpu
to another?

If Xen is calling us back when it wants to migrate an irq from one cpu to another
that implies we need knowledge of irqs.

> As far as the kernel is concerned, the interrupts look like events on event
> channels, like IPIs, timers, etc, and can be handled accordingly.  The irq_chip
> machinery is already in place for them.

If the kernel is responsible for programming the ioapics it definitely cares
about migration, and cpu hotplug.  All of those require reprogramming the
ioapics.

>> I think Xen dom0 has picked the wrong abstraction for this one.  There
>> seems to be no gain and a lot of pain asking the slave kernel to
>> program the ioapics for it, when Xen presents a wildly different
>> abstraction at the cpu level.
>>    
>
> Well, the bulk of the code is already present.  We avoid the local APIC part of
> the kernel completely, by installing a new irq_chip to handle incoming
> interrupts and deliver them into the core interrupt handling accordingly.  The
> control domain patches simply add the ability to bind a hardware-originated
> interrupt to an event channel to be delivered via this mechanism.
>
> And, as Xen contains no device drivers or real hardware knowledge of busses,
> interrupt routing, etc, it falls to the control domain to work out those
> aspects.  The I/O APIC side of the setup is the same as it would be in the
> native case (program a vector corresponding to a pin on an I/O APIC).

There is a lot more going on than that.  You need to know logical vs physical.
How many bits of cpu id you have.  

Which mode you program the ioapics in, logical vs physical (and the variants
of those), is directly tied to how many cpus you have, and how that box
is arranged.

>> If what xen was provided looked like an ioapic semantically I would
>> suggest setting cpu_has_apic in a different fashion.
>
> cpu_has_apic has the specific meaning of "this CPU has a local APIC".  It
> doesn't say anything about the presence or absence of I/O APICs; conflating the
> two notions doesn't seem like a good idea.  I'm clearing cpu_has_apic to
> indicate this specific fact: the CPU has no usable local APIC, and there's no
> point pretending it does - but that doesn't mean the I/O APICs aren't
> functional.

If you don't have a local apic you can't receive apic messages.

The Xen ``whacky apic'' model is different.  

>>    We already have two local apic variants after all so a 3rd should not be too nasty.
>>    
> We currently avoid any need to have, or pretend to have, a local APIC by taking
> control of the interrupt delivery subsystem at the irq_chip level.  I don't
> think there's much to be gained by adding a Xen-specific lapic abstraction for
> this case.

To work with Xen dom0 we need to double the number of ioapic routing models.  This
is code that is tricky enough we are still struggling to get clean and sane without
Xen complications that violate some of the most basic assumptions in the code.

>> Except the Xen appears to have totally moved the responsibility around
>> in ways that over constrain the problem by taking, making the
>> existing code useless.
>>    
> I don't think that's true at at all.  The split is along hardware lines, and so
> puts the same constraints on kernel development that the hardware does.

The split is right in the middle of the irq handling.  It is happening in the middle
of deep architecture code.  So far voyager sounds easier to fit into the current
model than Xen.

>> Please put the Xen dom0 insanity somewhere off in a corner where the rest
>> of x86 can ignore it.
>>    
>
> Yep, trying to.

Then please don't hack up the normal x86 irq handling code.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-15 21:58           ` Eric W. Biederman
@ 2009-06-16 19:38             ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-16 19:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/15/09 14:58, Eric W. Biederman wrote:
>>> The only clean way I can see to handle this is to make xen dom0 its own
>>> weird separate subarch that does all of the table parsing of the
>>> firmware tables in completely separate code.  Then once we have something
>>> that works factoring out the commonalities into a helper library for
>>> better long term maintenance.
>>>
>>>        
>> That seems like overkill.  We can get things working under Xen with 3 changes:
>>      
>
> All of the subtle assumptions sound like they come out differently.  Which means
> you can very easily start down the road of just reusing small bits and then
> you find so many assumptions are different you have to scrap/replace or gunk
> up with if (xen) tests.
>    

I think we're getting off into the weeds a bit here.  I'm looking at 
other options of how to fit Xen interrupt handling into the kernel in a 
clean way; we may end up with a different model from the previous patch 
postings (not this particular one under discussion; the ones from last 
month).  We can reopen this discussion when I post those patches.

However, the kernel will still need information about the I/O APICs from 
ACPI so that it can perform basic interrupt routing for PCI devices (ie, 
regardless of how the interrupt gets delivered, and who programs the 
APIC hardware, we still need the basic information of "what io apic+pin 
is this PCI device connected to?").  This particular patch is my attempt 
to achieve this.

> To be very clear.  We have mechanism and policy mixed in the mptable
> and related code today.  While we continue to have that mixed I think
> even attempting to reuse it for Xen dom0 is a horrifically bad move.
>
> Nacked-by: "Eric W. Biederman"<ebiederm@xmission.com>
>    

The only effect of this patch is to parse the I/O APIC parts of the MADT 
even if it skips the local APIC parts; it causes no change in behaviour 
in normal circumstances (unless you actually have a physical machine 
with ACPI and I/O APICs but CPUs with no local APICs, which I guess is 
possible in principle).

Can you give an example of how mechanism and policy are mixed?  In what 
ways could it break?  Would you agree to a patch which attempts to 
decouple policy and mechanism to solve these problems?

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-16 19:38             ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-16 19:38 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

On 06/15/09 14:58, Eric W. Biederman wrote:
>>> The only clean way I can see to handle this is to make xen dom0 its own
>>> weird separate subarch that does all of the table parsing of the
>>> firmware tables in completely separate code.  Then once we have something
>>> that works factoring out the commonalities into a helper library for
>>> better long term maintenance.
>>>
>>>        
>> That seems like overkill.  We can get things working under Xen with 3 changes:
>>      
>
> All of the subtle assumptions sound like they come out differently.  Which means
> you can very easily start down the road of just reusing small bits and then
> you find so many assumptions are different you have to scrap/replace or gunk
> up with if (xen) tests.
>    

I think we're getting off into the weeds a bit here.  I'm looking at 
other options of how to fit Xen interrupt handling into the kernel in a 
clean way; we may end up with a different model from the previous patch 
postings (not this particular one under discussion; the ones from last 
month).  We can reopen this discussion when I post those patches.

However, the kernel will still need information about the I/O APICs from 
ACPI so that it can perform basic interrupt routing for PCI devices (ie, 
regardless of how the interrupt gets delivered, and who programs the 
APIC hardware, we still need the basic information of "what io apic+pin 
is this PCI device connected to?").  This particular patch is my attempt 
to achieve this.

> To be very clear.  We have mechanism and policy mixed in the mptable
> and related code today.  While we continue to have that mixed I think
> even attempting to reuse it for Xen dom0 is a horrifically bad move.
>
> Nacked-by: "Eric W. Biederman"<ebiederm@xmission.com>
>    

The only effect of this patch is to parse the I/O APIC parts of the MADT 
even if it skips the local APIC parts; it causes no change in behaviour 
in normal circumstances (unless you actually have a physical machine 
with ACPI and I/O APICs but CPUs with no local APICs, which I guess is 
possible in principle).

Can you give an example of how mechanism and policy are mixed?  In what 
ways could it break?  Would you agree to a patch which attempts to 
decouple policy and mechanism to solve these problems?

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-16 19:38             ` Jeremy Fitzhardinge
@ 2009-06-17  5:10               ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-17  5:10 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> The only effect of this patch is to parse the I/O APIC parts of the MADT even if
> it skips the local APIC parts; it causes no change in behaviour in normal
> circumstances (unless you actually have a physical machine with ACPI and I/O
> APICs but CPUs with no local APICs, which I guess is possible in principle).

You allow getting to places like apic->setup_apic_routing without going
through prerequisites like generic_bigsmp_probe().

> Can you give an example of how mechanism and policy are mixed?  In what ways
> could it break?  Would you agree to a patch which attempts to decouple policy
> and mechanism to solve these problems?

I would agree with a patch that decouples the parts you need.  Something
that makes it possible to call acpi_parse_madt_lapic_entries without
calling the rest of the code sounds reasonable.

Given that ia64 already has a separate path calling into acpi I'm not
certain there is much truly useful code that can be shared.  Getting
the BIOS bug workarounds seems reasonable.

It would be good to see at least a rough draft of where you are going.  So
the whole picture can be clear.

Right now, I don't think there is anything in
arch/x86/kernel/apic/* or arch/x86/kernel/smpboot.c that is usable for xen.

As for mixing mechanism and policy besides the cpu_has_apic tests we
have generic_bigsmp_probe, the calling of apic_setup_apic_routing.
The code that depends on the CONFIG_X86_LOCAL_APIC define.

There are also deep assumptions in the code like default_setup_apic_routing.
That tests the number of local apics and uses that to decide on how to setup
the ioapics.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-17  5:10               ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-17  5:10 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> The only effect of this patch is to parse the I/O APIC parts of the MADT even if
> it skips the local APIC parts; it causes no change in behaviour in normal
> circumstances (unless you actually have a physical machine with ACPI and I/O
> APICs but CPUs with no local APICs, which I guess is possible in principle).

You allow getting to places like apic->setup_apic_routing without going
through prerequisites like generic_bigsmp_probe().

> Can you give an example of how mechanism and policy are mixed?  In what ways
> could it break?  Would you agree to a patch which attempts to decouple policy
> and mechanism to solve these problems?

I would agree with a patch that decouples the parts you need.  Something
that makes it possible to call acpi_parse_madt_lapic_entries without
calling the rest of the code sounds reasonable.

Given that ia64 already has a separate path calling into acpi I'm not
certain there is much truly useful code that can be shared.  Getting
the BIOS bug workarounds seems reasonable.

It would be good to see at least a rough draft of where you are going.  So
the whole picture can be clear.

Right now, I don't think there is anything in
arch/x86/kernel/apic/* or arch/x86/kernel/smpboot.c that is usable for xen.

As for mixing mechanism and policy besides the cpu_has_apic tests we
have generic_bigsmp_probe, the calling of apic_setup_apic_routing.
The code that depends on the CONFIG_X86_LOCAL_APIC define.

There are also deep assumptions in the code like default_setup_apic_routing.
That tests the number of local apics and uses that to decide on how to setup
the ioapics.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-16 19:38             ` Jeremy Fitzhardinge
@ 2009-06-17 12:02               ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-17 12:02 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> I think we're getting off into the weeds a bit here.  I'm looking at other
> options of how to fit Xen interrupt handling into the kernel in a clean way; we
> may end up with a different model from the previous patch postings (not this
> particular one under discussion; the ones from last month).  We can reopen this
> discussion when I post those patches.
>
> However, the kernel will still need information about the I/O APICs from ACPI so
> that it can perform basic interrupt routing for PCI devices (ie, regardless of
> how the interrupt gets delivered, and who programs the APIC hardware, we still
> need the basic information of "what io apic+pin is this PCI device connected
> to?").  This particular patch is my attempt to achieve this.

Trying to understand what is going on I just read through Xen 3.4 and the
accompanying 2.6.18 kernel source.

Xen has a horrible api with respect to io_apics.  They aren't even real
io_apics when Xen is done ``abstracting'' them.

Xen gives us the vector to write.  But we get to assign that
vector arbitrarily to an ioapic and vector.

We are required to use a hypercall when performing the write.
Xen overrides the delivery_mode and destination, and occasionally
the mask bit.

We still have to handle polarity and the trigger mode.  Despite
the fact that Xen has acpi and mp tables parsers of its own.

I expect it would have been easier and simpler all around if there
was just a map_gsi event channel hypercall.  But Xen has an abi
and an existing set of calls so could-have-beens aren't worth worrying about
much.

Xen's ioapic affinity management logic looks like it only works
on sunny days if you don't stress it too hard.  Of course the hard
part of driving the hardware Xen doesn't want to share.

It looks like the only thing Xen gains by pushing out the work of
setting the polarity and setting edge/level triggering is our database
of motherboards which get those things wrong.

So I expect the thing to do is factor out acpi_parse_ioapic,
mp_register_ioapic so we can share information on borked BIOS's
between the Xen dom0 port and otherwise push Xen pseudo apic handling
off into its strange little corner.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-17 12:02               ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-17 12:02 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> I think we're getting off into the weeds a bit here.  I'm looking at other
> options of how to fit Xen interrupt handling into the kernel in a clean way; we
> may end up with a different model from the previous patch postings (not this
> particular one under discussion; the ones from last month).  We can reopen this
> discussion when I post those patches.
>
> However, the kernel will still need information about the I/O APICs from ACPI so
> that it can perform basic interrupt routing for PCI devices (ie, regardless of
> how the interrupt gets delivered, and who programs the APIC hardware, we still
> need the basic information of "what io apic+pin is this PCI device connected
> to?").  This particular patch is my attempt to achieve this.

Trying to understand what is going on I just read through Xen 3.4 and the
accompanying 2.6.18 kernel source.

Xen has a horrible api with respect to io_apics.  They aren't even real
io_apics when Xen is done ``abstracting'' them.

Xen gives us the vector to write.  But we get to assign that
vector arbitrarily to an ioapic and vector.

We are required to use a hypercall when performing the write.
Xen overrides the delivery_mode and destination, and occasionally
the mask bit.

We still have to handle polarity and the trigger mode.  Despite
the fact that Xen has acpi and mp tables parsers of its own.

I expect it would have been easier and simpler all around if there
was just a map_gsi event channel hypercall.  But Xen has an abi
and an existing set of calls so could-have-beens aren't worth worrying about
much.

Xen's ioapic affinity management logic looks like it only works
on sunny days if you don't stress it too hard.  Of course the hard
part of driving the hardware Xen doesn't want to share.

It looks like the only thing Xen gains by pushing out the work of
setting the polarity and setting edge/level triggering is our database
of motherboards which get those things wrong.

So I expect the thing to do is factor out acpi_parse_ioapic,
mp_register_ioapic so we can share information on borked BIOS's
between the Xen dom0 port and otherwise push Xen pseudo apic handling
off into its strange little corner.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-17 12:02               ` Eric W. Biederman
@ 2009-06-17 17:32                 ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-17 17:32 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

On 06/17/09 05:02, Eric W. Biederman wrote:
> Trying to understand what is going on I just read through Xen 3.4 and the
> accompanying 2.6.18 kernel source.
>    

Thanks very much for spending time on this.  I really appreciate it.

> Xen has a horrible api with respect to io_apics.  They aren't even real
> io_apics when Xen is done ``abstracting'' them.
>
> Xen gives us the vector to write.  But we get to assign that
> vector arbitrarily to an ioapic and vector.
>
> We are required to use a hypercall when performing the write.
> Xen overrides the delivery_mode and destination, and occasionally
> the mask bit.
>    

Yes, it's a bit mad.  All those writes are really conveying is the 
vector, and Xen gave that to us in the first place.

> We still have to handle polarity and the trigger mode.  Despite
> the fact that Xen has acpi and mp tables parsers of it's own.
>
> I expect it would have been easier and simpler all around if there
> was just a map_gsi event channel hypercall.  But Xen has an abi
> and an existing set of calls so could aren't worth worrying about
> much.
>    

Actually I was discussing this with Keir yesterday.  We're definitely 
open to changing the dom0 API to make things simpler on the Linux side.  
(The dom0 ABI is more fluid than the domU one, and these changes would 
be backwards-compatible anyway.)

One of the options we discussed was changing the API to get rid of the 
exposed vector, and just replace it with an operation to directly bind a 
gsi to a pirq (internal Xen physical interrupt handle, if you will), so 
that Xen ends up doing all the I/O APIC programming internally, as well 
as the local APIC.

On the Linux side, I think it means we can just point 
pcibios_enable/disable_irq to our own xen_pci_irq_enable/disable 
functions to create the binding between a PCI device and an irq.

I haven't prototyped this yet, or even looked into it very closely, but 
it seems like a promising approach to avoid almost all interaction with 
the apic layer of the kernel.  xen_pci_irq_enable() would have to make 
its own calls to acpi_pci_irq_lookup() to map pci_dev+pin -> gsi, so we 
would still need to make sure ACPI is up to that job.

> Xen's ioapic affinity management logic looks like it only works
> on sunny days if you don't stress it too hard.
Could you be a bit more specific?  Are you referring to problems that 
you've fixed in the kernel which are still present in Xen?

>    Of course the hard
> part Xen of driving the hardware Xen doesn't want to share.
>    

Yes; it has to handle everything relating to physical CPUs, as the 
kernel only has virtual CPUs.

> It looks like the only thing Xen gains by pushing out the work of
> setting the polarity and setting edge/level triggering is our database
> of motherboards which get those things wrong.
>    

Avoiding duplication of effort is a non-trivial benefit.

> So I expect the thing to do is factor out acpi_parse_ioapic,
> mp_register_ioapic so we can share information on borked BIOS's
> between the Xen dom0 port and otherwise push Xen pseudo apic handling
> off into it's strange little corner.

Yes, that's what I'll look into.

     J


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-17 17:32                 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-17 17:32 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner

On 06/17/09 05:02, Eric W. Biederman wrote:
> Trying to understand what is going on I just read through Xen 3.4 and the
> accompanying 2.6.18 kernel source.
>    

Thanks very much for spending time on this.  I really appreciate it.

> Xen has a horrible api with respect to io_apics.  They aren't even real
> io_apics when Xen is done ``abstracting'' them.
>
> Xen gives us the vector to write.  But we get to assign that
> vector arbitrarily to an ioapic and vector.
>
> We are required to use a hypercall when performing the write.
> Xen overrides the delivery_mode and destination, and occasionally
> the mask bit.
>    

Yes, it's a bit mad.  All those writes are really conveying is the 
vector, and Xen gave that to us in the first place.

> We still have to handle polarity and the trigger mode.  Despite
> the fact that Xen has acpi and mp tables parsers of it's own.
>
> I expect it would have been easier and simpler all around if there
> was just a map_gsi event channel hypercall.  But Xen has an abi
> and an existing set of calls so could aren't worth worrying about
> much.
>    

Actually I was discussing this with Keir yesterday.  We're definitely 
open to changing the dom0 API to make things simpler on the Linux side.  
(The dom0 ABI is more fluid than the domU one, and these changes would 
be backwards-compatible anyway.)

One of the options we discussed was changing the API to get rid of the 
exposed vector, and just replace it with an operation to directly bind a 
gsi to a pirq (internal Xen physical interrupt handle, if you will), so 
that Xen ends up doing all the I/O APIC programming internally, as well 
as the local APIC.

On the Linux side, I think it means we can just point 
pcibios_enable/disable_irq to our own xen_pci_irq_enable/disable 
functions to create the binding between a PCI device and an irq.

I haven't prototyped this yet, or even looked into it very closely, but 
it seems like a promising approach to avoid almost all interaction with 
the apic layer of the kernel.  xen_pci_irq_enable() would have to make 
its own calls to acpi_pci_irq_lookup() to map pci_dev+pin -> gsi, so we 
would still need to make sure ACPI is up to that job.

> Xen's ioapic affinity management logic looks like it only works
> on sunny days if you don't stress it too hard.
Could you be a bit more specific?  Are you referring to problems that 
you've fixed in the kernel which are still present in Xen?

>    Of course the hard
> part Xen of driving the hardware Xen doesn't want to share.
>    

Yes; it has to handle everything relating to physical CPUs, as the 
kernel only has virtual CPUs.

> It looks like the only thing Xen gains by pushing out the work of
> setting the polarity and setting edge/level triggering is our database
> of motherboards which get those things wrong.
>    

Avoiding duplication of effort is a non-trivial benefit.

> So I expect the thing to do is factor out acpi_parse_ioapic,
> mp_register_ioapic so we can share information on borked BIOS's
> between the Xen dom0 port and otherwise push Xen pseudo apic handling
> off into it's strange little corner.

Yes, that's what I'll look into.

     J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-17 17:32                 ` Jeremy Fitzhardinge
@ 2009-06-18  2:58                   ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-18  2:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/17/09 05:02, Eric W. Biederman wrote:
>> Trying to understand what is going on I just read through Xen 3.4 and the
>> accompanying 2.6.18 kernel source.
>>    
>
> Thanks very much for spending time on this.  I really appreciate it.
>
>> Xen has a horrible api with respect to io_apics.  They aren't even real
>> io_apics when Xen is done ``abstracting'' them.
>>
>> Xen gives us the vector to write.  But we get to assign that
>> vector arbitrarily to an ioapic and vector.
>>
>> We are required to use a hypercall when performing the write.
>> Xen overrides the delivery_mode and destination, and occasionally
>> the mask bit.
>>    
>
> Yes, it's a bit mad.  All those writes are really conveying is the vector, and
> Xen gave that to us in the first place.

Pretty much.  After seeing the pirq to event channel binding I had to hunt
like mad to figure out why you needed anything else.

>> We still have to handle polarity and the trigger mode.  Despite
>> the fact that Xen has acpi and mp tables parsers of it's own.
>>
>> I expect it would have been easier and simpler all around if there
>> was just a map_gsi event channel hypercall.  But Xen has an abi
>> and an existing set of calls so could aren't worth worrying about
>> much.
>>    
>
> Actually I was discussing this with Keir yesterday.  We're definitely open to
> changing the dom0 API to make things simpler on the Linux side.  (The dom0 ABI
> is more fluid than the domU one, and these changes would be backwards-compatible
> anyway.)
>
> One of the options we discussed was changing the API to get rid of the exposed
> vector, and just replace it with an operation to directly bind a gsi to a pirq
> (internal Xen physical interrupt handle, if you will), so that Xen ends up doing
> all the I/O APIC programming internally, as well as the local APIC.

As an abstraction layer I think that will work out a lot better long term.

Given what iommus do with irqs and DMA I expect you want something like
that, that can be used from domU.  Then you just make allowing the
operation conditional on if you happen to have the associated hardware
mapped into your domain.

> On the Linux side, I think it means we can just point pcibios_enable/disable_irq
> to our own xen_pci_irq_enable/disable functions to create the binding between a
> PCI device and an irq.

If you want xen to assign the linux irq number that is absolutely the proper place
to hook.

> I haven't prototyped this yet, or even looked into it very closely, but it seems
> like a promising approach to avoid almost all interaction with the apic layer of
> the kernel.  xen_pci_irq_enable() would have to make its own calls
> acpi_pci_irq_lookup() to map pci_dev+pin -> gsi, so we would still need to make
> sure ACPI is up to that job.
>
>> Xen's ioapic affinity management logic looks like it only works
>> on sunny days if you don't stress it too hard.
> Could you be a bit more specific?  Are you referring to problems that you've
> fixed in the kernel which are still present in Xen?

Problems I have avoided.

When I was messing with the irq code I did not recall finding many
cases where migrating irqs from process context worked without hitting
hardware bugs.  ioapic state machine lockups and the like.

I currently make that problem harder on myself by not allocating vectors
globally, but it gives an irq architecture that should work for however
much I/O we have in the future.  

The one case that it is most likely to work is lowest priority interrupt
delivery where the hardware decides which cpu it should go to and it only
takes a single register write to change the cpu mask, and the common case
in Xen.

When you start directing irqs at specific cpus things get a lot easier
to break.

>> It looks like the only thing Xen gains by pushing out the work of
>> setting the polarity and setting edge/level triggering is our database
>> of motherboards which get those things wrong.
>>    
>
> Avoiding duplication of effort is a non-trivial benefit.
>
>> So I expect the thing to do is factor out acpi_parse_ioapic,
>> mp_register_ioapic so we can share information on borked BIOS's
>> between the Xen dom0 port and otherwise push Xen pseudo apic handling
>> off into it's strange little corner.
>
> Yes, that's what I'll look into.

How does Xen handle domU with hardware directly mapped?

Temporarily ignoring what we have to do to work with Xen 3.4.  I'm curious
if we could make the Xen dom0 irq case the same as the Xen domU case.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-18  2:58                   ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-18  2:58 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/17/09 05:02, Eric W. Biederman wrote:
>> Trying to understand what is going on I just read through Xen 3.4 and the
>> accompanying 2.6.18 kernel source.
>>    
>
> Thanks very much for spending time on this.  I really appreciate it.
>
>> Xen has a horrible api with respect to io_apics.  They aren't even real
>> io_apics when Xen is done ``abstracting'' them.
>>
>> Xen gives us the vector to write.  But we get to assign that
>> vector arbitrarily to an ioapic and vector.
>>
>> We are required to use a hypercall when performing the write.
>> Xen overrides the delivery_mode and destination, and occasionally
>> the mask bit.
>>    
>
> Yes, it's a bit mad.  All those writes are really conveying is the vector, and
> Xen gave that to us in the first place.

Pretty much.  After seeing the pirq to event channel binding I had to hunt
like mad to figure out why you needed anything else.

>> We still have to handle polarity and the trigger mode.  Despite
>> the fact that Xen has acpi and mp tables parsers of it's own.
>>
>> I expect it would have been easier and simpler all around if there
>> was just a map_gsi event channel hypercall.  But Xen has an abi
>> and an existing set of calls so could aren't worth worrying about
>> much.
>>    
>
> Actually I was discussing this with Keir yesterday.  We're definitely open to
> changing the dom0 API to make things simpler on the Linux side.  (The dom0 ABI
> is more fluid than the domU one, and these changes would be backwards-compatible
> anyway.)
>
> One of the options we discussed was changing the API to get rid of the exposed
> vector, and just replace it with an operation to directly bind a gsi to a pirq
> (internal Xen physical interrupt handle, if you will), so that Xen ends up doing
> all the I/O APIC programming internally, as well as the local APIC.

As an abstraction layer I think that will work out a lot better long term.

Given what iommus do with irqs and DMA I expect you want something like
that, that can be used from domU.  Then you just make allowing the
operation conditional on if you happen to have the associated hardware
mapped into your domain.

> On the Linux side, I think it means we can just point pcibios_enable/disable_irq
> to our own xen_pci_irq_enable/disable functions to create the binding between a
> PCI device and an irq.

If you want xen to assign the linux irq number that is absolutely the proper place
to hook.

> I haven't prototyped this yet, or even looked into it very closely, but it seems
> like a promising approach to avoid almost all interaction with the apic layer of
> the kernel.  xen_pci_irq_enable() would have to make its own calls
> acpi_pci_irq_lookup() to map pci_dev+pin -> gsi, so we would still need to make
> sure ACPI is up to that job.
>
>> Xen's ioapic affinity management logic looks like it only works
>> on sunny days if you don't stress it too hard.
> Could you be a bit more specific?  Are you referring to problems that you've
> fixed in the kernel which are still present in Xen?

Problems I have avoided.

When I was messing with the irq code I did not recall finding many
cases where migrating irqs from process context worked without hitting
hardware bugs.  ioapic state machine lockups and the like.

I currently make that problem harder on myself by not allocating vectors
globally, but it gives an irq architecture that should work for however
much I/O we have in the future.  

The one case that it is most likely to work is lowest priority interrupt
delivery where the hardware decides which cpu it should go to and it only
takes a single register write to change the cpu mask, and the common case
in Xen.

When you start directing irqs at specific cpus things get a lot easier
to break.

>> It looks like the only thing Xen gains by pushing out the work of
>> setting the polarity and setting edge/level triggering is our database
>> of motherboards which get those things wrong.
>>    
>
> Avoiding duplication of effort is a non-trivial benefit.
>
>> So I expect the thing to do is factor out acpi_parse_ioapic,
>> mp_register_ioapic so we can share information on borked BIOS's
>> between the Xen dom0 port and otherwise push Xen pseudo apic handling
>> off into it's strange little corner.
>
> Yes, that's what I'll look into.

How does Xen handle domU with hardware directly mapped?

Temporarily ignoring what we have to do to work with Xen 3.4.  I'm curious
if we could make the Xen dom0 irq case the same as the Xen domU case.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-17 17:32                 ` Jeremy Fitzhardinge
  (?)
  (?)
@ 2009-06-18 12:26                 ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-18 12:26 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Actually I was discussing this with Keir yesterday.  We're definitely open to
> changing the dom0 API to make things simpler on the Linux side.  (The dom0 ABI
> is more fluid than the domU one, and these changes would be backwards-compatible
> anyway.)
>
> One of the options we discussed was changing the API to get rid of the exposed
> vector, and just replace it with an operation to directly bind a gsi to a pirq
> (internal Xen physical interrupt handle, if you will), so that Xen ends up doing
> all the I/O APIC programming internally, as well as the local APIC.
>
> On the Linux side, I think it means we can just point pcibios_enable/disable_irq
> to our own xen_pci_irq_enable/disable functions to create the binding between a
> PCI device and an irq.

Then let's make this the plan.  Design a supportable dom0 <-> kernel
irq abi.  Essentially binding a gsi to an event channel mapping
function.  Get that into Xen.  Then get that into the mainstream linux
kernel.

Regardless of the upstream linux kernel merge status cleaning up the
irq handling is going to have to happen to move past 2.6.18.  I cleaned the
irq code up and changed it to work in incompatible ways starting in 2.6.19.

I really REALLY don't want to see support for Xen 3.4 domU irq
handling in the mainline linux kernel.  It is an evolutionary dead
end, and I have already ripped that code out of linux once.  Vectors
should be an implementation detail not an exposed part of the ABI.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-12 18:22 ` Jeremy Fitzhardinge
                   ` (4 preceding siblings ...)
  (?)
@ 2009-06-18 16:08 ` Len Brown
  2009-06-18 19:14     ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 79+ messages in thread
From: Len Brown @ 2009-06-18 16:08 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Eric W. Biederman,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

> In principle, the local APIC and the I/O APIC are distinct (but related)
> components, which can be independently present.

bzzzzt, but thanks for playing:-)

Nacked by: Len Brown <len.brown@intel.com>

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 16:08 ` Len Brown
@ 2009-06-18 19:14     ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 19:14 UTC (permalink / raw)
  To: Len Brown
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Eric W. Biederman,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/18/09 09:08, Len Brown wrote:
>> In principle, the local APIC and the I/O APIC are distinct (but related)
>> components, which can be independently present.
>>     
>
> bzzzzt, but thanks for playing:-)
>   

Perhaps I should have expressed that a bit more clearly:  you could, if
mad, build a machine with I/O APICs and some other mechanism for
delivering the interrupts to CPUs.  In practice, I doubt anyone ever
has, or ever would.

The only actual exception I know of is Xen's replacement of the physical
local APIC with a paravirtualized interrupt interface.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-18 19:14     ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 19:14 UTC (permalink / raw)
  To: Len Brown
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Eric W. Biederman, H. Peter Anvin, Thomas Gleixner

On 06/18/09 09:08, Len Brown wrote:
>> In principle, the local APIC and the I/O APIC are distinct (but related)
>> components, which can be independently present.
>>     
>
> bzzzzt, but thanks for playing:-)
>   

Perhaps I should have expressed that a bit more clearly:  you could, if
mad, build a machine with I/O APICs and some other mechanism for
delivering the interrupts to CPUs.  In practice, I doubt anyone ever
has, or ever would.

The only actual exception I know of is Xen's replacement of the physical
local APIC with a paravirtualized interrupt interface.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 19:14     ` Jeremy Fitzhardinge
  (?)
@ 2009-06-18 19:27     ` Eric W. Biederman
  2009-06-18 19:48         ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-18 19:27 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/18/09 09:08, Len Brown wrote:
>>> In principle, the local APIC and the I/O APIC are distinct (but related)
>>> components, which can be independently present.
>>>     
>>
>> bzzzzt, but thanks for playing:-)
>>   
>
> Perhaps I should have expressed that a bit more clearly:  you could, if
> mad, build a machine with I/O APICs and some other mechanism for
> delivering the interrupts to CPUs.  In practice, I doubt anyone ever
> has, or ever would.
>
> The only actual exception I know of is Xen's replacement of the physical
> local APIC with a paravirtualized interrupt interface.

No one ever has.  Xen doesn't have I/O APICs either.  Not in any real
sense.  Xen just has devices that look like I/O apics if you don't
look close.

Eric


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18  2:58                   ` Eric W. Biederman
@ 2009-06-18 19:34                     ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 19:34 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

On 06/17/09 19:58, Eric W. Biederman wrote:
>> One of the options we discussed was changing the API to get rid of the exposed
>> vector, and just replace it with an operation to directly bind a gsi to a pirq
>> (internal Xen physical interrupt handle, if you will), so that Xen ends up doing
>> all the I/O APIC programming internally, as well as the local APIC.
>>     
>
> As an abstraction layer I think that will work out a lot better long term.
>
> Given what iommus with irqs and DMA I expect you want something like
> that, that can be used from domU.  Then you just make allowing the
> operation conditional on if you happen to have the associated hardware
> mapped into your domain.
>   

A domU with a PCI passthrough device can bind a pirq to one of its event
channels.  All the gsi->pirq binding happens in dom0, but binding a pirq
to event channel can happen anywhere (that's why it doesn't bind gsi
directly to event channel, as they're strictly per-domain).

MSI interrupts also get bound to pirqs, so once the binding is created,
MSI and GSI interrupts can be treated identically (I think, I haven't
looked into the details yet).

>> On the Linux side, I think it means we can just point pcibios_enable/disable_irq
>> to our own xen_pci_irq_enable/disable functions to create the binding between a
>> PCI device and an irq.
>>     
>
> If you want xen to assign the linux irq number that is absolutely the properly place
> to hook.
>   

Yes.  We'd want to keep the irq==gsi mapping for non-MSI interrupts, but
that's easy enough to arrange.

> When I was messing with the irq code I did not recall finding many
> cases where migrating irqs from process context worked without hitting
> hardware bugs.  ioapic state machine lockups and the like.
>   

Keir mentioned that Xen avoids masking/unmasking interrupts in the I/O
APIC too much, because that has been problematic in the past.  Is that
related to the problems you're talking about?  Is there anywhere which
documents them?

> How does Xen handle domU with hardware directly mapped?
>   

We call that "pci passthrough".  Dom0 will bind the gsi to a pirq as
usual, and then pass the pirq through to the domU.  The domU will bind
the pirq to an event channel, which gets mapped to a Linux irq and
handled as usual.

> Temporally ignoring what we have to do to work with Xen 3.4.  I'm curious
> if we could make the Xen dom0 irq case the same as the Xen domU case.
>   

It is already; once the pirq is prepared, the process is the same in
both cases.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-18 19:34                     ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 19:34 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner

On 06/17/09 19:58, Eric W. Biederman wrote:
>> One of the options we discussed was changing the API to get rid of the exposed
>> vector, and just replace it with an operation to directly bind a gsi to a pirq
>> (internal Xen physical interrupt handle, if you will), so that Xen ends up doing
>> all the I/O APIC programming internally, as well as the local APIC.
>>     
>
> As an abstraction layer I think that will work out a lot better long term.
>
> Given what iommus with irqs and DMA I expect you want something like
> that, that can be used from domU.  Then you just make allowing the
> operation conditional on if you happen to have the associated hardware
> mapped into your domain.
>   

A domU with a PCI passthrough device can bind a pirq to one of its event
channels.  All the gsi->pirq binding happens in dom0, but binding a pirq
to event channel can happen anywhere (that's why it doesn't bind gsi
directly to event channel, as they're strictly per-domain).

MSI interrupts also get bound to pirqs, so once the binding is created,
MSI and GSI interrupts can be treated identically (I think, I haven't
looked into the details yet).

>> On the Linux side, I think it means we can just point pcibios_enable/disable_irq
>> to our own xen_pci_irq_enable/disable functions to create the binding between a
>> PCI device and an irq.
>>     
>
> If you want xen to assign the linux irq number that is absolutely the properly place
> to hook.
>   

Yes.  We'd want to keep the irq==gsi mapping for non-MSI interrupts, but
that's easy enough to arrange.

> When I was messing with the irq code I did not recall finding many
> cases where migrating irqs from process context worked without hitting
> hardware bugs.  ioapic state machine lockups and the like.
>   

Keir mentioned that Xen avoids masking/unmasking interrupts in the I/O
APIC too much, because that has been problematic in the past.  Is that
related to the problems you're talking about?  Is there anywhere which
documents them?

> How does Xen handle domU with hardware directly mapped?
>   

We call that "pci passthrough".  Dom0 will bind the gsi to a pirq as
usual, and then pass the pirq through to the domU.  The domU will bind
the pirq to an event channel, which gets mapped to a Linux irq and
handled as usual.

> Temporally ignoring what we have to do to work with Xen 3.4.  I'm curious
> if we could make the Xen dom0 irq case the same as the Xen domU case.
>   

It is already; once the pirq is prepared, the process is the same in
both cases.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 19:27     ` Eric W. Biederman
@ 2009-06-18 19:48         ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 19:48 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/18/09 12:27, Eric W. Biederman wrote:
>> The only actual exception I know of is Xen's replacement of the physical
>> local APIC with a paravirtualized interrupt interface.
>>     
>
> No one ever has.  Xen doesn't have I/O APICs either.  Not in any real
> sense.  Xen just has devices that looking like I/O apics if you don't
> look close.
>   

Well, if acpi_pci_irq_lookup() and friends return the right things
without having parsed the MADT and set up the secondary state, then we
should be fine either way.

acpi_irq_model gets tested in all sorts of random places, so I wonder if
we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
make things work properly.

Hm, in principle we just get the SCI gsi from the FADT, but there's all
that other mucking about with it in the MADT processing... Wonder what
needs to happen there...

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-18 19:48         ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 19:48 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Len Brown

On 06/18/09 12:27, Eric W. Biederman wrote:
>> The only actual exception I know of is Xen's replacement of the physical
>> local APIC with a paravirtualized interrupt interface.
>>     
>
> No one ever has.  Xen doesn't have I/O APICs either.  Not in any real
> sense.  Xen just has devices that looking like I/O apics if you don't
> look close.
>   

Well, if acpi_pci_irq_lookup() and friends return the right things
without having parsed the MADT and set up the secondary state, then we
should be fine either way.

acpi_irq_model gets tested in all sorts of random places, so I wonder if
we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
make things work properly.

Hm, in principle we just get the SCI gsi from the FADT, but there's all
that other mucking about with it in the MADT processing... Wonder what
needs to happen there...

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 19:34                     ` Jeremy Fitzhardinge
  (?)
@ 2009-06-18 20:28                     ` Eric W. Biederman
  2009-06-18 21:09                         ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-18 20:28 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/17/09 19:58, Eric W. Biederman wrote:
>>> One of the options we discussed was changing the API to get rid of the exposed
>>> vector, and just replace it with an operation to directly bind a gsi to a pirq
>>> (internal Xen physical interrupt handle, if you will), so that Xen ends up doing
>>> all the I/O APIC programming internally, as well as the local APIC.
>>>     
>>
>> As an abstraction layer I think that will work out a lot better long term.
>>
>> Given what iommus with irqs and DMA I expect you want something like
>> that, that can be used from domU.  Then you just make allowing the
>> operation conditional on if you happen to have the associated hardware
>> mapped into your domain.
>>   
>
> A domU with a PCI passthrough device can bind a pirq to one of its event
> channels.  All the gsi->pirq binding happens in dom0, but binding a pirq
> to event channel can happen anywhere (that's why it doesn't bind gsi
> directly to event channel, as they're strictly per-domain).
>
> MSI interrupts also get bound to pirqs, so once the binding is created,
> MSI and GSI interrupts can be treated identically (I think, I haven't
> looked into the details yet).
>
>>> On the Linux side, I think it means we can just point pcibios_enable/disable_irq
>>> to our own xen_pci_irq_enable/disable functions to create the binding between a
>>> PCI device and an irq.
>>>     
>>
>> If you want xen to assign the linux irq number that is absolutely the properly place
>> to hook.
>>   
>
> Yes.  We'd want to keep the irq==gsi mapping for non-MSI interrupts, but
> that's easy enough to arrange.
>
>> When I was messing with the irq code I did not recall finding many
>> cases where migrating irqs from process context worked without hitting
>> hardware bugs.  ioapic state machine lockups and the like.
>>   
>
> Keir mentioned that Xen avoids masking/unmasking interrupts in the I/O
> APIC too much, because that has been problematic in the past.  Is that
> related to the problems you're talking about?  Is there anywhere which
> documents them?

Not in great detail.  I have some comments in the code and some messages
on the mailing list.

What I know is that in linux the historical practice has always been
to migrate irqs in interrupt context and in testing I found I could
lock up ioapic state machines when I migrate interrupts from process
context enough.

It really cleans up the code not to migrate interrupts in the
interrupt handler.  So I spent a week or two on it.

>> How does Xen handle domU with hardware directly mapped?
>>   
>
> We call that "pci passthrough".  Dom0 will bind the gsi to a pirq as
> usual, and then pass the pirq through to the domU.  The domU will bind
> the pirq to an event channel, which gets mapped to a Linux irq and
> handled as usual.

Interesting.  How does domU find out the pirq -> pci device mapping?

>> Temporally ignoring what we have to do to work with Xen 3.4.  I'm curious
>> if we could make the Xen dom0 irq case the same as the Xen domU case.
>>   
>
> It is already; once the pirq is prepared, the process is the same in
> both cases.

I 3/4 believe that.  map_domain_pirq  appears to set up a per-domain
mapping between the hardware vector and the irq name it is known as.
So I don't see how that works for other domains.

msi is set up on a per-domain basis.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 19:48         ` Jeremy Fitzhardinge
  (?)
@ 2009-06-18 20:39         ` Eric W. Biederman
  2009-06-18 22:33             ` Jeremy Fitzhardinge
  -1 siblings, 1 reply; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-18 20:39 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/18/09 12:27, Eric W. Biederman wrote:
>>> The only actual exception I know of is Xen's replacement of the physical
>>> local APIC with a paravirtualized interrupt interface.
>>>     
>>
>> No one ever has.  Xen doesn't have I/O APICs either.  Not in any real
>> sense.  Xen just has devices that looking like I/O apics if you don't
>> look close.
>>   
>
> Well, if acpi_pci_irq_lookup() and friends return the right things
> without having parsed the MADT and set up the secondary state, then we
> should be fine either way.
>
> acpi_irq_model gets tested in all sorts of random places, so I wonder if
> we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
> make things work properly.

And this is where things get interesting.  Xen strictly speaking has
already made that decision.  Unless you support non APIC mode it
should always be ACPI_IRQ_MODEL_IOAPIC.

But Xen runs the hardware so Xen knows, and Xen should be running
all of the acpi and what not to make it happen.

> Hm, and principle we just get the SCI gsi from the FADT, but there's all
> that other mucking about with it in the MADT processing... Wonder what
> needs to happen there...

Good question.  What does the domU case do?

Eric


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 20:28                     ` Eric W. Biederman
@ 2009-06-18 21:09                         ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 21:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

On 06/18/09 13:28, Eric W. Biederman wrote:
>>> How does Xen handle domU with hardware directly mapped?
>>>   
>>>       
>> We call that "pci passthrough".  Dom0 will bind the gsi to a pirq as
>> usual, and then pass the pirq through to the domU.  The domU will bind
>> the pirq to an event channel, which gets mapped to a Linux irq and
>> handled as usual.
>>     
>
> Interesting.  How does domU find out the pirq -> pci device mapping?
>   

Hm, I haven't looked at it closely, but conventionally it would be via
xenbus (which is how all the split frontend-backend drivers communicate).

>> It is already; once the pirq is prepared, the process is the same in
>> both cases.
>>     
>
> I 3/4 believe that.  map_domain_pirq  appears to setup a per domain
> mapping between the hardware vector and the irq name it is known as.
> So I don't see how that works for other domains.
>
> msi is setup on a per domain basis.
>   

Ah, OK.  The pirq is set up for a specific domain rather than being
global (otherwise it would need some kind of "which domain can access
which pirq" table).  dom0 can either create a pirq for itself or someone
else, and the final user of the pirq binds it to a domain-local evtchn.

I think.  I really haven't looked into the pci-passthrough parts very
closely yet.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-18 21:09                         ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 21:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner

On 06/18/09 13:28, Eric W. Biederman wrote:
>>> How does Xen handle domU with hardware directly mapped?
>>>   
>>>       
>> We call that "pci passthrough".  Dom0 will bind the gsi to a pirq as
>> usual, and then pass the pirq through to the domU.  The domU will bind
>> the pirq to an event channel, which gets mapped to a Linux irq and
>> handled as usual.
>>     
>
> Interesting.  How does domU find out the pirq -> pci device mapping?
>   

Hm, I haven't looked at it closely, but conventionally it would be via
xenbus (which is how all the split frontend-backend drivers communicate).

>> It is already; once the pirq is prepared, the process is the same in
>> both cases.
>>     
>
> I 3/4 believe that.  map_domain_pirq  appears to setup a per domain
> mapping between the hardware vector and the irq name it is known as.
> So I don't see how that works for other domains.
>
> msi is setup on a per domain basis.
>   

Ah, OK.  The pirq is set up for a specific domain rather than being
global (otherwise it would need some kind of "which domain can access
which pirq" table).  dom0 can either create a pirq for itself or someone
else, and the final user of the pirq binds it to a domain-local evtchn.

I think.  I really haven't looked into the pci-passthrough parts very
closely yet.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 20:39         ` Eric W. Biederman
@ 2009-06-18 22:33             ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 22:33 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

On 06/18/09 13:39, Eric W. Biederman wrote:
>> Well, if acpi_pci_irq_lookup() and friends return the right things
>> without having parsed the MADT and set up the secondary state, then we
>> should be fine either way.
>>
>> acpi_irq_model gets tested in all sorts of random places, so I wonder if
>> we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
>> make things work properly.
>>     
>
> And this is where things get interesting.  Xen strictly speaking has
> already made that decision.  Unless you support non APIC mode it
> should always be ACPI_IRQ_MODEL_IOAPIC.
>   

We could decide by fiat to not support non-APIC machines (which is more
or less the case), but they do happen to work at the moment; Gerd even
has one and provided fixes to make it work.  (Ditto non-ACPI, though
they're commonly related.)

> But Xen runs the hardware so Xen knows, and Xen should be running
> all of the acpi and what not to make it happen.
>   

There are two separate issues:

   1. If we intercept interrupt routing at the pcibios_pci_irq_enable
      level, will anything in the kernel care about the state of its
      acpi_irq_model variable?  At first glance it *shouldn't* care,
      because it's just handing the whole problem off to Xen.
   2. Xen has no AML interpreter, so its use of ACPI is limited to
      parsing tables.  It looks like we'll need to set acpi_irq_model
      appropriately and then get acpi_bus_init_irq() to run.

(Related to this is making sure any chipset configuration that happens
in the depths of the DSDT does in fact happen.)

>> Hm, and principle we just get the SCI gsi from the FADT, but there's all
>> that other mucking about with it in the MADT processing... Wonder what
>> needs to happen there...
>>     
>
> Good question.  What does the domU case do?
>   

DomU doesn't know or care about ACPI at all.  There's no reason for it
to get any kind of ACPI event.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-18 22:33             ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-18 22:33 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Len Brown

On 06/18/09 13:39, Eric W. Biederman wrote:
>> Well, if acpi_pci_irq_lookup() and friends return the right things
>> without having parsed the MADT and set up the secondary state, then we
>> should be fine either way.
>>
>> acpi_irq_model gets tested in all sorts of random places, so I wonder if
>> we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
>> make things work properly.
>>     
>
> And this is where things get interesting.  Xen strictly speaking has
> already made that decision.  Unless you support non APIC mode it
> should always be ACPI_IRQ_MODEL_IOAPIC.
>   

We could decide by fiat to not support non-APIC machines (which is more
or less the case), but they do happen to work at the moment; Gerd even
has one and provided fixes to make it work.  (Ditto non-ACPI, though
they're commonly related.)

> But Xen runs the hardware so Xen knows, and Xen should be running
> all of the acpi and what not to make it happen.
>   

There are two separate issues:

   1. If we intercept interrupt routing at the pcibios_pci_irq_enable
      level, will anything in the kernel care about the state of its
      acpi_irq_model variable?  At first glance it *shouldn't* care,
      because it's just handing the whole problem off to Xen.
   2. Xen has no AML interpreter, so its use of ACPI is limited to
      parsing tables.  It looks like we'll need to set acpi_irq_model
      appropriately and then get acpi_bus_init_irq() to run.

(Related to this is making sure any chipset configuration that happens
in the depths of the DSDT does in fact happen.)

>> Hm, and principle we just get the SCI gsi from the FADT, but there's all
>> that other mucking about with it in the MADT processing... Wonder what
>> needs to happen there...
>>     
>
> Good question.  What does the domU case do?
>   

DomU doesn't know or care about ACPI at all.  There's no reason for it
to get any kind of ACPI event.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 19:14     ` Jeremy Fitzhardinge
  (?)
  (?)
@ 2009-06-18 22:51     ` Maciej W. Rozycki
  -1 siblings, 0 replies; 79+ messages in thread
From: Maciej W. Rozycki @ 2009-06-18 22:51 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	Eric W. Biederman, the arch/x86 maintainers,
	Linux Kernel Mailing List, Xen-devel

On Thu, 18 Jun 2009, Jeremy Fitzhardinge wrote:

> Perhaps I should have expressed that a bit more clearly:  you could, if
> mad, build a machine with I/O APICs and some other mechanism for
> delivering the interrupts to CPUs.  In practice, I doubt anyone ever
> has, or ever would.

 And people have done that (not for the x86 though) -- any machine with an 
HT bus will have I/O APIC interrupts as this is how the interconnect 
carries over interrupt messages from devices.  The root HT bridge forwards 
these messages to the local APIC -- this is similar to how MSI messages 
work; in fact if the upstream HT bridge was not a root bridge, but a 
PCI-HT bridge instead, then this is how these interrupt messages would 
have to be forwarded to the root PCI bridge.

 Now with a non-x86 machine there is not necessarily a real local APIC 
component -- this is the case for example with the Broadcom BCM1250A SOC 
based around a pair of MIPS64 processor cores.  Still this chip has to 
provide some logic to map HT interrupt messages to native interrupts and 
it is there, providing means for routing messages to the correct CPUs 
based on the destination and the destination mode and for the delivery 
mode and the vector (if applicable) to select the correct native interrupt 
source.  ExtINTA and EOI cycles have to be performed by software 
explicitly though, by poking at the right MMIO addresses which are not 
associated with the HT interrupt reception logic.  Not exactly an 
x86-style local APIC, but still an analogue.

 Just for the record.  I wholeheartedly agree with Eric that pretending there 
is no local APIC and fiddling with our fragile code which assumes 
otherwise is not the best thing to do.  The Broadcom platform mentioned 
does not reuse any piece of our x86 APIC code.

  Maciej

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 21:09                         ` Jeremy Fitzhardinge
@ 2009-06-19  1:38                           ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  1:38 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Keir Fraser

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Ah, OK.  The pirq is set up for a specific domain rather than being
> global (otherwise it would need some kind of "which domain can access
> which pirq" table).  dom0 can either create a pirq for itself or someone
> else, and the final user of the pirq binds it to a domain-local evtchn.
>
> I think.  I really haven't looked into the pci-passthrough parts very
> closely yet.

I certainly could not find the code that would let you setup a pirq for another
domain.  In fact the pirq code aka alloc_vectors appears to hard code dom0
in Xen 3.4.

pci-passthrough, since it is domU and since you describe it as well isolated and
comparatively simple, should be a shoo-in.

Further as you describe it pci-passthrough is a subset of what we have to do for dom0.
So if we can I would like to see the pci passthrough code get merged first.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19  1:38                           ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  1:38 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> Ah, OK.  The pirq is set up for a specific domain rather than being
> global (otherwise it would need some kind of "which domain can access
> which pirq" table).  dom0 can either create a pirq for itself or someone
> else, and the final user of the pirq binds it to a domain-local evtchn.
>
> I think.  I really haven't looked into the pci-passthrough parts very
> closely yet.

I certainly could not find the code that would let you setup a pirq for another
domain.  In fact the pirq code aka alloc_vectors appears to hard code dom0
in Xen 3.4.

pci-passthrough, since it is domU and since you describe it as well isolated and
comparatively simple, should be a shoo-in.

Further as you describe it pci-passthrough is a subset of what we have to do for dom0.
So if we can I would like to see the pci passthrough code get merged first.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-18 22:33             ` Jeremy Fitzhardinge
@ 2009-06-19  2:42               ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  2:42 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/18/09 13:39, Eric W. Biederman wrote:
>>> Well, if acpi_pci_irq_lookup() and friends return the right things
>>> without having parsed the MADT and set up the secondary state, then we
>>> should be fine either way.
>>>
>>> acpi_irq_model gets tested in all sorts of random places, so I wonder if
>>> we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
>>> make things work properly.
>>>     
>>
>> And this is where things get interesting.  Xen strictly speaking has
>> already made that decision.  Unless you support non APIC mode it
>> should always be ACPI_IRQ_MODEL_IOAPIC.
>>   
>
> We could decide by fiat to not support non-APIC machines (which is more
> or less the case), but they do happen to work at the moment; Gerd even
> has one and provided fixes to make it work.  (Ditto non-ACPI, though
> they're commonly related.)

Interesting.  Then we need to know what Xen has chosen to do.
This whole logic of having linux and Xen happen to choose the
same interrupt handling mode by coincidence that is in use
right now seems fragile.

Xen runs a timer so I can't see Xen actually leaving the decision
up to linux.  It certainly doesn't leave the physical versus logical
mode decision up to linux.

>> But Xen runs the hardware so Xen knows, and Xen should be running
>> all of the acpi and what not to make it happen.
>>   
>
> There are two separate issues:
>
>    1. If we intercept interrupt routing at the pcibios_pci_irq_enable
>       level, will anything in the kernel care about the state of its
>       acpi_irq_model variable?  At first glance it *shouldn't* care,
>       because its just handing the whole problem off to Xen.

>    2. Xen has no AML interpreter, so its use of ACPI is limited to
>       parsing tables.  It looks like we'll need to set acpi_irq_model
>       appropriately and then get acpi_bus_init_irq() to run.

Assuming Xen not having the AML interpreter is the right decision.
I recall from some of the hibernation discussions that ACPI is not an easy
thing to start/stop on a machine unless it is handled just so.

Which if dom0 is to be redundant/restartable seems to make the
argument for AML living in Xen.

Xen has everything except the AML interpreter.

> (Related to this is making sure any chipset configuration that happens
> in the depths of the DSDT does in fact happen.)
>
>>> Hm, and principle we just get the SCI gsi from the FADT, but there's all
>>> that other mucking about with it in the MADT processing... Wonder what
>>> needs to happen there...
>>>     
>>
>> Good question.  What does the domU case do?
>>   
>
> DomU doesn't know or care about ACPI at all.  There's no reason for it
> to get any kind of ACPI event.

Yet at least occasionally Xen fakes up ACPI tables for a guest.

DomU has to do something to get the mapping, which is what I thought you
were referring to.

I can believe a Xen dom0 that is the same as domU except with lots
more hardware.  I have trouble understanding a Xen dom0 that has to do
all kinds of special one-off hardware manipulation because Xen is too
lazy to.  Xen is already 300,000 lines of code so it's not like it is
fully comprehensible/maintainable by a single person anyway.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19  2:42               ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  2:42 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Len Brown

Jeremy Fitzhardinge <jeremy@goop.org> writes:

> On 06/18/09 13:39, Eric W. Biederman wrote:
>>> Well, if acpi_pci_irq_lookup() and friends return the right things
>>> without having parsed the MADT and set up the secondary state, then we
>>> should be fine either way.
>>>
>>> acpi_irq_model gets tested in all sorts of random places, so I wonder if
>>> we'll need to set it to ACPI_IRQ_MODEL_IOAPIC (or something else?) to
>>> make things work properly.
>>>     
>>
>> And this is where things get interesting.  Xen strictly speaking has
>> already made that decision.  Unless you support non APIC mode it
>> should always be ACPI_IRQ_MODEL_IOAPIC.
>>   
>
> We could decide by fiat to not support non-APIC machines (which is more
> or less the case), but they do happen to work at the moment; Gerd even
> has one and provided fixes to make it work.  (Ditto non-ACPI, though
> they're commonly related.)

Interesting.  Then we need to know what Xen has chosen to do.
This whole logic of having linux and Xen happen to choose the
same interrupt handling mode by coincidence that is in use
right now seems fragile.

Xen runs a timer so I can't see Xen actually leaving the decision
up to linux.  It certainly doesn't leave the physical versus logical
mode decision up to linux.

>> But Xen runs the hardware so Xen knows, and Xen should be running
>> all of the acpi and what not to make it happen.
>>   
>
> There are two separate issues:
>
>    1. If we intercept interrupt routing at the pcibios_pci_irq_enable
>       level, will anything in the kernel care about the state of its
>       acpi_irq_model variable?  At first glance it *shouldn't* care,
>       because its just handing the whole problem off to Xen.

>    2. Xen has no AML interpreter, so its use of ACPI is limited to
>       parsing tables.  It looks like we'll need to set acpi_irq_model
>       appropriately and then get acpi_bus_init_irq() to run.

Assuming Xen not having the AML interpreter is the right decision.
I recall from some of the hibernation discussions that ACPI is not an easy
thing to start/stop on a machine unless it is handled just so.

Which if dom0 is to be redundant/restartable seems to make the
argument for AML living in Xen.

Xen has everything except the AML interpreter.

> (Related to this is making sure any chipset configuration that happens
> in the depths of the DSDT does in fact happen.)
>
>>> Hm, and principle we just get the SCI gsi from the FADT, but there's all
>>> that other mucking about with it in the MADT processing... Wonder what
>>> needs to happen there...
>>>     
>>
>> Good question.  What does the domU case do?
>>   
>
> DomU doesn't know or care about ACPI at all.  There's no reason for it
> to get any kind of ACPI event.

Yet at least occasionally Xen fakes up ACPI tables for a guest.

DomU has to do something to get the mapping, which is what I thought you
were referring to.

I can believe a Xen dom0 that is the same as domU except with lots
more hardware.  I have trouble understanding a Xen dom0 that has to do
all kinds of special one-off hardware manipulation because Xen is too
lazy to.  Xen is already 300,000 lines of code so it's not like it is
fully comprehensible/maintainable by a single person anyway.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* RE: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-19  1:38                           ` Eric W. Biederman
@ 2009-06-19  3:10                             ` Jiang, Yunhong
  -1 siblings, 0 replies; 79+ messages in thread
From: Jiang, Yunhong @ 2009-06-19  3:10 UTC (permalink / raw)
  To: Eric W. Biederman, Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner



xen-devel-bounces@lists.xensource.com wrote:
> I/O APICs just because there's no local APIC
> 
> Jeremy Fitzhardinge <jeremy@goop.org> writes:
> 
>> Ah, OK.  The pirq is set up for a specific domain rather than being
>> global (otherwise it would need some kind of "which domain can access
>> which pirq" table).  dom0 can either create a pirq for itself or
>> someone else, and the final user of the pirq binds it to a
>> domain-local evtchn. 

I think currently the GSI pirq is global, while the MSI irq is per-domain. In fact, the irq for a GSI is allocated by dom0 itself, and is shared by xen/dom0. I suspect this is partly because in the 2.6.18 kernel the irq/gsi handling is really messed up (I remember there was a cleanup in 2.6.19).

The domU gets the pirq value through the pci-backend and pci-frontend drivers. The user space tools will grant one pirq to a guest through a hypercall, and the permission information is saved in the domain's structure.

When we switch to Jeremy's new method, maybe we can make the irq be allocated by the Xen HV, but I suspect it is OK for it to remain global.

The MSI is using per-domain pirq.

--jyh

>> 
>> I think.  I really haven't looked into the pci-passthrough parts very
>> closely yet.
> 
> I certainly could not find the code that would let you setup a pirq
> for another domain.  In fact the pirq code aka alloc_vectors appears
> to hard code dom0 in Xen 3.4.
> 
> pci-passthrough since it is domU, and since you describe it as well
> isolated and comparatively simple should be a shoe in.
> 
> Further as you describe it pci-passthrough is a subset of what we
> have to do for dom0. So if we can I would like to see the pci
> passthrough code get merged first. 
> 
> Eric
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 79+ messages in thread

* RE: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19  3:10                             ` Jiang, Yunhong
  0 siblings, 0 replies; 79+ messages in thread
From: Jiang, Yunhong @ 2009-06-19  3:10 UTC (permalink / raw)
  To: Eric W. Biederman, Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner



xen-devel-bounces@lists.xensource.com wrote:
> I/O APICs just because there's no local APIC
> 
> Jeremy Fitzhardinge <jeremy@goop.org> writes:
> 
>> Ah, OK.  The pirq is set up for a specific domain rather than being
>> global (otherwise it would need some kind of "which domain can access
>> which pirq" table).  dom0 can either create a pirq for itself or
>> someone else, and the final user of the pirq binds it to a
>> domain-local evtchn. 

I think currently the GSI pirq is global, while the MSI irq is per-domain. In fact, the irq for a GSI is allocated by dom0 itself, and is shared by xen/dom0. I suspect this is partly because in the 2.6.18 kernel the irq/gsi handling is really messed up (I remember there was a cleanup in 2.6.19).

The domU gets the pirq value through the pci-backend and pci-frontend drivers. The user space tools will grant one pirq to a guest through a hypercall, and the permission information is saved in the domain's structure.

When we switch to Jeremy's new method, maybe we can make the irq be allocated by the Xen HV, but I suspect it is OK for it to remain global.

The MSI is using per-domain pirq.

--jyh

>> 
>> I think.  I really haven't looked into the pci-passthrough parts very
>> closely yet.
> 
> I certainly could not find the code that would let you setup a pirq
> for another domain.  In fact the pirq code aka alloc_vectors appears
> to hard code dom0 in Xen 3.4.
> 
> pci-passthrough since it is domU, and since you describe it as well
> isolated and comparatively simple should be a shoe in.
> 
> Further as you describe it pci-passthrough is a subset of what we
> have to do for dom0. So if we can I would like to see the pci
> passthrough code get merged first. 
> 
> Eric
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's  no local APIC
  2009-06-18 22:33             ` Jeremy Fitzhardinge
@ 2009-06-19  5:32               ` Yinghai Lu
  -1 siblings, 0 replies; 79+ messages in thread
From: Yinghai Lu @ 2009-06-19  5:32 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Eric W. Biederman, Len Brown, Ingo Molnar, Thomas Gleixner,
	H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Xen-devel

doesn't XEN support per cpu irq vector?

got sth from XEN 3.3 / SLES 11

igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
igb 0000:81:00.0: setting latency timer to 64
igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
vendor=8086 device=3420
(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
igb 0000:81:00.1: setting latency timer to 64
(XEN) physdev.c:87: dom0: map irq with wrong vector -28
map irq failed
(XEN) physdev.c:87: dom0: map irq with wrong vector -28
map irq failed

the system needs a lot of MSI-X normally.. with the current mainline tree
kernel, it will need about 360 irqs...

YH

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19  5:32               ` Yinghai Lu
  0 siblings, 0 replies; 79+ messages in thread
From: Yinghai Lu @ 2009-06-19  5:32 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Eric W. Biederman, H. Peter Anvin, Thomas Gleixner,
	Len Brown

doesn't XEN support per cpu irq vector?

got sth from XEN 3.3 / SLES 11

igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
igb 0000:81:00.0: setting latency timer to 64
igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
vendor=8086 device=3420
(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
igb 0000:81:00.1: setting latency timer to 64
(XEN) physdev.c:87: dom0: map irq with wrong vector -28
map irq failed
(XEN) physdev.c:87: dom0: map irq with wrong vector -28
map irq failed

the system needs a lot of MSI-X normally.. with the current mainline tree
kernel, it will need about 360 irqs...

YH

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's  no local APIC
  2009-06-19  5:32               ` Yinghai Lu
@ 2009-06-19  5:50                 ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  5:50 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jeremy Fitzhardinge, Len Brown, Ingo Molnar, Thomas Gleixner,
	H. Peter Anvin, the arch/x86 maintainers,
	Linux Kernel Mailing List, Xen-devel

Yinghai Lu <yhlu.kernel@gmail.com> writes:

> doesn't XEN support per cpu irq vector?

Nope.  It doesn't even support the notion of cpus.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19  5:50                 ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  5:50 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, H. Peter Anvin,
	Thomas Gleixner, Len Brown

Yinghai Lu <yhlu.kernel@gmail.com> writes:

> doesn't XEN support per cpu irq vector?

Nope.  It doesn't even support the notion of cpus.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
  2009-06-19  5:32               ` Yinghai Lu
@ 2009-06-19  7:52                 ` Jan Beulich
  -1 siblings, 0 replies; 79+ messages in thread
From: Jan Beulich @ 2009-06-19  7:52 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jeremy Fitzhardinge, Len Brown, the arch/x86 maintainers,
	Thomas Gleixner, Xen-devel, Ingo Molnar,
	Linux Kernel Mailing List, Eric W. Biederman, H. Peter Anvin

>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>doesn't XEN support per cpu irq vector?

No.

>got sth from XEN 3.3 / SLES 11
>
>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>igb 0000:81:00.0: setting latency timer to 64
>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>vendor=8086 device=3420
>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>igb 0000:81:00.1: setting latency timer to 64
>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>map irq failed
>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>map irq failed
>
>the system need a lot of MSI-X normally.. with current mainline tree
>kernel, it will need about 360 irq...

Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
which are usually unused)? In the latter case, devices using MSI (i.e. not
using high numbered IO-APIC pins) should work, while devices connected
to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
This limitation got fixed recently in the 3.5-unstable tree, though. The
256 active vectors limit, however, continues to exist, so the former case
would still not be supported by Xen.

Jan


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
@ 2009-06-19  7:52                 ` Jan Beulich
  0 siblings, 0 replies; 79+ messages in thread
From: Jan Beulich @ 2009-06-19  7:52 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Eric W. Biederman,
	H. Peter Anvin, Thomas Gleixner, Len Brown

>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>doesn't XEN support per cpu irq vector?

No.

>got sth from XEN 3.3 / SLES 11
>
>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>igb 0000:81:00.0: setting latency timer to 64
>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>vendor=8086 device=3420
>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>igb 0000:81:00.1: setting latency timer to 64
>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>map irq failed
>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>map irq failed
>
>the system need a lot of MSI-X normally.. with current mainline tree
>kernel, it will need about 360 irq...

Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
which are usually unused)? In the latter case, devices using MSI (i.e. not
using high numbered IO-APIC pins) should work, while devices connected
to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
This limitation got fixed recently in the 3.5-unstable tree, though. The
256 active vectors limit, however, continues to exist, so the former case
would still not be supported by Xen.

Jan

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
  2009-06-19  7:52                 ` Jan Beulich
@ 2009-06-19  8:16                   ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  8:16 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Yinghai Lu, Jeremy Fitzhardinge, Len Brown,
	the arch/x86 maintainers, Thomas Gleixner, Xen-devel,
	Ingo Molnar, Linux Kernel Mailing List, H. Peter Anvin

"Jan Beulich" <JBeulich@novell.com> writes:

>>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>>doesn't XEN support per cpu irq vector?
>
> No.
>
>>got sth from XEN 3.3 / SLES 11
>>
>>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>>igb 0000:81:00.0: setting latency timer to 64
>>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>>vendor=8086 device=3420
>>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>>igb 0000:81:00.1: setting latency timer to 64
>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>map irq failed
>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>map irq failed
>>
>>the system need a lot of MSI-X normally.. with current mainline tree
>>kernel, it will need about 360 irq...
>
> Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
> which are usually unused)? In the latter case, devices using MSI (i.e. not
> using high numbered IO-APIC pins) should work, while devices connected
> to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
> This limitation got fixed recently in the 3.5-unstable tree, though. The
> 256 active vectors limit, however, continues to exist, so the former case
> would still not be supported by Xen.

Good question.  I know YH had a system a few years ago that exceeded 256 vectors.
But in this case it really could be either.

Eric


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
@ 2009-06-19  8:16                   ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-19  8:16 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Yinghai Lu,
	H. Peter Anvin, Thomas Gleixner, Len Brown

"Jan Beulich" <JBeulich@novell.com> writes:

>>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>>doesn't XEN support per cpu irq vector?
>
> No.
>
>>got sth from XEN 3.3 / SLES 11
>>
>>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>>igb 0000:81:00.0: setting latency timer to 64
>>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>>vendor=8086 device=3420
>>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>>igb 0000:81:00.1: setting latency timer to 64
>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>map irq failed
>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>map irq failed
>>
>>the system need a lot of MSI-X normally.. with current mainline tree
>>kernel, it will need about 360 irq...
>
> Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
> which are usually unused)? In the latter case, devices using MSI (i.e. not
> using high numbered IO-APIC pins) should work, while devices connected
> to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
> This limitation got fixed recently in the 3.5-unstable tree, though. The
> 256 active vectors limit, however, continues to exist, so the former case
> would still not be supported by Xen.

Good question.  I know YH had a system a few years ago that exceeded 256 vectors.
But in this case it really could be either.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-19  2:42               ` Eric W. Biederman
@ 2009-06-19 19:58                 ` Jeremy Fitzhardinge
  -1 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-19 19:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Len Brown, Ingo Molnar, Thomas Gleixner, H. Peter Anvin,
	the arch/x86 maintainers, Linux Kernel Mailing List, Xen-devel,
	Nakajima, Jun, Keir Fraser

On 06/18/09 19:42, Eric W. Biederman wrote:
> Interesting.  Then we need to know what Xen has chosen to do.
> This whole logic of having linux and Xen happen to choose the
> same interrupt handling mode by coincidence that is in use
> right now seems fragile.
>
> Xen runs a timer so I can't see Xen actually leaving the decision
> up to linux.  It certainly doesn't leave the physical versus logical
> mode decision up to linux.
>   

Yes.  And if the chipset does actually rely on programming/configuration
from the DSDT, I guess there could be a problem in principle (I don't
know of any problems in practice).

> Assuming Xen not having the AML interpreter is the right decision.
> I recall from some of the hibernation discussions that ACPI is not an easy
> thing to start/stop on a machine unless it is handled just so.
>
> Which if dom0 is to be redundant/restartable seems to make the
> argument for AML living in Xen.
>   
> Xen has everything except the AML interpreter.
>   

I assume that putting AML into Xen has been considered, but I don't
know anything about those deliberations.  Keir? Jun?

>> DomU doesn't know or care about ACPI at all.  There's no reason for it
>> to get any kind of ACPI event.
>>     
>
> Yet at least occasionally Xen fakes up ACPI tables for a guest.
>   

No, it never fakes up an ACPI table.  Privileged domains can see the
real BIOS ACPI tables, but unprivileged PV DomUs can't see anything.

(HVM - fully virtualized - domains get completely faked ACPI, along with
everything else, courtesy of qemu.)

> DomU has to do something to get the mapping, which is what I thought you
> were referring to.
>
> I can believe a Xen dom0 that is the same as domU except with lots
> more hardware.

There isn't really a sharp divide between the two.  At core a dom0 (or
"control domain" as we're beginning to call it) functions just like a
plain paravirtualized domU domain with some extra privileges.  Among
those privileges is the ability to directly map device memory, perform
pio and bind physical interrupts to event channels.

At the moment a whole pile of disparate things are lumped together into
the initial domain (hence "dom0" - the first domain), which tends to
conflate them in people's minds.  But we're moving towards a model of
disaggregating those functions into separate special-purpose domains,
with a more fine-grained notion of "privileged".

For example, there have been prototypes of "driver domains" floating
around for a while now, where a domain has specific responsibility for a
device or set of devices, and is only allowed to access those particular
hardware resources (this is a lot more convincing with VT-d enabled).

Similarly, xenstore/xenbus - the system-wide configuration state - can be
in its own domain, as it is one of the few things whose loss can't be
recovered from.

>   I have trouble understanding and Xen dom0 that has to
> all kinds of special one off hardware manipulation because Xen is too
> lazy too. 

Well the reason is the obvious one - Linux has a pile of drivers, and
attempting to duplicate them, or even port them into a completely
different environment - is a huge amount of pointless work.

>  Xen is already 300,000 lines of code so it's not like it is
> fully comprehensible/maintainable by a single person anyway.

Well, more like 200,000 for all the common code and arch/x86 (ia64 adds
another 30k lines or so).  By contrast, in Linux just kernel/ and
arch/x86/ is about 360,000 lines, and that doesn't include any drivers; 
drivers/ contains a further 5.8 million lines of code (~2 million in
scsi, net and block).

I don't see why it needs to be maintainable by a single person, though
there are probably only ~10 people who'd be classed as core developers. 
But while Xen and core Linux are about the same order of magnitude,
there's no way trying to import Linux drivers into Xen is even remotely
sustainable.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19 19:58                 ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 79+ messages in thread
From: Jeremy Fitzhardinge @ 2009-06-19 19:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, Nakajima, Jun, H. Peter Anvin,
	Thomas Gleixner, Len Brown

On 06/18/09 19:42, Eric W. Biederman wrote:
> Interesting.  Then we need to know what Xen has chosen to do.
> This whole logic of having linux and Xen happen to choose the
> same interrupt handling mode by coincidence that is in use
> right now seems fragile.
>
> Xen runs a timer so I can't see Xen actually leaving the decision
> up to linux.  It certainly doesn't leave the physical versus logical
> mode decision up to linux.
>   

Yes.  And if the chipset does actually rely on programming/configuration
from the DSDT, I guess there could be a problem in principle (I don't
know of any problems in practice).

> Assuming Xen not having the AML interpreter is the right decision.
> I recall from some of the hibernation discussions that ACPI is not an easy
> thing to start/stop on a machine unless it is handled just so.
>
> Which if dom0 is to be redundant/restartable seems to make the
> argument for AML living in Xen.
>   
> Xen has everything except the AML interpreter.
>   

I assume that putting AML into Xen has been considered, but I don't
know anything about those deliberations.  Keir? Jun?

>> DomU doesn't know or care about ACPI at all.  There's no reason for it
>> to get any kind of ACPI event.
>>     
>
> Yet at least occasionally Xen fakes up ACPI tables for a guest.
>   

No, it never fakes up an ACPI table.  Privileged domains can see the
real BIOS ACPI tables, but unprivileged PV DomUs can't see anything.

(HVM - fully virtualized - domains get completely faked ACPI, along with
everything else, courtesy of qemu.)

> DomU has to do something to get the mapping, which is what I thought you
> were referring to.
>
> I can believe a Xen dom0 that is the same as domU except with lots
> more hardware.

There isn't really a sharp divide between the two.  At core a dom0 (or
"control domain" as we're beginning to call it) functions just like a
plain paravirtualized domU domain with some extra privileges.  Among
those privileges is the ability to directly map device memory, perform
pio and bind physical interrupts to event channels.

At the moment a whole pile of disparate things are lumped together into
the initial domain (hence "dom0" - the first domain), which tends to
conflate them in people's minds.  But we're moving towards a model of
disaggregating those functions into separate special-purpose domains,
with a more fine-grained notion of "privileged".

For example, there have been prototypes of "driver domains" floating
around for a while now, where a domain has specific responsibility for a
device or set of devices, and is only allowed to access those particular
hardware resources (this is a lot more convincing with VT-d enabled).

Similarly, xenstore/xenbus - the system-wide configuration state - can be
in its own domain, as it is one of the few things whose loss can't be
recovered from.

>   I have trouble understanding and Xen dom0 that has to
> all kinds of special one off hardware manipulation because Xen is too
> lazy too. 

Well the reason is the obvious one - Linux has a pile of drivers, and
attempting to duplicate them, or even port them into a completely
different environment - is a huge amount of pointless work.

>  Xen is already 300,000 lines of code so it's not like it is
> fully comprehensible/maintainable by a single person anyway.

Well, more like 200,000 for all the common code and arch/x86 (ia64 adds
another 30k lines or so).  By contrast, in Linux just kernel/ and
arch/x86/ is about 360,000 lines, and that doesn't include any drivers; 
drivers/ contains a further 5.8 million lines of code (~2 million in
scsi, net and block).

I don't see why it needs to be maintainable by a single person, though
there are probably only ~10 people who'd be classed as core developers. 
But while Xen and core Linux are about the same order of magnitude,
there's no way trying to import Linux drivers into Xen is even remotely
sustainable.

    J

^ permalink raw reply	[flat|nested] 79+ messages in thread

* RE: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-19 19:58                 ` Jeremy Fitzhardinge
@ 2009-06-19 23:44                   ` Nakajima, Jun
  -1 siblings, 0 replies; 79+ messages in thread
From: Nakajima, Jun @ 2009-06-19 23:44 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner,
	Len Brown

Jeremy Fitzhardinge wrote on Fri, 19 Jun 2009 at 12:58:14:

>> 
>> Which if dom0 is to be redundant/restartable seems to make the
>> argument for AML living in Xen.
>> 
>> Xen has everything except the AML interpreter.
>> 
> 
> I assume that putting AML into Xen has been considered, but I don't
> anything about those deliberations.  Keir? Jun?
> 

Yes, it was one of the options years ago. We did not do that because Linux and Solaris (as dom0) already had the AML interpreter and it's overkill and redundant to have such a large component in the Xen hypervisor. Since the hypervisor does most of the power management (i.e. P, C, S-state, etc.) getting the info from dom0 today, we might want to reconsider the option. 

Jun
___
Intel Open Source Technology Center



^ permalink raw reply	[flat|nested] 79+ messages in thread

* RE: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-19 23:44                   ` Nakajima, Jun
  0 siblings, 0 replies; 79+ messages in thread
From: Nakajima, Jun @ 2009-06-19 23:44 UTC (permalink / raw)
  To: Jeremy Fitzhardinge, Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, Keir Fraser, H. Peter Anvin, Thomas Gleixner,
	Len Brown

Jeremy Fitzhardinge wrote on Fri, 19 Jun 2009 at 12:58:14:

>> 
>> Which if dom0 is to be redundant/restartable seems to make the
>> argument for AML living in Xen.
>> 
>> Xen has everything except the AML interpreter.
>> 
> 
> I assume that putting AML into Xen has been considered, but I don't
> anything about those deliberations.  Keir? Jun?
> 

Yes, it was one of the options years ago. We did not do that because Linux and Solaris (as dom0) already had the AML interpreter and it's overkill and redundant to have such a large component in the Xen hypervisor. Since the hypervisor does most of the power management (i.e. P, C, S-state, etc.) getting the info from dom0 today, we might want to reconsider the option. 

Jun
___
Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs  justbecause there's no local APIC
  2009-06-19  8:16                   ` Eric W. Biederman
@ 2009-06-20  3:58                     ` Yinghai Lu
  -1 siblings, 0 replies; 79+ messages in thread
From: Yinghai Lu @ 2009-06-20  3:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jan Beulich, Jeremy Fitzhardinge, Len Brown,
	the arch/x86 maintainers, Thomas Gleixner, Xen-devel,
	Ingo Molnar, Linux Kernel Mailing List, H. Peter Anvin

On Fri, Jun 19, 2009 at 1:16 AM, Eric W. Biederman<ebiederm@xmission.com> wrote:
> "Jan Beulich" <JBeulich@novell.com> writes:
>
>>>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>>>doesn't XEN support per cpu irq vector?
>>
>> No.
>>
>>>got sth from XEN 3.3 / SLES 11
>>>
>>>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>>>igb 0000:81:00.0: setting latency timer to 64
>>>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>>>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>>>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>>>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>>>vendor=8086 device=3420
>>>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>>>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>>>igb 0000:81:00.1: setting latency timer to 64
>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>map irq failed
>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>map irq failed
>>>
>>>the system need a lot of MSI-X normally.. with current mainline tree
>>>kernel, it will need about 360 irq...
>>
>> Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
>> which are usually unused)? In the latter case, devices using MSI (i.e. not
>> using high numbered IO-APIC pins) should work, while devices connected
>> to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
>> This limitation got fixed recently in the 3.5-unstable tree, though. The
>> 256 active vectors limit, however, continues to exist, so the former case
>> would still not be supported by Xen.

5 io-apic controllers, so total pins like 5x24

>
> Good question.  I know YH had a system a few years ago that exceeded 256 vectors.
that was in SimNow.

This time is real.
think about system: 24 pcie cards and every one has two functions. and
one function will use 16 or 20 MSIX
like 24 * 2 * 16

YH

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
@ 2009-06-20  3:58                     ` Yinghai Lu
  0 siblings, 0 replies; 79+ messages in thread
From: Yinghai Lu @ 2009-06-20  3:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jan Beulich, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner, Len Brown

On Fri, Jun 19, 2009 at 1:16 AM, Eric W. Biederman<ebiederm@xmission.com> wrote:
> "Jan Beulich" <JBeulich@novell.com> writes:
>
>>>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>>>doesn't XEN support per cpu irq vector?
>>
>> No.
>>
>>>got sth from XEN 3.3 / SLES 11
>>>
>>>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>>>igb 0000:81:00.0: setting latency timer to 64
>>>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>>>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>>>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>>>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>>>vendor=8086 device=3420
>>>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>>>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>>>igb 0000:81:00.1: setting latency timer to 64
>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>map irq failed
>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>map irq failed
>>>
>>>the system need a lot of MSI-X normally.. with current mainline tree
>>>kernel, it will need about 360 irq...
>>
>> Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
>> which are usually unused)? In the latter case, devices using MSI (i.e. not
>> using high numbered IO-APIC pins) should work, while devices connected
>> to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
>> This limitation got fixed recently in the 3.5-unstable tree, though. The
>> 256 active vectors limit, however, continues to exist, so the former case
>> would still not be supported by Xen.

5 io-apic controllers, so total pins like 5x24

>
> Good question.  I know YH had a system a few years ago that exceeded 256 vectors.
that was in SimNow.

This time is real.
think about system: 24 pcie cards and every one has two functions. and
one function will use 16 or 20 MSIX
like 24 * 2 * 16

YH

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs  justbecause there's no local APIC
  2009-06-20  3:58                     ` Yinghai Lu
@ 2009-06-20  5:40                       ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-20  5:40 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jan Beulich, Jeremy Fitzhardinge, Len Brown,
	the arch/x86 maintainers, Thomas Gleixner, Xen-devel,
	Ingo Molnar, Linux Kernel Mailing List, H. Peter Anvin

Yinghai Lu <yhlu.kernel@gmail.com> writes:

> On Fri, Jun 19, 2009 at 1:16 AM, Eric W. Biederman<ebiederm@xmission.com> wrote:
>> "Jan Beulich" <JBeulich@novell.com> writes:
>>
>>>>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>>>>doesn't XEN support per cpu irq vector?
>>>
>>> No.
>>>
>>>>got sth from XEN 3.3 / SLES 11
>>>>
>>>>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>>>>igb 0000:81:00.0: setting latency timer to 64
>>>>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>>>>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>>>>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>>>>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>>>>vendor=8086 device=3420
>>>>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>>>>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>>>>igb 0000:81:00.1: setting latency timer to 64
>>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>>map irq failed
>>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>>map irq failed
>>>>
>>>>the system need a lot of MSI-X normally.. with current mainline tree
>>>>kernel, it will need about 360 irq...
>>>
>>> Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
>>> which are usually unused)? In the latter case, devices using MSI (i.e. not
>>> using high numbered IO-APIC pins) should work, while devices connected
>>> to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
>>> This limitation got fixed recently in the 3.5-unstable tree, though. The
>>> 256 active vectors limit, however, continues to exist, so the former case
>>> would still not be supported by Xen.
>
> 5 io-apic controllers, so total pins like 5x24
>
>>
>> Good question.  I know YH had a system a few years ago that exceeded 256 vectors.
> that was in SimNow.
>
> This time is real.
> think about system: 24 pcie cards and every one has two functions. and
> one function will use 16 or 20 MSIX
> like 24 * 2 * 16

I'm not too surprised.  I saw the writing on the wall when I implemented
per irq vector, and MSIX was one of the likely candidates.

I'm curious what kind of pcie card do you have plugged in?  Looks like
you have a irq or two per cpu.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
@ 2009-06-20  5:40                       ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-20  5:40 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jan Beulich, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner, Len Brown

Yinghai Lu <yhlu.kernel@gmail.com> writes:

> On Fri, Jun 19, 2009 at 1:16 AM, Eric W. Biederman<ebiederm@xmission.com> wrote:
>> "Jan Beulich" <JBeulich@novell.com> writes:
>>
>>>>>> Yinghai Lu <yhlu.kernel@gmail.com> 19.06.09 07:32 >>>
>>>>doesn't XEN support per cpu irq vector?
>>>
>>> No.
>>>
>>>>got sth from XEN 3.3 / SLES 11
>>>>
>>>>igb 0000:81:00.0: PCI INT A -> GSI 95 (level, low) -> IRQ 95
>>>>igb 0000:81:00.0: setting latency timer to 64
>>>>igb 0000:81:00.0: Intel(R) Gigabit Ethernet Network Connection
>>>>igb 0000:81:00.0: eth9: (PCIe:2.5Gb/s:Width x4) 00:21:28:3a:d8:0e
>>>>igb 0000:81:00.0: eth9: PBA No: ffffff-0ff
>>>>igb 0000:81:00.0: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
>>>>vendor=8086 device=3420
>>>>(XEN) irq.c:847: dom0: invalid pirq 94 or vector -28
>>>>igb 0000:81:00.1: PCI INT B -> GSI 94 (level, low) -> IRQ 94
>>>>igb 0000:81:00.1: setting latency timer to 64
>>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>>map irq failed
>>>>(XEN) physdev.c:87: dom0: map irq with wrong vector -28
>>>>map irq failed
>>>>
>>>>the system need a lot of MSI-X normally.. with current mainline tree
>>>>kernel, it will need about 360 irq...
>>>
>>> Do you mean 360 connected devices, or just 360 IO-APIC pins (most of
>>> which are usually unused)? In the latter case, devices using MSI (i.e. not
>>> using high numbered IO-APIC pins) should work, while devices connected
>>> to IO-APIC pins numbered 256 and higher won't work in SLE11 as-is.
>>> This limitation got fixed recently in the 3.5-unstable tree, though. The
>>> 256 active vectors limit, however, continues to exist, so the former case
>>> would still not be supported by Xen.
>
> 5 io-apic controllers, so total pins like 5x24
>
>>
>> Good question.  I know YH had a system a few years ago that exceeded 256 vectors.
> that was in SimNow.
>
> This time is real.
> think about system: 24 pcie cards and every one has two functions. and
> one function will use 16 or 20 MSIX
> like 24 * 2 * 16

I'm not too surprised.  I saw the writing on the wall when I implemented
per irq vector, and MSIX was one of the likely candidates.

I'm curious what kind of pcie card do you have plugged in?  Looks like
you have a irq or two per cpu.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs  justbecause there's no local APIC
  2009-06-20  5:40                       ` Eric W. Biederman
@ 2009-06-20  5:58                         ` Yinghai Lu
  -1 siblings, 0 replies; 79+ messages in thread
From: Yinghai Lu @ 2009-06-20  5:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jan Beulich, Jeremy Fitzhardinge, Len Brown,
	the arch/x86 maintainers, Thomas Gleixner, Xen-devel,
	Ingo Molnar, Linux Kernel Mailing List, H. Peter Anvin

On Fri, Jun 19, 2009 at 10:40 PM, Eric W.
Biederman<ebiederm@xmission.com> wrote:
> Yinghai Lu <yhlu.kernel@gmail.com> writes:
>
> I'm curious what kind of pcie card do you have plugged in?  Looks like
> you have a irq or two per cpu.

intel 10g or sun neptune 10g

YH

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause there's no local APIC
@ 2009-06-20  5:58                         ` Yinghai Lu
  0 siblings, 0 replies; 79+ messages in thread
From: Yinghai Lu @ 2009-06-20  5:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Jan Beulich, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner, Len Brown

On Fri, Jun 19, 2009 at 10:40 PM, Eric W.
Biederman<ebiederm@xmission.com> wrote:
> Yinghai Lu <yhlu.kernel@gmail.com> writes:
>
> I'm curious what kind of pcie card do you have plugged in?  Looks like
> you have a irq or two per cpu.

intel 10g or sun neptune 10g

YH

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-19 23:44                   ` Nakajima, Jun
@ 2009-06-20  7:39                     ` Keir Fraser
  -1 siblings, 0 replies; 79+ messages in thread
From: Keir Fraser @ 2009-06-20  7:39 UTC (permalink / raw)
  To: Nakajima, Jun, Jeremy Fitzhardinge, Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Len Brown

On 20/06/2009 00:44, "Nakajima, Jun" <jun.nakajima@intel.com> wrote:

>> I assume that putting AML into Xen has been considered, but I don't
>> anything about those deliberations.  Keir? Jun?
>> 
> 
> Yes, it was one of the options years ago. We did not do that because Linux and
> Solaris (as dom0) already had the AML interpreter and it's overkill and
> redundant to have such a large component in the Xen hypervisor. Since the
> hypervisor does most of the power management (i.e. P, C, S-state, etc.)
> getting the info from dom0 today, we might want to reconsider the option.

Yes, we could reconsider. However is there any stuff that dom0 remains
responsible for (e.g., PCI management, and therefore PCI hotplug) where it
would continue to need to be OSPM, interpreting certain AML objects? In
general how safe would it be to have two layered entities both playing at
being OSPM?

 -- Keir



^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-20  7:39                     ` Keir Fraser
  0 siblings, 0 replies; 79+ messages in thread
From: Keir Fraser @ 2009-06-20  7:39 UTC (permalink / raw)
  To: Nakajima, Jun, Jeremy Fitzhardinge, Eric W. Biederman
  Cc: Xen-devel, the arch/x86 maintainers, Linux Kernel Mailing List,
	Ingo Molnar, H. Peter Anvin, Thomas Gleixner, Len Brown

On 20/06/2009 00:44, "Nakajima, Jun" <jun.nakajima@intel.com> wrote:

>> I assume that putting AML into Xen has been considered, but I don't
>> anything about those deliberations.  Keir? Jun?
>> 
> 
> Yes, it was one of the options years ago. We did not do that because Linux and
> Solaris (as dom0) already had the AML interpreter and it's overkill and
> redundant to have such a large component in the Xen hypervisor. Since the
> hypervisor does most of the power management (i.e. P, C, S-state, etc.)
> getting the info from dom0 today, we might want to reconsider the option.

Yes, we could reconsider. However is there any stuff that dom0 remains
responsible for (e.g., PCI management, and therefore PCI hotplug) where it
would continue to need to be OSPM, interpreting certain AML objects? In
general how safe would it be to have two layered entities both playing at
being OSPM?

 -- Keir

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-19 23:44                   ` Nakajima, Jun
@ 2009-06-20  8:18                     ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-20  8:18 UTC (permalink / raw)
  To: Nakajima, Jun
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Keir Fraser,
	H. Peter Anvin, Thomas Gleixner, Len Brown

"Nakajima, Jun" <jun.nakajima@intel.com> writes:

> Jeremy Fitzhardinge wrote on Fri, 19 Jun 2009 at 12:58:14:
>
>>> 
>>> Which if dom0 is to be redundant/restartable seems to make the
>>> argument for AML living in Xen.
>>> 
>>> Xen has everything except the AML interpreter.
>>> 
>> 
>> I assume that putting AML into Xen has been considered, but I don't
>> anything about those deliberations.  Keir? Jun?
>> 
>
> Yes, it was one of the options years ago. We did not do that because Linux and Solaris (as dom0) already had the AML interpreter and it's overkill and redundant to have such a large component in the Xen hypervisor. Since the hypervisor does most of the power management (i.e. P, C, S-state, etc.) getting the info from dom0 today, we might want to reconsider the option. 

In my brief investigation it looks as if Xen having the AML
interpreter would considerably simplify the complexity of the
dom0 interface.

What I am certain of is that the current Xen dom0 irq interface exposes
implementation details (aka the vector number) that if continued will prevent
Xen from scaling to machines with large amounts of I/O.  As YH has recently
demonstrated.

That interface needs to be fixed.

I think the path to fixing it and getting linux kernel support is.
- Merge pass through device support for domU.
- Move all of the irq setup from dom0 into Xen, making dom0 interrupt
  handling exactly the same as domU.
- Move all of ACPI handling into Xen, in support of irq handling
  and power management.  Things Xen already claims are interesting
  problems.

At that point I don't know what is left, but the area that I am
knowledgeable about, irq handling, will be complete.  The incestuousness of
the interface is removed and Xen and the linux kernel can keep those
same interfaces for the foreseeable future.

In summary.  

In support of the Xen grand vision of all domains being equal.  I
don't think linux should ever merge dom0 support.  I think domU
support should be expanded, and the dom0 requirements simplified
until there are no differences left.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-20  8:18                     ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-20  8:18 UTC (permalink / raw)
  To: Nakajima, Jun
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Keir Fraser,
	H. Peter Anvin, Thomas Gleixner, Len Brown

"Nakajima, Jun" <jun.nakajima@intel.com> writes:

> Jeremy Fitzhardinge wrote on Fri, 19 Jun 2009 at 12:58:14:
>
>>> 
>>> Which if dom0 is to be redundant/restartable seems to make the
>>> argument for AML living in Xen.
>>> 
>>> Xen has everything except the AML interpreter.
>>> 
>> 
>> I assume that putting AML into Xen has been considered, but I don't
>> anything about those deliberations.  Keir? Jun?
>> 
>
> Yes, it was one of the options years ago. We did not do that because Linux and Solaris (as dom0) already had the AML interpreter and it's overkill and redundant to have such a large component in the Xen hypervisor. Since the hypervisor does most of the power management (i.e. P, C, S-state, etc.) getting the info from dom0 today, we might want to reconsider the option. 

In my brief investigation it looks as if Xen having the AML
interpreter would considerably simplify the complexity of the
dom0 interface.

What I am certain of is that the current Xen dom0 irq interface exposes
implementation details (aka the vector number) that if continued will prevent
Xen from scaling to machines with large amounts of I/O.  As YH has recently
demonstrated.

That interface needs to be fixed.

I think the path to fixing it and getting linux kernel support is.
- Merge pass through device support for domU.
- Move all of the irq setup from dom0 into Xen, making dom0 interrupt
  handling exactly the same as domU.
- Move all of ACPI handling into Xen, in support of irq handling
  and power management.  Things Xen already claims are interesting
  problems.

At that point I don't know what is left, but the area that I am
knowledgeable about, irq handling, will be complete.  The incestuousness of
the interface is removed and Xen and the linux kernel can keep those
same interfaces for the foreseeable future.

In summary.  

In support of the Xen grand vision of all domains being equal.  I
don't think linux should ever merge dom0 support.  I think domU
support should be expanded, and the dom0 requirements simplified
until there are no differences left.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-20  7:39                     ` Keir Fraser
@ 2009-06-20  8:21                       ` Eric W. Biederman
  -1 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-20  8:21 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Nakajima, Jun, Jeremy Fitzhardinge, Xen-devel,
	the arch/x86 maintainers, Linux Kernel Mailing List, Ingo Molnar,
	H. Peter Anvin, Thomas Gleixner, Len Brown

Keir Fraser <keir.fraser@eu.citrix.com> writes:

> On 20/06/2009 00:44, "Nakajima, Jun" <jun.nakajima@intel.com> wrote:
>
>>> I assume that putting AML into Xen has been considered, but I don't
>>> anything about those deliberations.  Keir? Jun?
>>> 
>> 
>> Yes, it was one of the options years ago. We did not do that because Linux and
>> Solaris (as dom0) already had the AML interpreter and it's overkill and
>> redundant to have such a large component in the Xen hypervisor. Since the
>> hypervisor does most of the power management (i.e. P, C, S-state, etc.)
>> getting the info from dom0 today, we might want to reconsider the option.
>
> Yes, we could reconsider. However is there any stuff that dom0 remains
> responsible for (e.g., PCI management, and therefore PCI hotplug) where it
> would continue to need to be OSPM, interpreting certain AML objects? In
> general how safe would it be to have two layered entities both playing at
> being OSPM?

Short of running the oddball acpi based drivers.  I'm not familiar with
any acpi in the pci management.

Eric


^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-20  8:21                       ` Eric W. Biederman
  0 siblings, 0 replies; 79+ messages in thread
From: Eric W. Biederman @ 2009-06-20  8:21 UTC (permalink / raw)
  To: Keir Fraser
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Nakajima, Jun,
	H. Peter Anvin, Thomas Gleixner, Len Brown

Keir Fraser <keir.fraser@eu.citrix.com> writes:

> On 20/06/2009 00:44, "Nakajima, Jun" <jun.nakajima@intel.com> wrote:
>
>>> I assume that putting AML into Xen has been considered, but I don't
>>> anything about those deliberations.  Keir? Jun?
>>> 
>> 
>> Yes, it was one of the options years ago. We did not do that because Linux and
>> Solaris (as dom0) already had the AML interpreter and it's overkill and
>> redundant to have such a large component in the Xen hypervisor. Since the
>> hypervisor does most of the power management (i.e. P, C, S-state, etc.)
>> getting the info from dom0 today, we might want to reconsider the option.
>
> Yes, we could reconsider. However is there any stuff that dom0 remains
> responsible for (e.g., PCI management, and therefore PCI hotplug) where it
> would continue to need to be OSPM, interpreting certain AML objects? In
> general how safe would it be to have two layered entities both playing at
> being OSPM?

Short of running the oddball acpi based drivers.  I'm not familiar with
any acpi in the pci management.

Eric

^ permalink raw reply	[flat|nested] 79+ messages in thread

* RE: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-20  8:21                       ` Eric W. Biederman
@ 2009-06-20  8:57                         ` Tian, Kevin
  -1 siblings, 0 replies; 79+ messages in thread
From: Tian, Kevin @ 2009-06-20  8:57 UTC (permalink / raw)
  To: Eric W. Biederman, Keir Fraser
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Nakajima, Jun,
	H. Peter Anvin, Thomas Gleixner, Len Brown

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="gb2312", Size: 3104 bytes --]

>From: Eric W. Biederman
>Sent: 2009年6月20日 16:22
>
>Keir Fraser <keir.fraser@eu.citrix.com> writes:
>
>> On 20/06/2009 00:44, "Nakajima, Jun" <jun.nakajima@intel.com> wrote:
>>
>>>> I assume that putting AML into Xen has been considered, but I don't
>>>> anything about those deliberations.  Keir? Jun?
>>>> 
>>> 
>>> Yes, it was one of the options years ago. We did not do 
>that because Linux and
>>> Solaris (as dom0) already had the AML interpreter and it's 
>overkill and
>>> redundant to have such a large component in the Xen 
>hypervisor. Since the
>>> hypervisor does most of the power management (i.e. P, C, 
>S-state, etc.)
>>> getting the info from dom0 today, we might want to 
>reconsider the option.
>>
>> Yes, we could reconsider. However is there any stuff that 
>dom0 remains
>> responsible for (e.g., PCI management, and therefore PCI 
>hotplug) where it
>> would continue to need to be OSPM, interpreting certain AML 
>objects? In
>> general how safe would it be to have two layered entities 
>both playing at
>> being OSPM?
>
>Short of running the oddball acpi based drivers.  I'm not familiar with
>any acpi in the pci management.
>

PCIe hotplug is defined well by its own BUS spec. But conventional
PCI hotplug is implemented all kinds of strange things. Some is
through ACPI, and thus by moving ACPI into Xen, a new 'virtual' hotplug
architecture has to be introduced into dom0 Linux. Or Xen needs to 
emulate some known interface but as said there's no common standard
for PCI hotplug. What's worse is the docking station support which 
contains diverse legacy devices. How Xen pass those legacy device 
hotplug events into dom0 Linux become another gray area suffering from 
same question like whether IOAPIC needs to be changed for Xen...

Above comes from the exclusive assumption that ACPI is removed
from dom0 by moving into Xen.

Another choice is to have two layered ACPI in both dom0 and Xen
with dom0's ACPI virtualized a bit by Xen. However it's messy as 
ACPI encodes most stuff in its own AML encode as a gray box. 
Many ACPI methods talk to hardware bits internally even by hard 
coded I/O registers. You don't know whether one ACPI event 
should be handled by Xen or not, until some AML methods have 
been evaluated which then may already consume and change 
some device states and not reversible. Then Xen have to emulate 
those states when injecting a virtual ACPI event into dom0 as 
dom0 ACPI methods need to consume same states. However
automatic generating emulation code for diverse ACPI implementations
to me is far more complex than any discussion here. 

So the real trouble is ACPI, which encodes all platform bits if
they're not included in any existing BUS spec, such as power,
thermal, processor, battery, PCI routing, hotplug, EC, etc. Some
are owned by dom0 and some by Xen. However, ACPI's AML encoding
makes automatic division between the two categories really difficult.

Thanks,
Kevin

^ permalink raw reply	[flat|nested] 79+ messages in thread

* RE: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-20  8:57                         ` Tian, Kevin
  0 siblings, 0 replies; 79+ messages in thread
From: Tian, Kevin @ 2009-06-20  8:57 UTC (permalink / raw)
  To: Eric W. Biederman, Keir Fraser
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Nakajima, Jun,
	H. Peter Anvin, Thomas Gleixner, Len Brown

[-- Attachment #1: Type: text/plain, Size: 3013 bytes --]

>From: Eric W. Biederman
>Sent: 2009年6月20日 16:22
>
>Keir Fraser <keir.fraser@eu.citrix.com> writes:
>
>> On 20/06/2009 00:44, "Nakajima, Jun" <jun.nakajima@intel.com> wrote:
>>
>>>> I assume that putting AML into Xen has been considered, but I don't
>>>> know anything about those deliberations.  Keir? Jun?
>>>> 
>>> 
>>> Yes, it was one of the options years ago. We did not do that because Linux and
>>> Solaris (as dom0) already had the AML interpreter and it's overkill and
>>> redundant to have such a large component in the Xen hypervisor. Since the
>>> hypervisor does most of the power management (i.e. P, C, S-state, etc.)
>>> getting the info from dom0 today, we might want to reconsider the option.
>>
>> Yes, we could reconsider. However is there any stuff that dom0 remains
>> responsible for (e.g., PCI management, and therefore PCI hotplug) where it
>> would continue to need to be OSPM, interpreting certain AML objects? In
>> general how safe would it be to have two layered entities both playing at
>> being OSPM?
>
>Short of running the oddball acpi based drivers.  I'm not familiar with
>any acpi in the pci management.
>

PCIe hotplug is well defined by its own BUS spec. But conventional
PCI hotplug is implemented in all kinds of strange ways. Some of it is
through ACPI, and thus by moving ACPI into Xen, a new 'virtual' hotplug
architecture has to be introduced into dom0 Linux. Or Xen needs to
emulate some known interface, but as said there's no common standard
for PCI hotplug. What's worse is the docking station support, which
contains diverse legacy devices. How Xen passes those legacy device
hotplug events into dom0 Linux becomes another gray area suffering from
the same question as whether IOAPIC needs to be changed for Xen...

Above comes from the exclusive assumption that ACPI is removed
from dom0 by moving into Xen.

Another choice is to have two layered ACPI in both dom0 and Xen
with dom0's ACPI virtualized a bit by Xen. However it's messy as 
ACPI encodes most stuff in its own AML encode as a gray box. 
Many ACPI methods talk to hardware bits internally even by hard 
coded I/O registers. You don't know whether one ACPI event 
should be handled by Xen or not, until some AML methods have 
been evaluated which then may already consume and change 
some device states and not reversible. Then Xen have to emulate 
those states when injecting a virtual ACPI event into dom0 as 
dom0 ACPI methods need to consume same states. However
automatic generating emulation code for diverse ACPI implementations
to me is far more complex than any discussion here. 

So the real trouble is ACPI, which encodes all platform bits if
they're not included in any existing BUS spec, such as power,
thermal, processor, battery, PCI routing, hotplug, EC, etc. Some
are owned by dom0 and some by Xen. However, ACPI's AML encoding
makes automatic division between the two categories really difficult.

Thanks,
Kevin

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
  2009-06-20  8:57                         ` Tian, Kevin
@ 2009-06-20 10:22                           ` Keir Fraser
  -1 siblings, 0 replies; 79+ messages in thread
From: Keir Fraser @ 2009-06-20 10:22 UTC (permalink / raw)
  To: Tian, Kevin, Eric W. Biederman
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Nakajima, Jun,
	H. Peter Anvin, Thomas Gleixner, Len Brown

On 20/06/2009 09:57, "Tian, Kevin" <kevin.tian@intel.com> wrote:

> So the real trouble is ACPI , which encode all platform bits if
> they're not included in any existing BUS spec, such as power,
> thermal, processor, battery, PCI routing, hotplug, EC, etc. Some
> are owned by dom0 and some by Xen. However ACPI's AML encoding
> makes automatic division between two categories really difficult.

Yes, we share the same lament regarding ACPI. I'm not sure what the best
(cleanest, simplest, whatever) solution really is going forward. We're
certainly happy to make quite fundamental changes in Xen if there is
sufficient win to be had.

 -- Keir



^ permalink raw reply	[flat|nested] 79+ messages in thread

* Re: Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC
@ 2009-06-20 10:22                           ` Keir Fraser
  0 siblings, 0 replies; 79+ messages in thread
From: Keir Fraser @ 2009-06-20 10:22 UTC (permalink / raw)
  To: Tian, Kevin, Eric W. Biederman
  Cc: Jeremy Fitzhardinge, Xen-devel, the arch/x86 maintainers,
	Linux Kernel Mailing List, Ingo Molnar, Nakajima, Jun,
	H. Peter Anvin, Thomas Gleixner, Len Brown

On 20/06/2009 09:57, "Tian, Kevin" <kevin.tian@intel.com> wrote:

> So the real trouble is ACPI , which encode all platform bits if
> they're not included in any existing BUS spec, such as power,
> thermal, processor, battery, PCI routing, hotplug, EC, etc. Some
> are owned by dom0 and some by Xen. However ACPI's AML encoding
> makes automatic division between two categories really difficult.

Yes, we share the same lament regarding ACPI. I'm not sure what the best
(cleanest, simplest, whatever) solution really is going forward. We're
certainly happy to make quite fundamental changes in Xen if there is
sufficient win to be had.

 -- Keir

^ permalink raw reply	[flat|nested] 79+ messages in thread

end of thread, other threads:[~2009-06-20 10:22 UTC | newest]

Thread overview: 79+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-12 18:22 [PATCH RFC] x86/acpi: don't ignore I/O APICs just because there's no local APIC Jeremy Fitzhardinge
2009-06-12 18:22 ` Jeremy Fitzhardinge
2009-06-12 18:28 ` Alan Cox
2009-06-12 18:28   ` Alan Cox
2009-06-12 18:33   ` Jeremy Fitzhardinge
2009-06-12 18:33     ` Jeremy Fitzhardinge
2009-06-12 20:11 ` Cyrill Gorcunov
2009-06-15  2:01   ` Jeremy Fitzhardinge
2009-06-12 20:35 ` Eric W. Biederman
2009-06-12 20:35   ` Eric W. Biederman
2009-06-15  2:06   ` Jeremy Fitzhardinge
2009-06-15 10:47     ` Eric W. Biederman
2009-06-15 10:47       ` Eric W. Biederman
2009-06-15 20:49       ` Jeremy Fitzhardinge
2009-06-15 20:49         ` Jeremy Fitzhardinge
2009-06-15 21:58         ` Eric W. Biederman
2009-06-15 21:58           ` Eric W. Biederman
2009-06-16 19:38           ` Jeremy Fitzhardinge
2009-06-16 19:38             ` Jeremy Fitzhardinge
2009-06-17  5:10             ` Eric W. Biederman
2009-06-17  5:10               ` Eric W. Biederman
2009-06-17 12:02             ` Eric W. Biederman
2009-06-17 12:02               ` Eric W. Biederman
2009-06-17 17:32               ` Jeremy Fitzhardinge
2009-06-17 17:32                 ` Jeremy Fitzhardinge
2009-06-18  2:58                 ` Eric W. Biederman
2009-06-18  2:58                   ` Eric W. Biederman
2009-06-18 19:34                   ` Jeremy Fitzhardinge
2009-06-18 19:34                     ` Jeremy Fitzhardinge
2009-06-18 20:28                     ` Eric W. Biederman
2009-06-18 21:09                       ` Jeremy Fitzhardinge
2009-06-18 21:09                         ` Jeremy Fitzhardinge
2009-06-19  1:38                         ` Eric W. Biederman
2009-06-19  1:38                           ` Eric W. Biederman
2009-06-19  3:10                           ` [Xen-devel] " Jiang, Yunhong
2009-06-19  3:10                             ` Jiang, Yunhong
2009-06-18 12:26                 ` Eric W. Biederman
2009-06-15 10:51 ` Eric W. Biederman
2009-06-15 10:51   ` Eric W. Biederman
2009-06-18 16:08 ` Len Brown
2009-06-18 19:14   ` Jeremy Fitzhardinge
2009-06-18 19:14     ` Jeremy Fitzhardinge
2009-06-18 19:27     ` Eric W. Biederman
2009-06-18 19:48       ` Jeremy Fitzhardinge
2009-06-18 19:48         ` Jeremy Fitzhardinge
2009-06-18 20:39         ` Eric W. Biederman
2009-06-18 22:33           ` Jeremy Fitzhardinge
2009-06-18 22:33             ` Jeremy Fitzhardinge
2009-06-19  2:42             ` Eric W. Biederman
2009-06-19  2:42               ` Eric W. Biederman
2009-06-19 19:58               ` Jeremy Fitzhardinge
2009-06-19 19:58                 ` Jeremy Fitzhardinge
2009-06-19 23:44                 ` [Xen-devel] " Nakajima, Jun
2009-06-19 23:44                   ` Nakajima, Jun
2009-06-20  7:39                   ` [Xen-devel] " Keir Fraser
2009-06-20  7:39                     ` Keir Fraser
2009-06-20  8:21                     ` [Xen-devel] " Eric W. Biederman
2009-06-20  8:21                       ` Eric W. Biederman
2009-06-20  8:57                       ` [Xen-devel] " Tian, Kevin
2009-06-20  8:57                         ` Tian, Kevin
2009-06-20 10:22                         ` [Xen-devel] " Keir Fraser
2009-06-20 10:22                           ` Keir Fraser
2009-06-20  8:18                   ` [Xen-devel] " Eric W. Biederman
2009-06-20  8:18                     ` Eric W. Biederman
2009-06-19  5:32             ` Yinghai Lu
2009-06-19  5:32               ` Yinghai Lu
2009-06-19  5:50               ` Eric W. Biederman
2009-06-19  5:50                 ` Eric W. Biederman
2009-06-19  7:52               ` [Xen-devel] Re: [PATCH RFC] x86/acpi: don't ignore I/O APICs justbecause " Jan Beulich
2009-06-19  7:52                 ` Jan Beulich
2009-06-19  8:16                 ` [Xen-devel] " Eric W. Biederman
2009-06-19  8:16                   ` Eric W. Biederman
2009-06-20  3:58                   ` [Xen-devel] " Yinghai Lu
2009-06-20  3:58                     ` Yinghai Lu
2009-06-20  5:40                     ` [Xen-devel] " Eric W. Biederman
2009-06-20  5:40                       ` Eric W. Biederman
2009-06-20  5:58                       ` [Xen-devel] " Yinghai Lu
2009-06-20  5:58                         ` Yinghai Lu
2009-06-18 22:51     ` [PATCH RFC] x86/acpi: don't ignore I/O APICs just because " Maciej W. Rozycki

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.