xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
@ 2016-04-01  9:15 Jan Beulich
  2016-04-01  9:58 ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-04-01  9:15 UTC (permalink / raw)
  To: xen-devel; +Cc: Andrew Cooper, Paul Durrant, Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 5556 bytes --]

Recent changes to Linux result in there just being a single unmask
operation prior to expecting the first interrupts to arrive. However,
we've had a chicken-and-egg problem here: Qemu invokes
xc_domain_update_msi_irq(), ultimately leading to
msixtbl_pt_register(), upon seeing that first unmask operation. Yet
for msixtbl_range() to return true (in order to msixtbl_write() to get
invoked at all) msixtbl_pt_register() must have completed.

Deal with this by snooping suitable writes in msixtbl_range() and
triggering the invocation of msix_write_completion() from
msixtbl_pt_register() when that happens in the context of a still in
progress vector control field write.

Note that the seemingly unrelated deletion of the redundant
irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
any compiler version used that the "msi_desc" local variable isn't
being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
just for consistency reasons.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
TODO: remove //temp-s
TODO: (follow-up patch) defer and conditionalize msixtbl_init() invocation
TODO: (follow-up patch) drop msixtbl_list_lock
TODO: (follow-up patch) drop pci_msix_get_table_len()

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -276,6 +276,7 @@ static int msixtbl_write(struct vcpu *v,
     if ( !entry )
         goto out;
     nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+printk("%pv: write MSI-X#%u: [%lx]=%0*lx\n", v, nr_entry, address, (int)len * 2, val);//temp
 
     offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
     if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
@@ -321,7 +322,16 @@ static int msixtbl_write(struct vcpu *v,
 
     ASSERT(msi_desc == desc->msi_desc);
    
+{//temp
+ bool_t h = msi_desc->msi_attrib.host_masked;
+ bool_t g = msi_desc->msi_attrib.guest_masked;
+ bool_t ha = entry->pdev->msix->host_maskall;
+ bool_t ga = entry->pdev->msix->guest_maskall;
     guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
+ printk("%pv: MSI-X#%u %d(%d) / %d(%d) -> %d(%d) / %d(%d)\n", v, nr_entry, h, ha, g, ga,
+        msi_desc->msi_attrib.host_masked, entry->pdev->msix->host_maskall,
+        msi_desc->msi_attrib.guest_masked, entry->pdev->msix->guest_maskall);
+}
 
 unlock:
     spin_unlock_irqrestore(&desc->lock, flags);
@@ -330,6 +340,7 @@ unlock:
 
 out:
     rcu_read_unlock(&msixtbl_rcu_lock);
+printk("%pv: write MSI-X [%lx] -> %d\n", v, address, r);//temp
     return r;
 }
 
@@ -341,7 +352,20 @@ static int msixtbl_range(struct vcpu *v,
     desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
     rcu_read_unlock(&msixtbl_rcu_lock);
 
-    return !!desc;
+    if ( desc )
+        return 1;
+
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
+    {
+        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
+
+        ASSERT(r->type == IOREQ_TYPE_COPY);
+        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
+             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
+            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
+    }
+
+    return 0;
 }
 
 static const struct hvm_mmio_ops msixtbl_mmio_ops = {
@@ -360,6 +384,7 @@ static void add_msixtbl_entry(struct dom
     atomic_set(&entry->refcnt, 0);
 
     entry->table_len = pci_msix_get_table_len(pdev);
+WARN_ON(entry->table_len != (pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE));//temp
     entry->pdev = pdev;
     entry->gtable = (unsigned long) gtable;
 
@@ -410,9 +435,6 @@ int msixtbl_pt_register(struct domain *d
         return r;
     }
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -437,6 +459,24 @@ found:
 out:
     spin_unlock_irq(&irq_desc->lock);
     xfree(new_entry);
+
+    if ( !r )
+    {
+        struct vcpu *v;
+
+printk("d%d: MSI-X#%u %u@%lx\n", d->domain_id, msi_desc->msi_attrib.entry_nr, entry->table_len, gtable);//temp
+        for_each_vcpu ( d, v )
+        {
+if(v->arch.hvm_vcpu.hvm_io.msix_snoop_address)//temp
+ printk("%pv: MSI-X#%u snoop %08lx\n", v, msi_desc->msi_attrib.entry_nr, v->arch.hvm_vcpu.hvm_io.msix_snoop_address);//temp
+            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
+                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
+                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
+                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
+        }
+    }
+
     return r;
 }
 
@@ -457,9 +497,6 @@ void msixtbl_pt_unregister(struct domain
     if ( !irq_desc )
         return;
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -522,6 +559,8 @@ void msix_write_completion(struct vcpu *
 {
     unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
 
+    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
+
     if ( !ctrl_address )
         return;
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -85,6 +85,7 @@ struct hvm_vcpu_io {
     bool_t mmio_retry;
 
     unsigned long msix_unmask_address;
+    unsigned long msix_snoop_address;
 
     const struct g2m_ioport *g2m_ioport;
 };



[-- Attachment #2: x86-vMSI-X-first-unmask.patch --]
[-- Type: text/plain, Size: 5605 bytes --]

x86/vMSI-X: avoid missing first unmask of vectors

Recent changes to Linux result in there just being a single unmask
operation prior to expecting the first interrupts to arrive. However,
we've had a chicken-and-egg problem here: Qemu invokes
xc_domain_update_msi_irq(), ultimately leading to
msixtbl_pt_register(), upon seeing that first unmask operation. Yet
for msixtbl_range() to return true (in order to msixtbl_write() to get
invoked at all) msixtbl_pt_register() must have completed.

Deal with this by snooping suitable writes in msixtbl_range() and
triggering the invocation of msix_write_completion() from
msixtbl_pt_register() when that happens in the context of a still in
progress vector control field write.

Note that the seemingly unrelated deletion of the redundant
irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
any compiler version used that the "msi_desc" local variable isn't
being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
just for consistency reasons.)

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
TODO: remove //temp-s
TODO: (follow-up patch) defer and conditionalize msixtbl_init() invocation
TODO: (follow-up patch) drop msixtbl_list_lock
TODO: (follow-up patch) drop pci_msix_get_table_len()

--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -276,6 +276,7 @@ static int msixtbl_write(struct vcpu *v,
     if ( !entry )
         goto out;
     nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
+printk("%pv: write MSI-X#%u: [%lx]=%0*lx\n", v, nr_entry, address, (int)len * 2, val);//temp
 
     offset = address & (PCI_MSIX_ENTRY_SIZE - 1);
     if ( offset != PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
@@ -321,7 +322,16 @@ static int msixtbl_write(struct vcpu *v,
 
     ASSERT(msi_desc == desc->msi_desc);
    
+{//temp
+ bool_t h = msi_desc->msi_attrib.host_masked;
+ bool_t g = msi_desc->msi_attrib.guest_masked;
+ bool_t ha = entry->pdev->msix->host_maskall;
+ bool_t ga = entry->pdev->msix->guest_maskall;
     guest_mask_msi_irq(desc, !!(val & PCI_MSIX_VECTOR_BITMASK));
+ printk("%pv: MSI-X#%u %d(%d) / %d(%d) -> %d(%d) / %d(%d)\n", v, nr_entry, h, ha, g, ga,
+        msi_desc->msi_attrib.host_masked, entry->pdev->msix->host_maskall,
+        msi_desc->msi_attrib.guest_masked, entry->pdev->msix->guest_maskall);
+}
 
 unlock:
     spin_unlock_irqrestore(&desc->lock, flags);
@@ -330,6 +340,7 @@ unlock:
 
 out:
     rcu_read_unlock(&msixtbl_rcu_lock);
+printk("%pv: write MSI-X [%lx] -> %d\n", v, address, r);//temp
     return r;
 }
 
@@ -341,7 +352,20 @@ static int msixtbl_range(struct vcpu *v,
     desc = msixtbl_addr_to_desc(msixtbl_find_entry(v, addr), addr);
     rcu_read_unlock(&msixtbl_rcu_lock);
 
-    return !!desc;
+    if ( desc )
+        return 1;
+
+    if ( (addr & (PCI_MSIX_ENTRY_SIZE - 1)) == PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET )
+    {
+        const ioreq_t *r = &v->arch.hvm_vcpu.hvm_io.io_req;
+
+        ASSERT(r->type == IOREQ_TYPE_COPY);
+        if ( r->dir == IOREQ_WRITE && r->size == 4 && !r->data_is_ptr
+             && !(r->data & PCI_MSIX_VECTOR_BITMASK) )
+            v->arch.hvm_vcpu.hvm_io.msix_snoop_address = addr;
+    }
+
+    return 0;
 }
 
 static const struct hvm_mmio_ops msixtbl_mmio_ops = {
@@ -360,6 +384,7 @@ static void add_msixtbl_entry(struct dom
     atomic_set(&entry->refcnt, 0);
 
     entry->table_len = pci_msix_get_table_len(pdev);
+WARN_ON(entry->table_len != (pdev->msix->nr_entries * PCI_MSIX_ENTRY_SIZE));//temp
     entry->pdev = pdev;
     entry->gtable = (unsigned long) gtable;
 
@@ -410,9 +435,6 @@ int msixtbl_pt_register(struct domain *d
         return r;
     }
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -437,6 +459,24 @@ found:
 out:
     spin_unlock_irq(&irq_desc->lock);
     xfree(new_entry);
+
+    if ( !r )
+    {
+        struct vcpu *v;
+
+printk("d%d: MSI-X#%u %u@%lx\n", d->domain_id, msi_desc->msi_attrib.entry_nr, entry->table_len, gtable);//temp
+        for_each_vcpu ( d, v )
+        {
+if(v->arch.hvm_vcpu.hvm_io.msix_snoop_address)//temp
+ printk("%pv: MSI-X#%u snoop %08lx\n", v, msi_desc->msi_attrib.entry_nr, v->arch.hvm_vcpu.hvm_io.msix_snoop_address);//temp
+            if ( v->arch.hvm_vcpu.hvm_io.msix_snoop_address ==
+                 (gtable + msi_desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
+                  PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET) )
+                v->arch.hvm_vcpu.hvm_io.msix_unmask_address =
+                    v->arch.hvm_vcpu.hvm_io.msix_snoop_address;
+        }
+    }
+
     return r;
 }
 
@@ -457,9 +497,6 @@ void msixtbl_pt_unregister(struct domain
     if ( !irq_desc )
         return;
 
-    if ( !irq_desc->msi_desc )
-        goto out;
-
     msi_desc = irq_desc->msi_desc;
     if ( !msi_desc )
         goto out;
@@ -522,6 +559,8 @@ void msix_write_completion(struct vcpu *
 {
     unsigned long ctrl_address = v->arch.hvm_vcpu.hvm_io.msix_unmask_address;
 
+    v->arch.hvm_vcpu.hvm_io.msix_snoop_address = 0;
+
     if ( !ctrl_address )
         return;
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -85,6 +85,7 @@ struct hvm_vcpu_io {
     bool_t mmio_retry;
 
     unsigned long msix_unmask_address;
+    unsigned long msix_snoop_address;
 
     const struct g2m_ioport *g2m_ioport;
 };

[-- Attachment #3: Type: text/plain, Size: 126 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01  9:15 [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors Jan Beulich
@ 2016-04-01  9:58 ` Jan Beulich
  2016-04-01 10:56   ` Paul Durrant
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-04-01  9:58 UTC (permalink / raw)
  To: xen-devel
  Cc: anthony.perard, Andrew Cooper, Paul Durrant, Keir Fraser,
	Stefano Stabellini

>>> On 01.04.16 at 11:15, <JBeulich@suse.com> wrote:
> Recent changes to Linux result in there just being a single unmask
> operation prior to expecting the first interrupts to arrive. However,
> we've had a chicken-and-egg problem here: Qemu invokes
> xc_domain_update_msi_irq(), ultimately leading to
> msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> for msixtbl_range() to return true (in order to msixtbl_write() to get
> invoked at all) msixtbl_pt_register() must have completed.
> 
> Deal with this by snooping suitable writes in msixtbl_range() and
> triggering the invocation of msix_write_completion() from
> msixtbl_pt_register() when that happens in the context of a still in
> progress vector control field write.
> 
> Note that the seemingly unrelated deletion of the redundant
> irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
> any compiler version used that the "msi_desc" local variable isn't
> being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
> just for consistency reasons.)
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?

Some more detail on the thoughts I so far had for this aspect:
It has always been puzzling me that the hypervisor doesn't
see _all_ the MSI-X table accesses (which is a result of the
addresses only getting registered via XEN_DOMCTL_bind_pt_irq);
it's quite natural that this is an at least latent issue possibly
causing guest misbehavior. I cannot, however, currently see any
way to address this without altering both Xen and qemu, since for
Xen to see all accesses it would need to become aware of the
GPA of the MSI-X table much earlier (read: before the domain
actually start, or at the latest when the domain first enables
memory decoding on the device).

The mapping of the MMIO BARs of the device into guest memory,
however, intentionally excludes the page(s) covering the MSI-X
table, so the hypervisor can't become aware of them by just
looking at data it gets presented today. Hence either we need to
add some new hypercall for qemu to invoke, or we need to make
qemu map the full BAR ranges, filtering out MSI-X table pages
in the hypervisor (using those mapping requests just to learn the
GPA of the MSI-X table, without entering them into P2M).

Unless someone can think of a way which doesn't require altering
both qemu and Xen (creating the well known compatibility issues
between unmatched pairs), I think the patch as presented should
be okay without handling this case, i.e. best possible effort, and a
subsequent change then ought to be to deal with this by changing
both components. In which case I'd suggest that the change here
go into 4.7, but the full fix would then likely need deferring until
4.8.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01  9:58 ` Jan Beulich
@ 2016-04-01 10:56   ` Paul Durrant
  2016-04-01 11:21     ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Paul Durrant @ 2016-04-01 10:56 UTC (permalink / raw)
  To: Jan Beulich, xen-devel
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org), Stefano Stabellini

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 01 April 2016 10:59
> To: xen-devel
> Cc: Andrew Cooper; Anthony Perard; Paul Durrant; Stefano Stabellini; Keir
> (Xen.org)
> Subject: Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> 
> >>> On 01.04.16 at 11:15, <JBeulich@suse.com> wrote:
> > Recent changes to Linux result in there just being a single unmask
> > operation prior to expecting the first interrupts to arrive. However,
> > we've had a chicken-and-egg problem here: Qemu invokes
> > xc_domain_update_msi_irq(), ultimately leading to
> > msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> > for msixtbl_range() to return true (in order to msixtbl_write() to get
> > invoked at all) msixtbl_pt_register() must have completed.
> >
> > Deal with this by snooping suitable writes in msixtbl_range() and
> > triggering the invocation of msix_write_completion() from
> > msixtbl_pt_register() when that happens in the context of a still in
> > progress vector control field write.
> >
> > Note that the seemingly unrelated deletion of the redundant
> > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
> > any compiler version used that the "msi_desc" local variable isn't
> > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
> > just for consistency reasons.)
> >
> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> > ---
> > TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
> 
> Some more detail on the thoughts I so far had for this aspect:
> It has always been puzzling me that the hypervisor doesn't
> see _all_ the MSI-X table accesses (which is a result of the
> addresses only getting registered via XEN_DOMCTL_bind_pt_irq);
> it's quite natural that this is an at least latent issue possibly
> causing guest misbehavior. I cannot, however, currently see any
> way to address this without altering both Xen and qemu, since for
> Xen to see all accesses it would need to become aware of the
> GPA of the MSI-X table much earlier (read: before the domain
> actually start, or at the latest when the domain first enables
> memory decoding on the device).
> 
> The mapping of the MMIO BARs of the device into guest memory,
> however, intentionally excludes the page(s) covering the MSI-X
> table, so the hypervisor can't become aware of them by just
> looking at data it gets presented today. Hence either we need to
> add some new hypercall for qemu to invoke, or we need to make
> qemu map the full BAR ranges, filtering out MSI-X table pages
> in the hypervisor (using those mapping requests just to learn the
> GPA of the MSI-X table, without entering them into P2M).
> 
> Unless someone can think of a way which doesn't require altering
> both qemu and Xen (creating the well known compatibility issues
> between unmatched pairs), I think the patch as presented should
> be okay without handling this case, i.e. best possible effort, and a
> subsequent change then ought to be to deal with this by changing
> both components. In which case I'd suggest that the change here
> go into 4.7, but the full fix would then likely need deferring until
> 4.8.
> 

I guess it could be handled entirely in Xen if we are willing to snoop on PCI configuration. It would not be too hard to snoop guest writes to the BARs in config space so that Xen can keep track of where they are. Snooping on the MSI-X capability could then tell Xen when to start interposing on the table, and allow it to discover the GPA at that point (via the BIR and offset values).

  Paul

> Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 10:56   ` Paul Durrant
@ 2016-04-01 11:21     ` Jan Beulich
  2016-04-01 13:01       ` Paul Durrant
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-04-01 11:21 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

>>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
>>  -----Original Message-----
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 01 April 2016 10:59
>> To: xen-devel
>> Cc: Andrew Cooper; Anthony Perard; Paul Durrant; Stefano Stabellini; Keir
>> (Xen.org)
>> Subject: Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
>> 
>> >>> On 01.04.16 at 11:15, <JBeulich@suse.com> wrote:
>> > Recent changes to Linux result in there just being a single unmask
>> > operation prior to expecting the first interrupts to arrive. However,
>> > we've had a chicken-and-egg problem here: Qemu invokes
>> > xc_domain_update_msi_irq(), ultimately leading to
>> > msixtbl_pt_register(), upon seeing that first unmask operation. Yet
>> > for msixtbl_range() to return true (in order to msixtbl_write() to get
>> > invoked at all) msixtbl_pt_register() must have completed.
>> >
>> > Deal with this by snooping suitable writes in msixtbl_range() and
>> > triggering the invocation of msix_write_completion() from
>> > msixtbl_pt_register() when that happens in the context of a still in
>> > progress vector control field write.
>> >
>> > Note that the seemingly unrelated deletion of the redundant
>> > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
>> > any compiler version used that the "msi_desc" local variable isn't
>> > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
>> > just for consistency reasons.)
>> >
>> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> > ---
>> > TODO: How to deal with REP MOVS to the MSI-X table (in msixtbl_range())?
>> 
>> Some more detail on the thoughts I so far had for this aspect:
>> It has always been puzzling me that the hypervisor doesn't
>> see _all_ the MSI-X table accesses (which is a result of the
>> addresses only getting registered via XEN_DOMCTL_bind_pt_irq);
>> it's quite natural that this is an at least latent issue possibly
>> causing guest misbehavior. I cannot, however, currently see any
>> way to address this without altering both Xen and qemu, since for
>> Xen to see all accesses it would need to become aware of the
>> GPA of the MSI-X table much earlier (read: before the domain
>> actually start, or at the latest when the domain first enables
>> memory decoding on the device).
>> 
>> The mapping of the MMIO BARs of the device into guest memory,
>> however, intentionally excludes the page(s) covering the MSI-X
>> table, so the hypervisor can't become aware of them by just
>> looking at data it gets presented today. Hence either we need to
>> add some new hypercall for qemu to invoke, or we need to make
>> qemu map the full BAR ranges, filtering out MSI-X table pages
>> in the hypervisor (using those mapping requests just to learn the
>> GPA of the MSI-X table, without entering them into P2M).
>> 
>> Unless someone can think of a way which doesn't require altering
>> both qemu and Xen (creating the well known compatibility issues
>> between unmatched pairs), I think the patch as presented should
>> be okay without handling this case, i.e. best possible effort, and a
>> subsequent change then ought to be to deal with this by changing
>> both components. In which case I'd suggest that the change here
>> go into 4.7, but the full fix would then likely need deferring until
>> 4.8.
> 
> I guess it could be handled entirely in Xen if we are willing to snoop on 
> PCI configuration. It would not be too hard to snoop guest writes to the BARs 
> in config space so that Xen can keep track of where they are. Snooping on the 
> MSI-X capability could then tell Xen when to start interposing on the table, 
> and allow it to discover the GPA at that point (via the BIR and offset 
> values).

Well, that's a possibility, but won't - afaict - work without qemu's
help at another point: So far we don't know the guest's PCI bus
topology, hence we can't correlate vBAR writes we might snoop
with the physical devices they correspond to.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 11:21     ` Jan Beulich
@ 2016-04-01 13:01       ` Paul Durrant
  2016-04-01 13:42         ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Paul Durrant @ 2016-04-01 13:01 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 01 April 2016 12:21
> To: Paul Durrant
> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> (Xen.org)
> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> 
> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
> >>  -----Original Message-----
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 01 April 2016 10:59
> >> To: xen-devel
> >> Cc: Andrew Cooper; Anthony Perard; Paul Durrant; Stefano Stabellini; Keir
> >> (Xen.org)
> >> Subject: Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of
> vectors
> >>
> >> >>> On 01.04.16 at 11:15, <JBeulich@suse.com> wrote:
> >> > Recent changes to Linux result in there just being a single unmask
> >> > operation prior to expecting the first interrupts to arrive. However,
> >> > we've had a chicken-and-egg problem here: Qemu invokes
> >> > xc_domain_update_msi_irq(), ultimately leading to
> >> > msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> >> > for msixtbl_range() to return true (in order to msixtbl_write() to get
> >> > invoked at all) msixtbl_pt_register() must have completed.
> >> >
> >> > Deal with this by snooping suitable writes in msixtbl_range() and
> >> > triggering the invocation of msix_write_completion() from
> >> > msixtbl_pt_register() when that happens in the context of a still in
> >> > progress vector control field write.
> >> >
> >> > Note that the seemingly unrelated deletion of the redundant
> >> > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
> >> > any compiler version used that the "msi_desc" local variable isn't
> >> > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
> >> > just for consistency reasons.)
> >> >
> >> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> > ---
> >> > TODO: How to deal with REP MOVS to the MSI-X table (in
> msixtbl_range())?
> >>
> >> Some more detail on the thoughts I so far had for this aspect:
> >> It has always been puzzling me that the hypervisor doesn't
> >> see _all_ the MSI-X table accesses (which is a result of the
> >> addresses only getting registered via XEN_DOMCTL_bind_pt_irq);
> >> it's quite natural that this is an at least latent issue possibly
> >> causing guest misbehavior. I cannot, however, currently see any
> >> way to address this without altering both Xen and qemu, since for
> >> Xen to see all accesses it would need to become aware of the
> >> GPA of the MSI-X table much earlier (read: before the domain
> >> actually start, or at the latest when the domain first enables
> >> memory decoding on the device).
> >>
> >> The mapping of the MMIO BARs of the device into guest memory,
> >> however, intentionally excludes the page(s) covering the MSI-X
> >> table, so the hypervisor can't become aware of them by just
> >> looking at data it gets presented today. Hence either we need to
> >> add some new hypercall for qemu to invoke, or we need to make
> >> qemu map the full BAR ranges, filtering out MSI-X table pages
> >> in the hypervisor (using those mapping requests just to learn the
> >> GPA of the MSI-X table, without entering them into P2M).
> >>
> >> Unless someone can think of a way which doesn't require altering
> >> both qemu and Xen (creating the well known compatibility issues
> >> between unmatched pairs), I think the patch as presented should
> >> be okay without handling this case, i.e. best possible effort, and a
> >> subsequent change then ought to be to deal with this by changing
> >> both components. In which case I'd suggest that the change here
> >> go into 4.7, but the full fix would then likely need deferring until
> >> 4.8.
> >
> > I guess it could be handled entirely in Xen if we are willing to snoop on
> > PCI configuration. It would not be too hard to snoop guest writes to the
> BARs
> > in config space so that Xen can keep track of where they are. Snooping on
> the
> > MSI-X capability could then tell Xen when to start interposing on the table,
> > and allow it to discover the GPA at that point (via the BIR and offset
> > values).
> 
> Well, that's a possibility, but won't - afaict - work without qemu's
> help at another point: So far we don't know the guest's PCI bus
> topology, hence we can't correlate vBAR writes we might snoop
> with the physical devices they correspond to.
> 

Does Xen need to know that correspondence just to track state? I thought the problem here was that Xen does not see every guest access to an MSI-X table. If Xen always interposes on MSI-X tables then it can at least track the state of the emulated table, even if we end up just forwarding the access for QEMU to handle at first. When the mapping is created to the actual h/w table then, presumably, Xen's idea of state should correspond to QEMU's.

  Paul

> Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 13:01       ` Paul Durrant
@ 2016-04-01 13:42         ` Jan Beulich
  2016-04-01 13:54           ` Paul Durrant
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-04-01 13:42 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

>>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 01 April 2016 12:21
>> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
>> >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> >> Sent: 01 April 2016 10:59
>> >> >>> On 01.04.16 at 11:15, <JBeulich@suse.com> wrote:
>> >> > Recent changes to Linux result in there just being a single unmask
>> >> > operation prior to expecting the first interrupts to arrive. However,
>> >> > we've had a chicken-and-egg problem here: Qemu invokes
>> >> > xc_domain_update_msi_irq(), ultimately leading to
>> >> > msixtbl_pt_register(), upon seeing that first unmask operation. Yet
>> >> > for msixtbl_range() to return true (in order to msixtbl_write() to get
>> >> > invoked at all) msixtbl_pt_register() must have completed.
>> >> >
>> >> > Deal with this by snooping suitable writes in msixtbl_range() and
>> >> > triggering the invocation of msix_write_completion() from
>> >> > msixtbl_pt_register() when that happens in the context of a still in
>> >> > progress vector control field write.
>> >> >
>> >> > Note that the seemingly unrelated deletion of the redundant
>> >> > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear to
>> >> > any compiler version used that the "msi_desc" local variable isn't
>> >> > being used uninitialized. (Doing the same in msixtbl_pt_unregister() is
>> >> > just for consistency reasons.)
>> >> >
>> >> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> >> > ---
>> >> > TODO: How to deal with REP MOVS to the MSI-X table (in
>> msixtbl_range())?
>> >>
>> >> Some more detail on the thoughts I so far had for this aspect:
>> >> It has always been puzzling me that the hypervisor doesn't
>> >> see _all_ the MSI-X table accesses (which is a result of the
>> >> addresses only getting registered via XEN_DOMCTL_bind_pt_irq);
>> >> it's quite natural that this is an at least latent issue possibly
>> >> causing guest misbehavior. I cannot, however, currently see any
>> >> way to address this without altering both Xen and qemu, since for
>> >> Xen to see all accesses it would need to become aware of the
>> >> GPA of the MSI-X table much earlier (read: before the domain
>> >> actually start, or at the latest when the domain first enables
>> >> memory decoding on the device).
>> >>
>> >> The mapping of the MMIO BARs of the device into guest memory,
>> >> however, intentionally excludes the page(s) covering the MSI-X
>> >> table, so the hypervisor can't become aware of them by just
>> >> looking at data it gets presented today. Hence either we need to
>> >> add some new hypercall for qemu to invoke, or we need to make
>> >> qemu map the full BAR ranges, filtering out MSI-X table pages
>> >> in the hypervisor (using those mapping requests just to learn the
>> >> GPA of the MSI-X table, without entering them into P2M).
>> >>
>> >> Unless someone can think of a way which doesn't require altering
>> >> both qemu and Xen (creating the well known compatibility issues
>> >> between unmatched pairs), I think the patch as presented should
>> >> be okay without handling this case, i.e. best possible effort, and a
>> >> subsequent change then ought to be to deal with this by changing
>> >> both components. In which case I'd suggest that the change here
>> >> go into 4.7, but the full fix would then likely need deferring until
>> >> 4.8.
>> >
>> > I guess it could be handled entirely in Xen if we are willing to snoop on
>> > PCI configuration. It would not be too hard to snoop guest writes to the
>> BARs
>> > in config space so that Xen can keep track of where they are. Snooping on
>> the
>> > MSI-X capability could then tell Xen when to start interposing on the table,
>> > and allow it to discover the GPA at that point (via the BIR and offset
>> > values).
>> 
>> Well, that's a possibility, but won't - afaict - work without qemu's
>> help at another point: So far we don't know the guest's PCI bus
>> topology, hence we can't correlate vBAR writes we might snoop
>> with the physical devices they correspond to.
> 
> Does Xen need to know that correspondence just to track state? I thought the 
> problem here was that Xen does not see every guest access to an MSI-X table. 
> If Xen always interposes on MSI-X tables then it can at least track the state 
> of the emulated table, even if we end up just forwarding the access for QEMU 
> to handle at first. When the mapping is created to the actual h/w table then, 
> presumably, Xen's idea of state should correspond to QEMU's.

But Xen doesn't see the guest view of config space, and the
whereabouts of the MSI-X table are in read-only config space fields
(i.e. snooping writes to those doesn't make sense, as the guest may
not ever write them or write anything, which would just get dropped
on the floor).

And additionally msixtbl_addr_to_desc() needs to know the physical
device.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 13:42         ` Jan Beulich
@ 2016-04-01 13:54           ` Paul Durrant
  2016-04-01 14:17             ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Paul Durrant @ 2016-04-01 13:54 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 01 April 2016 14:43
> To: Paul Durrant
> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> (Xen.org)
> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> 
> >>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 01 April 2016 12:21
> >> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> >> Sent: 01 April 2016 10:59
> >> >> >>> On 01.04.16 at 11:15, <JBeulich@suse.com> wrote:
> >> >> > Recent changes to Linux result in there just being a single unmask
> >> >> > operation prior to expecting the first interrupts to arrive. However,
> >> >> > we've had a chicken-and-egg problem here: Qemu invokes
> >> >> > xc_domain_update_msi_irq(), ultimately leading to
> >> >> > msixtbl_pt_register(), upon seeing that first unmask operation. Yet
> >> >> > for msixtbl_range() to return true (in order to msixtbl_write() to get
> >> >> > invoked at all) msixtbl_pt_register() must have completed.
> >> >> >
> >> >> > Deal with this by snooping suitable writes in msixtbl_range() and
> >> >> > triggering the invocation of msix_write_completion() from
> >> >> > msixtbl_pt_register() when that happens in the context of a still in
> >> >> > progress vector control field write.
> >> >> >
> >> >> > Note that the seemingly unrelated deletion of the redundant
> >> >> > irq_desc->msi_desc checks in msixtbl_pt_register() is to make clear
> to
> >> >> > any compiler version used that the "msi_desc" local variable isn't
> >> >> > being used uninitialized. (Doing the same in msixtbl_pt_unregister()
> is
> >> >> > just for consistency reasons.)
> >> >> >
> >> >> > Signed-off-by: Jan Beulich <jbeulich@suse.com>
> >> >> > ---
> >> >> > TODO: How to deal with REP MOVS to the MSI-X table (in
> >> msixtbl_range())?
> >> >>
> >> >> Some more detail on the thoughts I so far had for this aspect:
> >> >> It has always been puzzling me that the hypervisor doesn't
> >> >> see _all_ the MSI-X table accesses (which is a result of the
> >> >> addresses only getting registered via XEN_DOMCTL_bind_pt_irq);
> >> >> it's quite natural that this is an at least latent issue possibly
> >> >> causing guest misbehavior. I cannot, however, currently see any
> >> >> way to address this without altering both Xen and qemu, since for
> >> >> Xen to see all accesses it would need to become aware of the
> >> >> GPA of the MSI-X table much earlier (read: before the domain
> >> >> actually start, or at the latest when the domain first enables
> >> >> memory decoding on the device).
> >> >>
> >> >> The mapping of the MMIO BARs of the device into guest memory,
> >> >> however, intentionally excludes the page(s) covering the MSI-X
> >> >> table, so the hypervisor can't become aware of them by just
> >> >> looking at data it gets presented today. Hence either we need to
> >> >> add some new hypercall for qemu to invoke, or we need to make
> >> >> qemu map the full BAR ranges, filtering out MSI-X table pages
> >> >> in the hypervisor (using those mapping requests just to learn the
> >> >> GPA of the MSI-X table, without entering them into P2M).
> >> >>
> >> >> Unless someone can think of a way which doesn't require altering
> >> >> both qemu and Xen (creating the well known compatibility issues
> >> >> between unmatched pairs), I think the patch as presented should
> >> >> be okay without handling this case, i.e. best possible effort, and a
> >> >> subsequent change then ought to be to deal with this by changing
> >> >> both components. In which case I'd suggest that the change here
> >> >> go into 4.7, but the full fix would then likely need deferring until
> >> >> 4.8.
> >> >
> >> > I guess it could be handled entirely in Xen if we are willing to snoop on
> >> > PCI configuration. It would not be too hard to snoop guest writes to the
> >> BARs
> >> > in config space so that Xen can keep track of where they are. Snooping
> on
> >> the
> >> > MSI-X capability could then tell Xen when to start interposing on the
> table,
> >> > and allow it to discover the GPA at that point (via the BIR and offset
> >> > values).
> >>
> >> Well, that's a possibility, but won't - afaict - work without qemu's
> >> help at another point: So far we don't know the guest's PCI bus
> >> topology, hence we can't correlate vBAR writes we might snoop
> >> with the physical devices they correspond to.
> >
> > Does Xen need to know that correspondence just to track state? I thought
> the
> > problem here was that Xen does not see every guest access to an MSI-X
> table.
> > If Xen always interposes on MSI-X tables then it can at least track the state
> > of the emulated table, even if we end up just forwarding the access for
> QEMU
> > to handle at first. When the mapping is created to the actual h/w table
> then,
> > presumably, Xen's idea of state should correspond to QEMU's.
> 
> But Xen doesn't see the guest view of config space,

Well Xen interposes on every single config cycle so arguably it sees exactly what the guest sees.

> and the
> whereabouts of the MSI-X table are in read-only config space fields
> (i.e. snooping writes to those doesn't make sense, as the guest may
> not ever write them or write anything, which would just get dropped
> on the floor).

I was thinking of snooping the reads. The guest has to find out where the table is before it can write to it, right? Or Xen could even pro-actively read the capability chain of any pcidev registered with it.

> 
> And additionally msixtbl_addr_to_desc() needs to know the physical
> device.
> 

Yes, but msixtbl_range() could be trivially changed to accept any access where msixtbl_find_entry() returns non-NULL. That would allow msixtbl_write() to manipulate entry->flags even if msixtbl_addr_to_desc() returns NULL.

  Paul

> Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 13:54           ` Paul Durrant
@ 2016-04-01 14:17             ` Jan Beulich
  2016-04-01 14:19               ` Paul Durrant
  2016-04-01 14:27               ` Paul Durrant
  0 siblings, 2 replies; 14+ messages in thread
From: Jan Beulich @ 2016-04-01 14:17 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

>>> On 01.04.16 at 15:54, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 01 April 2016 14:43
>> >>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
>> >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> >> Sent: 01 April 2016 12:21
>> >> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
>> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> >> >> Sent: 01 April 2016 10:59
>> >> > I guess it could be handled entirely in Xen if we are willing to snoop on
>> >> > PCI configuration. It would not be too hard to snoop guest writes to the
>> >> BARs
>> >> > in config space so that Xen can keep track of where they are. Snooping
>> on
>> >> the
>> >> > MSI-X capability could then tell Xen when to start interposing on the
>> table,
>> >> > and allow it to discover the GPA at that point (via the BIR and offset
>> >> > values).
>> >>
>> >> Well, that's a possibility, but won't - afaict - work without qemu's
>> >> help at another point: So far we don't know the guest's PCI bus
>> >> topology, hence we can't correlate vBAR writes we might snoop
>> >> with the physical devices they correspond to.
>> >
>> > Does Xen need to know that correspondence just to track state? I thought
>> the
>> > problem here was that Xen does not see every guest access to an MSI-X
>> table.
>> > If Xen always interposes on MSI-X tables then it can at least track the 
> state
>> > of the emulated table, even if we end up just forwarding the access for
>> QEMU
>> > to handle at first. When the mapping is created to the actual h/w table
>> then,
>> > presumably, Xen's idea of state should correspond to QEMU's.
>> 
>> But Xen doesn't see the guest view of config space,
> 
> Well Xen interposes on every single config cycle so arguably it sees exactly 
> what the guest sees.

Ah, so you mean to snoop what qemu returns. Yes, that would be
an option.

>> And additionally msixtbl_addr_to_desc() needs to know the physical
>> device.
> 
> Yes, but msixtbl_range() could be trivially changed to accept any access 
> where msixtbl_find_entry() returns non-NULL. That would allow msixtbl_write() 
> to manipulate entry->flags even if msixtbl_addr_to_desc() returns NULL.

Are you looking at some old code base? There's no entry->flags
manipulation. We call guest_mask_msi_irq(), and for that we need
to know the IRQ descriptor, which in turn requires knowing the
pdev (for msixtbl_addr_to_desc() to return non-NULL).

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 14:17             ` Jan Beulich
@ 2016-04-01 14:19               ` Paul Durrant
  2016-04-01 14:28                 ` Jan Beulich
  2016-04-01 14:27               ` Paul Durrant
  1 sibling, 1 reply; 14+ messages in thread
From: Paul Durrant @ 2016-04-01 14:19 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 01 April 2016 15:18
> To: Paul Durrant
> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> (Xen.org)
> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> 
> >>> On 01.04.16 at 15:54, <Paul.Durrant@citrix.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 01 April 2016 14:43
> >> >>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> >> Sent: 01 April 2016 12:21
> >> >> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
> >> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> >> >> Sent: 01 April 2016 10:59
> >> >> > I guess it could be handled entirely in Xen if we are willing to snoop
> on
> >> >> > PCI configuration. It would not be too hard to snoop guest writes to
> the
> >> >> BARs
> >> >> > in config space so that Xen can keep track of where they are.
> Snooping
> >> on
> >> >> the
> >> >> > MSI-X capability could then tell Xen when to start interposing on the
> >> table,
> >> >> > and allow it to discover the GPA at that point (via the BIR and offset
> >> >> > values).
> >> >>
> >> >> Well, that's a possibility, but won't - afaict - work without qemu's
> >> >> help at another point: So far we don't know the guest's PCI bus
> >> >> topology, hence we can't correlate vBAR writes we might snoop
> >> >> with the physical devices they correspond to.
> >> >
> >> > Does Xen need to know that correspondence just to track state? I
> thought
> >> the
> >> > problem here was that Xen does not see every guest access to an MSI-X
> >> table.
> >> > If Xen always interposes on MSI-X tables then it can at least track the
> > state
> >> > of the emulated table, even if we end up just forwarding the access for
> >> QEMU
> >> > to handle at first. When the mapping is created to the actual h/w table
> >> then,
> >> > presumably, Xen's idea of state should correspond to QEMU's.
> >>
> >> But Xen doesn't see the guest view of config space,
> >
> > Well Xen interposes on every single config cycle so arguably it sees exactly
> > what the guest sees.
> 
> Ah, so you mean to snoop what qemu returns. Yes, that would be
> an option.
> 
> >> And additionally msixtbl_addr_to_desc() needs to know the physical
> >> device.
> >
> > Yes, but msixtbl_range() could be trivially changed to accept any access
> > where msixtbl_find_entry() returns non-NULL. That would allow
> msixtbl_write()
> > to manipulate entry->flags even if msixtbl_addr_to_desc() returns NULL.
> 
> Are you looking at some old code base? There's no entry->flags
> manipulation. We call guest_mask_msi_irq(), and for that we need
> to know the IRQ descriptor, which in turn requires knowing the
> pdev (for msixtbl_addr_to_desc() to return non-NULL).

Ah, maybe I'm out of date. I haven't pulled for a day or so.

  Paul

> 
> Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 14:17             ` Jan Beulich
  2016-04-01 14:19               ` Paul Durrant
@ 2016-04-01 14:27               ` Paul Durrant
  2016-04-01 14:40                 ` Jan Beulich
  1 sibling, 1 reply; 14+ messages in thread
From: Paul Durrant @ 2016-04-01 14:27 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

> -----Original Message-----
> From: Paul Durrant
> Sent: 01 April 2016 15:20
> To: 'Jan Beulich'
> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> (Xen.org)
> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> 
> > -----Original Message-----
> > From: Jan Beulich [mailto:JBeulich@suse.com]
> > Sent: 01 April 2016 15:18
> > To: Paul Durrant
> > Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> > (Xen.org)
> > Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> >
> > >>> On 01.04.16 at 15:54, <Paul.Durrant@citrix.com> wrote:
> > >> From: Jan Beulich [mailto:JBeulich@suse.com]
> > >> Sent: 01 April 2016 14:43
> > >> >>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
> > >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> > >> >> Sent: 01 April 2016 12:21
> > >> >> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
> > >> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> > >> >> >> Sent: 01 April 2016 10:59
> > >> >> > I guess it could be handled entirely in Xen if we are willing to snoop
> > on
> > >> >> > PCI configuration. It would not be too hard to snoop guest writes to
> > the
> > >> >> BARs
> > >> >> > in config space so that Xen can keep track of where they are.
> > Snooping
> > >> on
> > >> >> the
> > >> >> > MSI-X capability could then tell Xen when to start interposing on
> the
> > >> table,
> > >> >> > and allow it to discover the GPA at that point (via the BIR and offset
> > >> >> > values).
> > >> >>
> > >> >> Well, that's a possibility, but won't - afaict - work without qemu's
> > >> >> help at another point: So far we don't know the guest's PCI bus
> > >> >> topology, hence we can't correlate vBAR writes we might snoop
> > >> >> with the physical devices they correspond to.
> > >> >
> > >> > Does Xen need to know that correspondence just to track state? I
> > thought
> > >> the
> > >> > problem here was that Xen does not see every guest access to an
> MSI-X
> > >> table.
> > >> > If Xen always interposes on MSI-X tables then it can at least track the
> > > state
> > >> > of the emulated table, even if we end up just forwarding the access
> for
> > >> QEMU
> > >> > to handle at first. When the mapping is created to the actual h/w table
> > >> then,
> > >> > presumably, Xen's idea of state should correspond to QEMU's.
> > >>
> > >> But Xen doesn't see the guest view of config space,
> > >
> > > Well Xen interposes on every single config cycle so arguably it sees
> exactly
> > > what the guest sees.
> >
> > Ah, so you mean to snoop what qemu returns. Yes, that would be
> > an option.
> >
> > >> And additionally msixtbl_addr_to_desc() needs to know the physical
> > >> device.
> > >
> > > Yes, but msixtbl_range() could be trivially changed to accept any access
> > > where msixtbl_find_entry() returns non-NULL. That would allow
> > msixtbl_write()
> > > to manipulate entry->flags even if msixtbl_addr_to_desc() returns NULL.
> >
> > Are you looking at some old code base? There's no entry->flags
> > manipulation. We call guest_mask_msi_irq(), and for that we need
> > to know the IRQ descriptor, which in turn requires knowing the
> > pdev (for msixtbl_addr_to_desc() to return non-NULL).
> 
> Ah, maybe I'm out of date. I haven't pulled for a day or so.
> 

I pulled staging and I still see (starting at line 300 in vmsi.c)

    /* Exit to device model when unmasking and address/data got modified. */
    if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
         test_and_clear_bit(nr_entry, &entry->table_flags) )
    {
        v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
        goto out;
    }

    msi_desc = msixtbl_addr_to_desc(entry, address);
    if ( !msi_desc || msi_desc->irq < 0 )
        goto out;

I was wrong about the name. I meant 'entry->table_flags', and that's clearly manipulated before calling msixtbl_addr_to_desc() so even if that returns NULL Xen still keeps in sync with QEMU AFAICT.

  Paul

>   Paul
> 
> >
> > Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 14:19               ` Paul Durrant
@ 2016-04-01 14:28                 ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2016-04-01 14:28 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

>>> On 01.04.16 at 16:19, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 01 April 2016 15:18
>> Are you looking at some old code base? There's no entry->flags
>> manipulation. We call guest_mask_msi_irq(), and for that we need
>> to know the IRQ descriptor, which in turn requires knowing the
>> pdev (for msixtbl_addr_to_desc() to return non-NULL).
> 
> Ah, maybe I'm out of date. I haven't pulled for a day or so.

Well, that's been this way for a couple of months.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 14:27               ` Paul Durrant
@ 2016-04-01 14:40                 ` Jan Beulich
  2016-04-01 14:58                   ` Paul Durrant
  0 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-04-01 14:40 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

>>> On 01.04.16 at 16:27, <Paul.Durrant@citrix.com> wrote:
>>  -----Original Message-----
>> From: Paul Durrant
>> Sent: 01 April 2016 15:20
>> To: 'Jan Beulich'
>> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
>> (Xen.org)
>> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
>> 
>> > -----Original Message-----
>> > From: Jan Beulich [mailto:JBeulich@suse.com]
>> > Sent: 01 April 2016 15:18
>> > To: Paul Durrant
>> > Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
>> > (Xen.org)
>> > Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
>> >
>> > >>> On 01.04.16 at 15:54, <Paul.Durrant@citrix.com> wrote:
>> > >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> > >> Sent: 01 April 2016 14:43
>> > >> >>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
>> > >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> > >> >> Sent: 01 April 2016 12:21
>> > >> >> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
>> > >> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
>> > >> >> >> Sent: 01 April 2016 10:59
>> > >> >> > I guess it could be handled entirely in Xen if we are willing to snoop
>> > on
>> > >> >> > PCI configuration. It would not be too hard to snoop guest writes to
>> > the
>> > >> >> BARs
>> > >> >> > in config space so that Xen can keep track of where they are.
>> > Snooping
>> > >> on
>> > >> >> the
>> > >> >> > MSI-X capability could then tell Xen when to start interposing on
>> the
>> > >> table,
>> > >> >> > and allow it to discover the GPA at that point (via the BIR and offset
>> > >> >> > values).
>> > >> >>
>> > >> >> Well, that's a possibility, but won't - afaict - work without qemu's
>> > >> >> help at another point: So far we don't know the guest's PCI bus
>> > >> >> topology, hence we can't correlate vBAR writes we might snoop
>> > >> >> with the physical devices they correspond to.
>> > >> >
>> > >> > Does Xen need to know that correspondence just to track state? I
>> > thought
>> > >> the
>> > >> > problem here was that Xen does not see every guest access to an
>> MSI-X
>> > >> table.
>> > >> > If Xen always interposes on MSI-X tables then it can at least track the
>> > > state
>> > >> > of the emulated table, even if we end up just forwarding the access
>> for
>> > >> QEMU
>> > >> > to handle at first. When the mapping is created to the actual h/w table
>> > >> then,
>> > >> > presumably, Xen's idea of state should correspond to QEMU's.
>> > >>
>> > >> But Xen doesn't see the guest view of config space,
>> > >
>> > > Well Xen interposes on every single config cycle so arguably it sees
>> exactly
>> > > what the guest sees.
>> >
>> > Ah, so you mean to snoop what qemu returns. Yes, that would be
>> > an option.
>> >
>> > >> And additionally msixtbl_addr_to_desc() needs to know the physical
>> > >> device.
>> > >
>> > > Yes, but msixtbl_range() could be trivially changed to accept any access
>> > > where msixtbl_find_entry() returns non-NULL. That would allow
>> > msixtbl_write()
>> > > to manipulate entry->flags even if msixtbl_addr_to_desc() returns NULL.
>> >
>> > Are you looking at some old code base? There's no entry->flags
>> > manipulation. We call guest_mask_msi_irq(), and for that we need
>> > to know the IRQ descriptor, which in turn requires knowing the
>> > pdev (for msixtbl_addr_to_desc() to return non-NULL).
>> 
>> Ah, maybe I'm out of date. I haven't pulled for a day or so.
>> 
> 
> I pulled staging and I still see (starting at line 300 in vmsi.c)
> 
>     /* Exit to device model when unmasking and address/data got modified. */
>     if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
>          test_and_clear_bit(nr_entry, &entry->table_flags) )
>     {
>         v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
>         goto out;
>     }
> 
>     msi_desc = msixtbl_addr_to_desc(entry, address);
>     if ( !msi_desc || msi_desc->irq < 0 )
>         goto out;
> 
> I was wrong about the name. I meant 'entry->table_flags', and that's clearly 
> manipulated before calling msixtbl_addr_to_desc() so even if that returns 
> NULL Xen still keeps in sync with QEMU AFAICT.

Staying in sync with qemu is not the problem. The problem is that
the re-invocation from msix_write_completion() then would get
past this, and find said NULL returned from msixtbl_addr_to_desc().

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 14:40                 ` Jan Beulich
@ 2016-04-01 14:58                   ` Paul Durrant
  2016-04-01 15:28                     ` Jan Beulich
  0 siblings, 1 reply; 14+ messages in thread
From: Paul Durrant @ 2016-04-01 14:58 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 01 April 2016 15:40
> To: Paul Durrant
> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> (Xen.org)
> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
> 
> >>> On 01.04.16 at 16:27, <Paul.Durrant@citrix.com> wrote:
> >>  -----Original Message-----
> >> From: Paul Durrant
> >> Sent: 01 April 2016 15:20
> >> To: 'Jan Beulich'
> >> Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> >> (Xen.org)
> >> Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of
> vectors
> >>
> >> > -----Original Message-----
> >> > From: Jan Beulich [mailto:JBeulich@suse.com]
> >> > Sent: 01 April 2016 15:18
> >> > To: Paul Durrant
> >> > Cc: Andrew Cooper; Anthony Perard; Stefano Stabellini; xen-devel; Keir
> >> > (Xen.org)
> >> > Subject: RE: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of
> vectors
> >> >
> >> > >>> On 01.04.16 at 15:54, <Paul.Durrant@citrix.com> wrote:
> >> > >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> > >> Sent: 01 April 2016 14:43
> >> > >> >>> On 01.04.16 at 15:01, <Paul.Durrant@citrix.com> wrote:
> >> > >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> > >> >> Sent: 01 April 2016 12:21
> >> > >> >> >>> On 01.04.16 at 12:56, <Paul.Durrant@citrix.com> wrote:
> >> > >> >> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> > >> >> >> Sent: 01 April 2016 10:59
> >> > >> >> > I guess it could be handled entirely in Xen if we are willing to
> snoop
> >> > on
> >> > >> >> > PCI configuration. It would not be too hard to snoop guest
> writes to
> >> > the
> >> > >> >> BARs
> >> > >> >> > in config space so that Xen can keep track of where they are.
> >> > Snooping
> >> > >> on
> >> > >> >> the
> >> > >> >> > MSI-X capability could then tell Xen when to start interposing on
> >> the
> >> > >> table,
> >> > >> >> > and allow it to discover the GPA at that point (via the BIR and
> offset
> >> > >> >> > values).
> >> > >> >>
> >> > >> >> Well, that's a possibility, but won't - afaict - work without qemu's
> >> > >> >> help at another point: So far we don't know the guest's PCI bus
> >> > >> >> topology, hence we can't correlate vBAR writes we might snoop
> >> > >> >> with the physical devices they correspond to.
> >> > >> >
> >> > >> > Does Xen need to know that correspondence just to track state? I
> >> > thought
> >> > >> the
> >> > >> > problem here was that Xen does not see every guest access to an
> >> MSI-X
> >> > >> table.
> >> > >> > If Xen always interposes on MSI-X tables then it can at least track
> the
> >> > > state
> >> > >> > of the emulated table, even if we end up just forwarding the
> access
> >> for
> >> > >> QEMU
> >> > >> > to handle at first. When the mapping is created to the actual h/w
> table
> >> > >> then,
> >> > >> > presumably, Xen's idea of state should correspond to QEMU's.
> >> > >>
> >> > >> But Xen doesn't see the guest view of config space,
> >> > >
> >> > > Well Xen interposes on every single config cycle so arguably it sees
> >> exactly
> >> > > what the guest sees.
> >> >
> >> > Ah, so you mean to snoop what qemu returns. Yes, that would be
> >> > an option.
> >> >
> >> > >> And additionally msixtbl_addr_to_desc() needs to know the physical
> >> > >> device.
> >> > >
> >> > > Yes, but msixtbl_range() could be trivially changed to accept any
> access
> >> > > where msixtbl_find_entry() returns non-NULL. That would allow
> >> > msixtbl_write()
> >> > > to manipulate entry->flags even if msixtbl_addr_to_desc() returns
> NULL.
> >> >
> >> > Are you looking at some old code base? There's no entry->flags
> >> > manipulation. We call guest_mask_msi_irq(), and for that we need
> >> > to know the IRQ descriptor, which in turn requires knowing the
> >> > pdev (for msixtbl_addr_to_desc() to return non-NULL).
> >>
> >> Ah, maybe I'm out of date. I haven't pulled for a day or so.
> >>
> >
> > I pulled staging and I still see (starting at line 300 in vmsi.c)
> >
> >     /* Exit to device model when unmasking and address/data got modified.
> */
> >     if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
> >          test_and_clear_bit(nr_entry, &entry->table_flags) )
> >     {
> >         v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
> >         goto out;
> >     }
> >
> >     msi_desc = msixtbl_addr_to_desc(entry, address);
> >     if ( !msi_desc || msi_desc->irq < 0 )
> >         goto out;
> >
> > I was wrong about the name. I meant 'entry->table_flags', and that's clearly
> > manipulated before calling msixtbl_addr_to_desc() so even if that returns
> > NULL Xen still keeps in sync with QEMU AFAICT.
> 
> Staying in sync with qemu is not the problem. The problem is that
> the re-invocation from msix_write_completion() then would get
> past this, and find said NULL returned from msixtbl_addr_to_desc().
> 

True, but perhaps if we know this is a completion cycle then we should always set r to X86EMUL_OKAY? TBH I'm not sure what would happen if we didn't... I suspect the emulation state machine would get upset.

  Paul

> Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors
  2016-04-01 14:58                   ` Paul Durrant
@ 2016-04-01 15:28                     ` Jan Beulich
  0 siblings, 0 replies; 14+ messages in thread
From: Jan Beulich @ 2016-04-01 15:28 UTC (permalink / raw)
  To: Paul Durrant
  Cc: Anthony Perard, Andrew Cooper, Keir (Xen.org),
	Stefano Stabellini, xen-devel

>>> On 01.04.16 at 16:58, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 01 April 2016 15:40
>> >>> On 01.04.16 at 16:27, <Paul.Durrant@citrix.com> wrote:
>> >> From: Paul Durrant
>> >> Sent: 01 April 2016 15:20
>> > I pulled staging and I still see (starting at line 300 in vmsi.c)
>> >
>> >     /* Exit to device model when unmasking and address/data got modified.
>> */
>> >     if ( !(val & PCI_MSIX_VECTOR_BITMASK) &&
>> >          test_and_clear_bit(nr_entry, &entry->table_flags) )
>> >     {
>> >         v->arch.hvm_vcpu.hvm_io.msix_unmask_address = address;
>> >         goto out;
>> >     }
>> >
>> >     msi_desc = msixtbl_addr_to_desc(entry, address);
>> >     if ( !msi_desc || msi_desc->irq < 0 )
>> >         goto out;
>> >
>> > I was wrong about the name. I meant 'entry->table_flags', and that's clearly
>> > manipulated before calling msixtbl_addr_to_desc() so even if that returns
>> > NULL Xen still keeps in sync with QEMU AFAICT.
>> 
>> Staying in sync with qemu is not the problem. The problem is that
>> the re-invocation from msix_write_completion() then would get
>> past this, and find said NULL returned from msixtbl_addr_to_desc().
> 
> True, but perhaps if we know this is a completion cycle then we should 
> always set r to X86EMUL_OKAY? TBH I'm not sure what would happen if we 
> didn't... I suspect the emulation state machine would get upset.

The question isn't what to set r to (msix_write_completion() only
logs a debug message whan that's not the case), but how to make
sure guest_mask_msi_irq() gets called.

Jan


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-04-01 15:28 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-01  9:15 [PATCH RFC] x86/vMSI-X: avoid missing first unmask of vectors Jan Beulich
2016-04-01  9:58 ` Jan Beulich
2016-04-01 10:56   ` Paul Durrant
2016-04-01 11:21     ` Jan Beulich
2016-04-01 13:01       ` Paul Durrant
2016-04-01 13:42         ` Jan Beulich
2016-04-01 13:54           ` Paul Durrant
2016-04-01 14:17             ` Jan Beulich
2016-04-01 14:19               ` Paul Durrant
2016-04-01 14:28                 ` Jan Beulich
2016-04-01 14:27               ` Paul Durrant
2016-04-01 14:40                 ` Jan Beulich
2016-04-01 14:58                   ` Paul Durrant
2016-04-01 15:28                     ` Jan Beulich

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).