All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
@ 2013-05-08  8:17 Zhenzhong Duan
  2013-05-08  9:39 ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Zhenzhong Duan @ 2013-05-08  8:17 UTC (permalink / raw)
  To: xen-devel; +Cc: Chien Yen, Feng Jin, Yuval Shaia, Konrad Rzeszutek Wilk

The accelerated MSI-X entry is initialized to zero when msixtbl_pt_register is
called. This doesn't match the value on the qemu side, even though the pirq may
already be mapped and bound on the qemu side. The kernel will get a wrong value
when reading the MSI-X info.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Tested-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 tools/libxc/xc_domain.c      |    7 ++++++-
 tools/libxc/xenctrl.h        |    4 +++-
 xen/arch/x86/hvm/vmsi.c      |   13 ++++++++++++-
 xen/drivers/passthrough/io.c |    3 ++-
 xen/include/public/domctl.h  |    2 ++
 xen/include/xen/pci.h        |    3 ++-
 6 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index bb71cca..f6fc8e4 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -1339,7 +1339,9 @@ int xc_domain_update_msi_irq(
     uint32_t gvec,
     uint32_t pirq,
     uint32_t gflags,
-    uint64_t gtable)
+    uint64_t gtable,
+    uint16_t entry_nr,
+    uint32_t msi_ad[3])
 {
     int rc;
     xen_domctl_bind_pt_irq_t *bind;
@@ -1356,6 +1358,9 @@ int xc_domain_update_msi_irq(
     bind->u.msi.gvec = gvec;
     bind->u.msi.gflags = gflags;
     bind->u.msi.gtable = gtable;
+    bind->u.msi.entry_nr = entry_nr;
+    if ( gtable )
+        memcpy(bind->u.msi.msi_ad, msi_ad, sizeof(uint32_t[3]));
 
     rc = do_domctl(xch, &domctl);
     return rc;
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 54a2d5a..f292443 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -1749,7 +1749,9 @@ int xc_domain_update_msi_irq(
     uint32_t gvec,
     uint32_t pirq,
     uint32_t gflags,
-    uint64_t gtable);
+    uint64_t gtable,
+    uint16_t entry_nr,
+    uint32_t msi_ad[3]);
 
 int xc_domain_unbind_msi_irq(xc_interface *xch,
                              uint32_t domid,
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 36de312..06ea324 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -168,6 +168,7 @@ struct msixtbl_entry
     struct { 
         uint32_t msi_ad[3];	/* Shadow of address low, high and data */
     } gentries[MAX_MSIX_ACC_ENTRIES];
+    unsigned long table_shadow[BITS_TO_LONGS(MAX_MSIX_ACC_ENTRIES)];
     struct rcu_head rcu;
 };
 
@@ -229,6 +230,9 @@ static int msixtbl_read(
         nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
         if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
             goto out;
+        if( !test_bit(nr_entry, entry->table_shadow) )
+            goto out;
+
         index = offset / sizeof(uint32_t);
         *pval = entry->gentries[nr_entry].msi_ad[index];
     }
@@ -361,7 +365,8 @@ static void del_msixtbl_entry(struct msixtbl_entry *entry)
     call_rcu(&entry->rcu, free_msixtbl_entry);
 }
 
-int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable)
+int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable,
+                        uint16_t entry_nr, uint32_t msi_ad[3])
 {
     struct irq_desc *irq_desc;
     struct msi_desc *msi_desc;
@@ -408,6 +413,12 @@ int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable)
 
 found:
     atomic_inc(&entry->refcnt);
+
+    if( entry_nr < MAX_MSIX_ACC_ENTRIES ) {
+        memcpy(entry->gentries[entry_nr].msi_ad, msi_ad, sizeof(uint32_t[3]));
+        set_bit(entry_nr, entry->table_shadow);
+    }
+
     spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
     r = 0;
 
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 13002c0..17cb8c2 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -153,7 +153,8 @@ int pt_irq_create_bind(
             rc = pirq_guest_bind(d->vcpu[0], info, 0);
             if ( rc == 0 && pt_irq_bind->u.msi.gtable )
             {
-                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable);
+                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable,
+                                         pt_irq_bind->u.msi.entry_nr, pt_irq_bind->u.msi.msi_ad);
                 if ( unlikely(rc) )
                     pirq_guest_unbind(d, info);
             }
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 4c5b2bb..4b160a0 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -504,6 +504,8 @@ struct xen_domctl_bind_pt_irq {
             uint8_t gvec;
             uint32_t gflags;
             uint64_aligned_t gtable;
+            uint16_t entry_nr;
+            uint32_t msi_ad[3];
         } msi;
     } u;
 };
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index ca72a99..d8e22a8 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -151,7 +151,8 @@ int pci_find_next_cap(u16 seg, u8 bus, unsigned int devfn, u8 pos, int cap);
 int pci_find_ext_capability(int seg, int bus, int devfn, int cap);
 
 struct pirq;
-int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
+int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable,
+                        uint16_t entry_nr, uint32_t msi_ad[3]);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);
 void msixtbl_pt_cleanup(struct domain *d);
 
-- 
1.7.3

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-08  8:17 [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall Zhenzhong Duan
@ 2013-05-08  9:39 ` Jan Beulich
  2013-05-08 10:00   ` Zhenzhong Duan
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2013-05-08  9:39 UTC (permalink / raw)
  To: zhenzhong.duan
  Cc: Chien Yen, Konrad Rzeszutek Wilk, Feng Jin, Yuval Shaia, xen-devel

>>> On 08.05.13 at 10:17, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> Accelerated msix entry is initialized to zero when msixtbl_pt_register is
> called. This doesn't match the value from qemu side, although pirq may 
> already
> be mapped and binded in qemu side. Kernel will get wrong value when reading
> msix info.
> 
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
> Tested-by: Yuval Shaia <yuval.shaia@oracle.com>

I appreciate this needing to change, but it is a no-go to expose an
implementation detail of the hypervisor (number of accelerated
entries being 3) through a hypercall interface (and even less so by
scattering around literal 3-s).

Please work towards a different solution, leaving the tool stack
agnostic to the number of accelerated entries. And if at all
possible, arrange for the patch to be split into tool stack and
hypervisor pieces, such that they can be applied independently
(and in either order).

Jan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-08  9:39 ` Jan Beulich
@ 2013-05-08 10:00   ` Zhenzhong Duan
  2013-05-08 12:03     ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Zhenzhong Duan @ 2013-05-08 10:00 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Chien Yen, Konrad Rzeszutek Wilk, Feng Jin, Yuval Shaia, xen-devel


On 2013-05-08 17:39, Jan Beulich wrote:
>>>> On 08.05.13 at 10:17, Zhenzhong Duan<zhenzhong.duan@oracle.com>  wrote:
>> Accelerated msix entry is initialized to zero when msixtbl_pt_register is
>> called. This doesn't match the value from qemu side, although pirq may
>> already
>> be mapped and binded in qemu side. Kernel will get wrong value when reading
>> msix info.
>>
>> Signed-off-by: Zhenzhong Duan<zhenzhong.duan@oracle.com>
>> Tested-by: Yuval Shaia<yuval.shaia@oracle.com>
> I appreciate this needing to change, but it is a no-go to expose an
> implementation detail of the hypervisor (number of accelerated
> entries being 3) trough a hypercall interface (and even less so by
> scattering around literal 3-s).
I presume you mean msi_ad[3]. msi_ad[3] is addr_lo, addr_high and data.
Not related to accelerated entries count.

or others?
> Please work towards a different solution, leaving the tool stack
> agnostic to the number of accelerated entries. And if at all
> possible, arrange for the patch to be split into tool stack and
> hypervisor pieces, such that they can be applied independently
> (and in either order).
sure, will do it after above question is clear.

Regards
zduan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-08 10:00   ` Zhenzhong Duan
@ 2013-05-08 12:03     ` Jan Beulich
  2013-05-09  3:02       ` Zhenzhong Duan
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2013-05-08 12:03 UTC (permalink / raw)
  To: zhenzhong.duan
  Cc: Chien Yen, Konrad Rzeszutek Wilk, Feng Jin, Yuval Shaia, xen-devel

>>> On 08.05.13 at 12:00, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:

> On 2013-05-08 17:39, Jan Beulich wrote:
>>>>> On 08.05.13 at 10:17, Zhenzhong Duan<zhenzhong.duan@oracle.com>  wrote:
>>> Accelerated msix entry is initialized to zero when msixtbl_pt_register is
>>> called. This doesn't match the value from qemu side, although pirq may
>>> already
>>> be mapped and binded in qemu side. Kernel will get wrong value when reading
>>> msix info.
>>>
>>> Signed-off-by: Zhenzhong Duan<zhenzhong.duan@oracle.com>
>>> Tested-by: Yuval Shaia<yuval.shaia@oracle.com>
>> I appreciate this needing to change, but it is a no-go to expose an
>> implementation detail of the hypervisor (number of accelerated
>> entries being 3) trough a hypercall interface (and even less so by
>> scattering around literal 3-s).
> I presume you mean msi_ad[3]. msi_ad[3] is addr_lo, addr_high and data.
> Not related to accelerated entries count.
> 
> or others?

Oh, right you are. But then nevertheless give this meaningful
names in the hypercall interface (e.g. addr_lo, addr_hi, and data,
or just [64-bit] addr and [32-bit] data) rather than following the
bad practice in vmsi.c.

>> Please work towards a different solution, leaving the tool stack
>> agnostic to the number of accelerated entries. And if at all
>> possible, arrange for the patch to be split into tool stack and
>> hypervisor pieces, such that they can be applied independently
>> (and in either order).
> sure, will do it after above question is clear.

With the above it's going to be difficult to split the two pieces.

But of course I still don't really understand why all of the sudden
this needs to be passed in rather than being under the full control
of the hypervisor at all times. Perhaps this is related to me not
understanding why the kernel would read these values at all:
There's no other place in the kernel where the message would
be read before first getting written (in fact, apart from the
use of __read_msi_msg() by the Xen code, there's only one
other user under arch/powerpc/, and there - according to the
accompanying comment - this is just to save away the data for
later use during resume).

Jan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-08 12:03     ` Jan Beulich
@ 2013-05-09  3:02       ` Zhenzhong Duan
  2013-05-09 19:05         ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Zhenzhong Duan @ 2013-05-09  3:02 UTC (permalink / raw)
  To: Jan Beulich
  Cc: Chien Yen, Konrad Rzeszutek Wilk, Feng Jin, Yuval Shaia, xen-devel

On 2013/5/8 20:03, Jan Beulich wrote:
>>>> On 08.05.13 at 12:00, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-05-08 17:39, Jan Beulich wrote:
>>>>>> On 08.05.13 at 10:17, Zhenzhong Duan<zhenzhong.duan@oracle.com>  wrote:
>>>> Accelerated msix entry is initialized to zero when msixtbl_pt_register is
>>>> called. This doesn't match the value from qemu side, although pirq may
>>>> already
>>>> be mapped and binded in qemu side. Kernel will get wrong value when reading
>>>> msix info.
>>>>
>>>> Signed-off-by: Zhenzhong Duan<zhenzhong.duan@oracle.com>
>>>> Tested-by: Yuval Shaia<yuval.shaia@oracle.com>
>>> I appreciate this needing to change, but it is a no-go to expose an
>>> implementation detail of the hypervisor (number of accelerated
>>> entries being 3) trough a hypercall interface (and even less so by
>>> scattering around literal 3-s).
>> I presume you mean msi_ad[3]. msi_ad[3] is addr_lo, addr_high and data.
>> Not related to accelerated entries count.
>>
>> or others?
> Oh, right you are. But then nevertheless give this meaningful
> names in the hypercall interface (e.g. addr_lo, addr_hi, and data,
> or just [64-bit] addr and [32-bit] data) rather than following the
> bad practice in vmsi.c.
>
>>> Please work towards a different solution, leaving the tool stack
>>> agnostic to the number of accelerated entries. And if at all
>>> possible, arrange for the patch to be split into tool stack and
>>> hypervisor pieces, such that they can be applied independently
>>> (and in either order).
>> sure, will do it after above question is clear.
> With the above it's going to be difficult to split the two pieces.
so, only change to a meaningful names without split patch, right?
>
> But of course I still don't really understand why all of the sudden
> this needs to be passed in rather than being under the full control
> of the hypervisor at all times. Perhaps this is related to me not
> understanding why the kernel would read these values at all:
> There's no other place in the kernel where the message would
> be read before first getting written (in fact, apart from the
> use of __read_msi_msg() by the Xen code, there's only one
> other user under arch/powerpc/, and there - according to the
> accompanying comment - this is just to save away the data for
> later use during resume).
There is a bug if msi_ad is not passed in.

when driver first load,

kernel.__read_msi_msg()
        (got all zero)
kernel.__write_msi_msg(pirq)
        (ioreq passed to qemu as no msixtbl_entry established yet)
qemu.pt_msi_update_one()
        xc_domain_update_msi_irq()
             (msixtbl_entry dynamically allocated with msi_ad all zero)

then driver unload,
...
driver load again,

kernel.__read_msi_msg()
        (got all zero from xen as accelerated entry just established with all zero)
qemu.__write_msi_msg(a new pirq)

pirqs would then be exhausted, or the map and bind would fail.

zduan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-09  3:02       ` Zhenzhong Duan
@ 2013-05-09 19:05         ` Jan Beulich
  2013-05-10  2:49           ` Zhenzhong Duan
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2013-05-09 19:05 UTC (permalink / raw)
  To: zhenzhong.duan; +Cc: chien.yen, konrad.wilk, joe.jin, yuval.shaia, xen-devel

>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>On 2013/5/8 20:03, Jan Beulich wrote:
>> But of course I still don't really understand why all of the sudden
>> this needs to be passed in rather than being under the full control
>> of the hypervisor at all times. Perhaps this is related to me not
>> understanding why the kernel would read these values at all:
>> There's no other place in the kernel where the message would
>> be read before first getting written (in fact, apart from the
>> use of __read_msi_msg() by the Xen code, there's only one
>> other user under arch/powerpc/, and there - according to the
>> accompanying comment - this is just to save away the data for
>> later use during resume).
>There is a bug if msi_ad is not passed in.
>
>when driver first load,
>
>kernel.__read_msi_msg()
>(got all zero)

But you don't even comment on the apparently bogus use of the function here.

>kernel.__write_msi_msg(pirq)
>(ioreq passed to qemu as no msixtbl_entry established yet)
>qemu.pt_msi_update_one()
>xc_domain_update_msi_irq()
>(msixtbl_entry dynamicly allocated with msi_ad all zero)
>
>then driver unload,
>...
>driver load again,
>
>kernel.__read_msi_msg()
>(got all zero from xen as accelerated entry just established with all zero)

If all zeroes get returned, why would the flow here be different than above?

>qemu.__write_msi_msg(a new pirq)
>
>pirq would exhaust or fail to map and bind.

I'm afraid your replies are more confusing to me than clarifying...

Jan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-09 19:05         ` Jan Beulich
@ 2013-05-10  2:49           ` Zhenzhong Duan
  2013-05-10  6:37             ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Zhenzhong Duan @ 2013-05-10  2:49 UTC (permalink / raw)
  To: Jan Beulich; +Cc: chien.yen, konrad.wilk, joe.jin, yuval.shaia, xen-devel


On 2013-05-10 03:05, Jan Beulich wrote:
>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>> On 2013/5/8 20:03, Jan Beulich wrote:
>>> But of course I still don't really understand why all of the sudden
>>> this needs to be passed in rather than being under the full control
>>> of the hypervisor at all times. Perhaps this is related to me not
>>> understanding why the kernel would read these values at all:
>>> There's no other place in the kernel where the message would
>>> be read before first getting written (in fact, apart from the
>>> use of __read_msi_msg() by the Xen code, there's only one
>>> other user under arch/powerpc/, and there - according to the
>>> accompanying comment - this is just to save away the data for
>>> later use during resume).
>> There is a bug if msi_ad is not passed in.
>>
>> when driver first load,
>>
>> kernel.__read_msi_msg()
>> (got all zero)
> But you don't even comment on the apparently bogus use of the function here.
This pattern is used only when hvm_pirq is enabled. The kernel needs to check
XEN_PIRQ_MSI_DATA.
It's not an issue if the data is 0 at first driver load; the kernel will call
__write_msi_msg with the pirq and XEN_PIRQ_MSI_DATA set.
>
>> kernel.__write_msi_msg(pirq)
>> (ioreq passed to qemu as no msixtbl_entry established yet)
>> qemu.pt_msi_update_one()
>> xc_domain_update_msi_irq()
>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>
>> then driver unload,
>> ...
>> driver load again,
>>
>> kernel.__read_msi_msg()
>> (got all zero from xen as accelerated entry just established with all zero)
> If all zeroes get returned, why would the flow here be different then above?
Because the pirq and the related mapping and binding are not freed between
driver load-unload-load. They are freed when the device is detached.
We should try to reuse the last pirq.

Regards
zduan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-10  2:49           ` Zhenzhong Duan
@ 2013-05-10  6:37             ` Jan Beulich
  2013-05-10  7:39               ` Zhenzhong Duan
  0 siblings, 1 reply; 12+ messages in thread
From: Jan Beulich @ 2013-05-10  6:37 UTC (permalink / raw)
  To: zhenzhong.duan; +Cc: chien.yen, konrad.wilk, joe.jin, yuval.shaia, xen-devel

>>> On 10.05.13 at 04:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:

> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>> But of course I still don't really understand why all of the sudden
>>>> this needs to be passed in rather than being under the full control
>>>> of the hypervisor at all times. Perhaps this is related to me not
>>>> understanding why the kernel would read these values at all:
>>>> There's no other place in the kernel where the message would
>>>> be read before first getting written (in fact, apart from the
>>>> use of __read_msi_msg() by the Xen code, there's only one
>>>> other user under arch/powerpc/, and there - according to the
>>>> accompanying comment - this is just to save away the data for
>>>> later use during resume).
>>> There is a bug if msi_ad is not passed in.
>>>
>>> when driver first load,
>>>
>>> kernel.__read_msi_msg()
>>> (got all zero)
>> But you don't even comment on the apparently bogus use of the function here.
> This pattern is used only when hvm_pirq is enabled. kernel need to check 
> XEN_PIRQ_MSI_DATA.
> It's not a issue if data is 0 at first driver load, kernel will call 
> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.

But this doesn't make the use of __read_msi_msg() less bogus. It's
not clear on what basis this mechanism got invented in the first
place.

>>> kernel.__write_msi_msg(pirq)
>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>> qemu.pt_msi_update_one()
>>> xc_domain_update_msi_irq()
>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>
>>> then driver unload,
>>> ...
>>> driver load again,
>>>
>>> kernel.__read_msi_msg()
>>> (got all zero from xen as accelerated entry just established with all zero)
>> If all zeroes get returned, why would the flow here be different then above?
> Because pirq and related mapping and binding are not freed between 
> driver load-unload-load. They are freed when device detach.
> We should try to use the last pirq.

But then you need to solve the problem generically, i.e. not just
for the driver reload case, but also for e.g. the kexec one (where
__read_msi_msg() returning other than all zeros wouldn't help you
as xen_irq_from_pirq() would then return -1, and you'd be back to
the same problem). IOW I think the prior IRQ needs to be freed
anyway rather than an attempt be made to reuse it.

Jan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-10  6:37             ` Jan Beulich
@ 2013-05-10  7:39               ` Zhenzhong Duan
  2013-05-10  7:55                 ` Jan Beulich
  0 siblings, 1 reply; 12+ messages in thread
From: Zhenzhong Duan @ 2013-05-10  7:39 UTC (permalink / raw)
  To: Jan Beulich; +Cc: chien.yen, konrad.wilk, joe.jin, yuval.shaia, xen-devel


On 2013-05-10 14:37, Jan Beulich wrote:
>>>> On 10.05.13 at 04:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>>> But of course I still don't really understand why all of the sudden
>>>>> this needs to be passed in rather than being under the full control
>>>>> of the hypervisor at all times. Perhaps this is related to me not
>>>>> understanding why the kernel would read these values at all:
>>>>> There's no other place in the kernel where the message would
>>>>> be read before first getting written (in fact, apart from the
>>>>> use of __read_msi_msg() by the Xen code, there's only one
>>>>> other user under arch/powerpc/, and there - according to the
>>>>> accompanying comment - this is just to save away the data for
>>>>> later use during resume).
>>>> There is a bug if msi_ad is not passed in.
>>>>
>>>> when driver first load,
>>>>
>>>> kernel.__read_msi_msg()
>>>> (got all zero)
>>> But you don't even comment on the apparently bogus use of the function here.
>> This pattern is used only when hvm_pirq is enabled. kernel need to check
>> XEN_PIRQ_MSI_DATA.
>> It's not a issue if data is 0 at first driver load, kernel will call
>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
> But this doesn't make the use of __read_msi_msg() less bogus. It's
> not clear on what basis this mechanism got invented in the first
> place.
It has been there since hvm_irq was introduced. But it does work.
>
>>>> kernel.__write_msi_msg(pirq)
>>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>>> qemu.pt_msi_update_one()
>>>> xc_domain_update_msi_irq()
>>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>>
>>>> then driver unload,
>>>> ...
>>>> driver load again,
>>>>
>>>> kernel.__read_msi_msg()
>>>> (got all zero from xen as accelerated entry just established with all zero)
>>> If all zeroes get returned, why would the flow here be different then above?
>> Because pirq and related mapping and binding are not freed between
>> driver load-unload-load. They are freed when device detach.
>> We should try to use the last pirq.
> But then you need to solve the problem generically, i.e. not just
> for the driver reload case, but also for e.g. the kexec one (where
> __read_msi_msg() returning other than all zeros wouldn't help you
> as xen_irq_from_pirq() would then return -1, and you'd be back to
> the same problem.
No, not only the kexec case; it's driver unload that makes xen_irq_from_pirq
return -1. So there is also a bug on the kernel side.
I have sent a patch for the kernel. I think you missed it.
http://www.gossamer-threads.com/lists/xen/devel/281498
> IOW I think the prior IRQ needs to be freed
> anyway rather than an attempt be made to reuse it.
I did think about this idea, but when to free the pirq is a problem.
On driver unload? qemu has no way of knowing whether the driver was unloaded.
When the msix entry is masked? The kernel masks and unmasks msix entries
intermittently, especially when irqbalance is enabled.

So based on the above, I think it's better to reuse the same pirq and only free it
when the device is detached.

Regards
zduan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-10  7:39               ` Zhenzhong Duan
@ 2013-05-10  7:55                 ` Jan Beulich
  2013-05-10  8:22                   ` Zhenzhong Duan
  2013-05-10 19:03                   ` Is: Telling QEMU to re-use PIRQ value Was: " Konrad Rzeszutek Wilk
  0 siblings, 2 replies; 12+ messages in thread
From: Jan Beulich @ 2013-05-10  7:55 UTC (permalink / raw)
  To: zhenzhong.duan; +Cc: chien.yen, konrad.wilk, joe.jin, yuval.shaia, xen-devel

>>> On 10.05.13 at 09:39, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:

> On 2013-05-10 14:37, Jan Beulich wrote:
>>>>> On 10.05.13 at 04:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>>> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>>>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>>>> But of course I still don't really understand why all of the sudden
>>>>>> this needs to be passed in rather than being under the full control
>>>>>> of the hypervisor at all times. Perhaps this is related to me not
>>>>>> understanding why the kernel would read these values at all:
>>>>>> There's no other place in the kernel where the message would
>>>>>> be read before first getting written (in fact, apart from the
>>>>>> use of __read_msi_msg() by the Xen code, there's only one
>>>>>> other user under arch/powerpc/, and there - according to the
>>>>>> accompanying comment - this is just to save away the data for
>>>>>> later use during resume).
>>>>> There is a bug if msi_ad is not passed in.
>>>>>
>>>>> when driver first load,
>>>>>
>>>>> kernel.__read_msi_msg()
>>>>> (got all zero)
>>>> But you don't even comment on the apparently bogus use of the function here.
>>> This pattern is used only when hvm_pirq is enabled. kernel need to check
>>> XEN_PIRQ_MSI_DATA.
>>> It's not a issue if data is 0 at first driver load, kernel will call
>>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
>> But this doesn't make the use of __read_msi_msg() less bogus. It's
>> not clear on what basis this mechanism got invented in the first
>> place.
> It's there since hvm_irq introduced. But it works indeed.

But that doesn't in any way mean the concept is sound.

>>>>> kernel.__write_msi_msg(pirq)
>>>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>>>> qemu.pt_msi_update_one()
>>>>> xc_domain_update_msi_irq()
>>>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>>>
>>>>> then driver unload,
>>>>> ...
>>>>> driver load again,
>>>>>
>>>>> kernel.__read_msi_msg()
>>>>> (got all zero from xen as accelerated entry just established with all zero)
>>>> If all zeroes get returned, why would the flow here be different then above?
>>> Because pirq and related mapping and binding are not freed between
>>> driver load-unload-load. They are freed when device detach.
>>> We should try to use the last pirq.
>> But then you need to solve the problem generically, i.e. not just
>> for the driver reload case, but also for e.g. the kexec one (where
>> __read_msi_msg() returning other than all zeros wouldn't help you
>> as xen_irq_from_pirq() would then return -1, and you'd be back to
>> the same problem.
> No, not only kexec ones, it's driver unload that makes xen_irq_from_pirq 
> return -1. So there is also a bug in kernel side.
> I have sent a patch about kernel. I think you miss it.
> http://www.gossamer-threads.com/lists/xen/devel/281498 
>> IOW I think the prior IRQ needs to be freed
>> anyway rather than an attempt be made to reuse it.
> I have ever thought about this idea, but when to free the pirq is a problem.
> When driver unload? qemu has no idea of if driver unloaded.

But the kernel does, and hence could deal with this. As much as
the setup is being done when the driver gets loaded, cleanup
should be done when the driver gets unloaded. _If_ there
already is such an odd protocol between kernel and qemu, then
if that can't be dropped, it surely can be leveraged to also deal
with the cleanup side of things? No need to fiddle with the
hypervisor interfaces for something that it's not supposed to
know about anyway.

> When msix entry masked? kernel mask and unmask msix entry 
> intermittently, especially when irqbalance enabled.
> 
> So based on above, I think it's better to reuse same pirq, only free it 
> when device detached.

I continue to disagree. Also from a theoretical perspective - if you
have a lot of devices that no driver is loaded for, you'd keep a lot
of IRQs allocated without any need.

Jan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-10  7:55                 ` Jan Beulich
@ 2013-05-10  8:22                   ` Zhenzhong Duan
  2013-05-10 19:03                   ` Is: Telling QEMU to re-use PIRQ value Was: " Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 12+ messages in thread
From: Zhenzhong Duan @ 2013-05-10  8:22 UTC (permalink / raw)
  To: Jan Beulich; +Cc: chien.yen, konrad.wilk, joe.jin, yuval.shaia, xen-devel


On 2013-05-10 15:55, Jan Beulich wrote:
>>>> On 10.05.13 at 09:39, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-05-10 14:37, Jan Beulich wrote:
>>>>>> On 10.05.13 at 04:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>>>> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>>>>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>>>>> But of course I still don't really understand why all of the sudden
>>>>>>> this needs to be passed in rather than being under the full control
>>>>>>> of the hypervisor at all times. Perhaps this is related to me not
>>>>>>> understanding why the kernel would read these values at all:
>>>>>>> There's no other place in the kernel where the message would
>>>>>>> be read before first getting written (in fact, apart from the
>>>>>>> use of __read_msi_msg() by the Xen code, there's only one
>>>>>>> other user under arch/powerpc/, and there - according to the
>>>>>>> accompanying comment - this is just to save away the data for
>>>>>>> later use during resume).
>>>>>> There is a bug if msi_ad is not passed in.
>>>>>>
>>>>>> when driver first load,
>>>>>>
>>>>>> kernel.__read_msi_msg()
>>>>>> (got all zero)
>>>>> But you don't even comment on the apparently bogus use of the function here.
>>>> This pattern is used only when hvm_pirq is enabled. kernel need to check
>>>> XEN_PIRQ_MSI_DATA.
>>>> It's not a issue if data is 0 at first driver load, kernel will call
>>>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
>>> But this doesn't make the use of __read_msi_msg() less bogus. It's
>>> not clear on what basis this mechanism got invented in the first
>>> place.
>> It's there since hvm_irq introduced. But it works indeed.
> But that doesn't in any way mean the concept is sound.
>
>>>>>> kernel.__write_msi_msg(pirq)
>>>>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>>>>> qemu.pt_msi_update_one()
>>>>>> xc_domain_update_msi_irq()
>>>>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>>>>
>>>>>> then driver unload,
>>>>>> ...
>>>>>> driver load again,
>>>>>>
>>>>>> kernel.__read_msi_msg()
>>>>>> (got all zero from xen as accelerated entry just established with all zero)
>>>>> If all zeroes get returned, why would the flow here be different then above?
>>>> Because pirq and related mapping and binding are not freed between
>>>> driver load-unload-load. They are freed when device detach.
>>>> We should try to use the last pirq.
>>> But then you need to solve the problem generically, i.e. not just
>>> for the driver reload case, but also for e.g. the kexec one (where
>>> __read_msi_msg() returning other than all zeros wouldn't help you
>>> as xen_irq_from_pirq() would then return -1, and you'd be back to
>>> the same problem.
>> No, not only kexec ones, it's driver unload that makes xen_irq_from_pirq
>> return -1. So there is also a bug in kernel side.
>> I have sent a patch about kernel. I think you miss it.
>> http://www.gossamer-threads.com/lists/xen/devel/281498
>>> IOW I think the prior IRQ needs to be freed
>>> anyway rather than an attempt be made to reuse it.
>> I have ever thought about this idea, but when to free the pirq is a problem.
>> When driver unload? qemu has no idea of if driver unloaded.
> But the kernel does, and hence could deal with this. As much as
> the setup is being done when the driver gets loaded, cleanup
> should be done when the driver gets unloaded. _If_ there
> already is such an odd protocol between kernel and qemu, then
> if that can't be dropped, it surely can be leveraged to also deal
> with the cleanup side of things? No need to fiddle with the
> hypervisor interfaces for something that it's not supposed to
> know about anyway.
But I doubt whether domU is authorized to call the unmap and unbind
hypercalls.
I looked at the kernel code; only dom0 does that.
>
>> When msix entry masked? kernel mask and unmask msix entry
>> intermittently, especially when irqbalance enabled.
>>
>> So based on above, I think it's better to reuse same pirq, only free it
>> when device detached.
> I continue to disagree. Also from a theoretical perspective - if you
> have a lot of devices that no driver is loaded for, you'd keep a lot
> of IRQs allocated without any need.
Sounds right, but why would you pass through those devices and not use
them? You will eventually use them.
For drivers that reload often, this pattern will save some of the time
spent on mapping and binding.
Also, both Xen and the kernel have the ability to allocate enough IRQs for
each device.
If no driver is loaded for an IRQ, the interrupt will not be triggered,
so there is no impact on the whole system.

Regards
zduan

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Is: Telling QEMU to re-use PIRQ value Was: Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall
  2013-05-10  7:55                 ` Jan Beulich
  2013-05-10  8:22                   ` Zhenzhong Duan
@ 2013-05-10 19:03                   ` Konrad Rzeszutek Wilk
  1 sibling, 0 replies; 12+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-05-10 19:03 UTC (permalink / raw)
  To: Jan Beulich, Stefano Stabellini
  Cc: chien.yen, joe.jin, zhenzhong.duan, yuval.shaia, xen-devel

On Fri, May 10, 2013 at 08:55:46AM +0100, Jan Beulich wrote:
> >>> On 10.05.13 at 09:39, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> 
> > On 2013-05-10 14:37, Jan Beulich wrote:
> >>>>> On 10.05.13 at 04:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> >>> On 2013-05-10 03:05, Jan Beulich wrote:
> >>>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
> >>>>> On 2013/5/8 20:03, Jan Beulich wrote:
> >>>>>> But of course I still don't really understand why all of the sudden
> >>>>>> this needs to be passed in rather than being under the full control
> >>>>>> of the hypervisor at all times. Perhaps this is related to me not
> >>>>>> understanding why the kernel would read these values at all:
> >>>>>> There's no other place in the kernel where the message would
> >>>>>> be read before first getting written (in fact, apart from the
> >>>>>> use of __read_msi_msg() by the Xen code, there's only one
> >>>>>> other user under arch/powerpc/, and there - according to the
> >>>>>> accompanying comment - this is just to save away the data for
> >>>>>> later use during resume).
> >>>>> There is a bug if msi_ad is not passed in.
> >>>>>
> >>>>> when driver first load,
> >>>>>
> >>>>> kernel.__read_msi_msg()
> >>>>> (got all zero)
> >>>> But you don't even comment on the apparently bogus use of the function here.
> >>> This pattern is used only when hvm_pirq is enabled. kernel need to check
> >>> XEN_PIRQ_MSI_DATA.
> >>> It's not a issue if data is 0 at first driver load, kernel will call
> >>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
> >> But this doesn't make the use of __read_msi_msg() less bogus. It's
> >> not clear on what basis this mechanism got invented in the first
> >> place.
> > It's there since hvm_irq introduced. But it works indeed.
> 
> But that doesn't in any way mean the concept is sound.
> 
> >>>>> kernel.__write_msi_msg(pirq)
> >>>>> (ioreq passed to qemu as no msixtbl_entry established yet)
> >>>>> qemu.pt_msi_update_one()
> >>>>> xc_domain_update_msi_irq()
> >>>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
> >>>>>
> >>>>> then driver unload,
> >>>>> ...
> >>>>> driver load again,
> >>>>>
> >>>>> kernel.__read_msi_msg()
> >>>>> (got all zero from xen as accelerated entry just established with all zero)
> >>>> If all zeroes get returned, why would the flow here be different then above?
> >>> Because pirq and related mapping and binding are not freed between
> >>> driver load-unload-load. They are freed when device detach.
> >>> We should try to use the last pirq.
> >> But then you need to solve the problem generically, i.e. not just
> >> for the driver reload case, but also for e.g. the kexec one (where
> >> __read_msi_msg() returning other than all zeros wouldn't help you
> >> as xen_irq_from_pirq() would then return -1, and you'd be back to
> >> the same problem.
> > No, not only kexec ones, it's driver unload that makes xen_irq_from_pirq 
> > return -1. So there is also a bug in kernel side.
> > I have sent a patch about kernel. I think you miss it.
> > http://www.gossamer-threads.com/lists/xen/devel/281498 
> >> IOW I think the prior IRQ needs to be freed
> >> anyway rather than an attempt be made to reuse it.
> > I have ever thought about this idea, but when to free the pirq is a problem.
> > When driver unload? qemu has no idea of if driver unloaded.
> 
> But the kernel does, and hence could deal with this. As much as
> the setup is being done when the driver gets loaded, cleanup
> should be done when the driver gets unloaded. _If_ there
> already is such an odd protocol between kernel and qemu, then
> if that can't be dropped, it surely can be leveraged to also deal
> with the cleanup side of things? No need to fiddle with the

I don't know if such a thing exists. Stefano, is there a way
to tell QEMU to re-use the PIRQ? Writing zero to the MSI?

> hypervisor interfaces for something that it's not supposed to
> know about anyway.
> 
> > When msix entry masked? kernel mask and unmask msix entry 
> > intermittently, especially when irqbalance enabled.
> > 
> > So based on above, I think it's better to reuse same pirq, only free it 
> > when device detached.
> 
> I continue to disagree. Also from a theoretical perspective - if you
> have a lot of devices that no driver is loaded for, you'd keep a lot
> of IRQs allocated without any need.

The guest has to use PHYSDEVOP_get_free_pirq to allocate it. And 
in this case we don't have a 'free_pirq' hypercall to release it. 

The Linux Xen<->IRQ drivers drop all of the information they have on
the PIRQ once the driver is unloaded (rightly so - afterwards the driver
does not need the IRQ anymore and the PIRQ<->events connection has
been broken).

I wrote a tiny patch that needs improvements that would cache the
last seen BDF and PIRQ (that part is missing). That would allow us
to re-use the PIRQ and not call PHYSDEVOP_get_free_pirq until we
exhaust the allocation we have.

In other words - this can be fixed in the kernel.

But if there is a 'magic' value that can be written to QEMU to tell
it to re-use the PIRQ.. that would be good too.
> 
> Jan

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2013-05-10 19:03 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-08  8:17 [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall Zhenzhong Duan
2013-05-08  9:39 ` Jan Beulich
2013-05-08 10:00   ` Zhenzhong Duan
2013-05-08 12:03     ` Jan Beulich
2013-05-09  3:02       ` Zhenzhong Duan
2013-05-09 19:05         ` Jan Beulich
2013-05-10  2:49           ` Zhenzhong Duan
2013-05-10  6:37             ` Jan Beulich
2013-05-10  7:39               ` Zhenzhong Duan
2013-05-10  7:55                 ` Jan Beulich
2013-05-10  8:22                   ` Zhenzhong Duan
2013-05-10 19:03                   ` Is: Telling QEMU to re-use PIRQ value Was: " Konrad Rzeszutek Wilk

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.