All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/4 v11] MSI-X MMIO support for KVM
@ 2011-02-28  7:20 Sheng Yang
  2011-02-28  7:20 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Sheng Yang @ 2011-02-28  7:20 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

Change from v9:
Update according to the comments of Alex and Michael.

Notice this patchset is still based on 2.6.37, due to a blocking bug with
assigned devices in the current upstream.

Sheng Yang (4):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Add kvm_io_ext_data to IO handler
  KVM: Emulate MSI-X table in kernel
  KVM: Add documents for MSI-X MMIO API

 Documentation/kvm/api.txt       |   58 ++++++++
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/i8254.c            |    6 +-
 arch/x86/kvm/i8259.c            |    3 +-
 arch/x86/kvm/lapic.c            |    3 +-
 arch/x86/kvm/mmu.c              |    2 +
 arch/x86/kvm/x86.c              |   51 +++++--
 include/linux/kvm.h             |   28 ++++
 include/linux/kvm_host.h        |   66 +++++++++-
 virt/kvm/assigned-dev.c         |   44 ++++++
 virt/kvm/coalesced_mmio.c       |    3 +-
 virt/kvm/eventfd.c              |    2 +-
 virt/kvm/ioapic.c               |    2 +-
 virt/kvm/iodev.h                |   31 +----
 virt/kvm/kvm_main.c             |   40 +++++-
 virt/kvm/msix_mmio.c            |  301 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h            |   25 ++++
 18 files changed, 616 insertions(+), 52 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h
  2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
@ 2011-02-28  7:20 ` Sheng Yang
  2011-02-28  7:20 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: Sheng Yang @ 2011-02-28  7:20 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

Then it can be used by other structs in kvm_host.h

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler
  2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
  2011-02-28  7:20 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
@ 2011-02-28  7:20 ` Sheng Yang
  2011-02-28  7:20 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
  2011-02-28  7:20 ` [PATCH 4/4] KVM: Add documents for MSI-X MMIO API Sheng Yang
  3 siblings, 0 replies; 10+ messages in thread
From: Sheng Yang @ 2011-02-28  7:20 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

Add a new parameter to the IO write handler, so that we can transfer information
from the IO handler back to the caller.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/kvm/i8254.c      |    6 ++++--
 arch/x86/kvm/i8259.c      |    3 ++-
 arch/x86/kvm/lapic.c      |    3 ++-
 arch/x86/kvm/x86.c        |   13 ++++++++-----
 include/linux/kvm_host.h  |    9 +++++++--
 virt/kvm/coalesced_mmio.c |    3 ++-
 virt/kvm/eventfd.c        |    2 +-
 virt/kvm/ioapic.c         |    2 +-
 virt/kvm/iodev.h          |    6 ++++--
 virt/kvm/kvm_main.c       |    4 ++--
 10 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad723..bd8f0c5 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
 }
 
 static int pit_ioport_write(struct kvm_io_device *this,
-			    gpa_t addr, int len, const void *data)
+			    gpa_t addr, int len, const void *data,
+			    struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_pit *pit = dev_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
@@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
 }
 
 static int speaker_ioport_write(struct kvm_io_device *this,
-				gpa_t addr, int len, const void *data)
+				gpa_t addr, int len, const void *data,
+				struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05..96b1070 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
 }
 
 static int picdev_write(struct kvm_io_device *this,
-			 gpa_t addr, int len, const void *val)
+			 gpa_t addr, int len, const void *val,
+			 struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_pic *s = to_pic(this);
 	unsigned char data = *(unsigned char *)val;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0..f413e9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 }
 
 static int apic_mmio_write(struct kvm_io_device *this,
-			    gpa_t address, int len, const void *data)
+			    gpa_t address, int len, const void *data,
+			    struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_lapic *apic = to_lapic(this);
 	unsigned int offset = address - apic->base_address;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..21b84e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
-			   const void *v)
+			   const void *v, struct kvm_io_ext_data *ext_data)
 {
 	if (vcpu->arch.apic &&
-	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v, ext_data))
 		return 0;
 
-	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS,
+				addr, len, v, ext_data);
 }
 
 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	struct kvm_io_ext_data ext_data;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3825,7 +3827,7 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
@@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
 	/* TODO: String I/O for in kernel device */
 	int r;
+	struct kvm_io_ext_data ext_data;
 
 	if (vcpu->arch.pio.in)
 		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
@@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 	else
 		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
 				     vcpu->arch.pio.port, vcpu->arch.pio.size,
-				     pd);
+				     pd, &ext_data);
 	return r;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..a32c53e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,8 +68,12 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
+struct kvm_io_ext_data {
+	int type;   /* Extended data type */
+};
+
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		     int len, const void *val);
+		     int len, const void *val, struct kvm_io_ext_data *data);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -113,7 +117,8 @@ struct kvm_io_device_ops {
 	int (*write)(struct kvm_io_device *this,
 		     gpa_t addr,
 		     int len,
-		     const void *val);
+		     const void *val,
+		     struct kvm_io_ext_data *data);
 	void (*destructor)(struct kvm_io_device *this);
 };
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index fc84875..37b254c 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -59,7 +59,8 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
 }
 
 static int coalesced_mmio_write(struct kvm_io_device *this,
-				gpa_t addr, int len, const void *val)
+				gpa_t addr, int len, const void *val,
+				struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2ca4535..8edd757 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -483,7 +483,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 /* MMIO/PIO writes trigger an event if the addr/val match */
 static int
 ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
-		const void *val)
+		const void *val, struct kvm_io_ext_data *ext_data)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 0b9df83..6a027ef 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -321,7 +321,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 }
 
 static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-			     const void *val)
+			     const void *val, struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
 	u32 data;
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index d1f5651..340ab79 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -33,9 +33,11 @@ static inline int kvm_iodevice_read(struct kvm_io_device *dev,
 }
 
 static inline int kvm_iodevice_write(struct kvm_io_device *dev,
-				     gpa_t addr, int l, const void *v)
+				     gpa_t addr, int l, const void *v,
+				     struct kvm_io_ext_data *data)
 {
-	return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
+	return dev->ops->write ?
+		dev->ops->write(dev, addr, l, v, data) : -EOPNOTSUPP;
 }
 
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1b6cbb..a61f90e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2221,14 +2221,14 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		     int len, const void *val)
+		     int len, const void *val, struct kvm_io_ext_data *ext_data)
 {
 	int i;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
 			return 0;
 	return -EOPNOTSUPP;
 }
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
  2011-02-28  7:20 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
  2011-02-28  7:20 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
@ 2011-02-28  7:20 ` Sheng Yang
  2011-02-28 11:27   ` Michael S. Tsirkin
  2011-02-28  7:20 ` [PATCH 4/4] KVM: Add documents for MSI-X MMIO API Sheng Yang
  3 siblings, 1 reply; 10+ messages in thread
From: Sheng Yang @ 2011-02-28  7:20 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

This allows us to support the mask bit operation of assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/mmu.c              |    2 +
 arch/x86/kvm/x86.c              |   40 ++++-
 include/linux/kvm.h             |   28 ++++
 include/linux/kvm_host.h        |   36 +++++
 virt/kvm/assigned-dev.c         |   44 ++++++
 virt/kvm/kvm_main.c             |   38 +++++-
 virt/kvm/msix_mmio.c            |  301 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h            |   25 ++++
 10 files changed, 504 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21..4a390a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -635,6 +635,7 @@ enum emulation_result {
 	EMULATE_DONE,       /* no further processing */
 	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
 	EMULATE_FAIL,         /* can't emulate this instruction */
+	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
 };
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb4..912dca4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
 		/* fall through */
+	case EMULATE_USERSPACE_EXIT:
+		/* fall through */
 	case EMULATE_FAIL:
 		return 0;
 	default:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21b84e2..87308eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 {
 	gpa_t                 gpa;
 	struct kvm_io_ext_data ext_data;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3824,18 +3826,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-	memcpy(vcpu->run->mmio.data, val, bytes);
+	if (r == -ENOTSYNC) {
+		vcpu->userspace_exit_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
+		vcpu->run->msix_routing.dev_id =
+			ext_data.msix_routing.dev_id;
+		vcpu->run->msix_routing.type =
+			ext_data.msix_routing.type;
+		vcpu->run->msix_routing.entry_idx =
+			ext_data.msix_routing.entry_idx;
+		vcpu->run->msix_routing.flags =
+			ext_data.msix_routing.flags;
+	} else  {
+		vcpu->mmio_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MMIO;
+		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+		memcpy(vcpu->run->mmio.data, val, bytes);
+	}
 
 	return X86EMUL_CONTINUE;
 }
@@ -4469,6 +4485,8 @@ done:
 		r = EMULATE_DO_MMIO;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else if (vcpu->userspace_exit_needed)
+		r = EMULATE_USERSPACE_EXIT;
 	else
 		r = EMULATE_DONE;
 
@@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+			vcpu->userspace_exit_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
 			vcpu->mmio_needed = 0;
 		}
+		if (vcpu->userspace_exit_needed) {
+			vcpu->userspace_exit_needed = 0;
+			r = 0;
+			goto out;
+		}
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..4393e4e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,13 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		/* KVM_EXIT_MSIX_ROUTING_UPDATE */
+		struct {
+			__u32 dev_id;
+			__u16 type;
+			__u16 entry_idx;
+			__u64 flags;
+		} msix_routing;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +681,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a32c53e..d6c05e3 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,8 +68,17 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
+#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
 struct kvm_io_ext_data {
 	int type;   /* Extended data type */
+	union {
+		struct {
+			u32 dev_id;	/* Device ID */
+			u16 type;	/* Device type */
+			u16 entry_idx;  /* Accessed MSI-X entry index */
+			u64 flags;
+		} msix_routing;
+	};
 };
 
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
@@ -165,6 +174,8 @@ struct kvm_vcpu {
 	} async_pf;
 #endif
 
+	int userspace_exit_needed;
+
 	struct kvm_vcpu_arch arch;
 };
 
@@ -238,6 +249,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -286,6 +318,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..d1598a6 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+/* The caller should hold kvm->lock */
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	/* For non-MSIX enabled devices, entries_nr == 0 */
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a61f90e..f211e49 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val, struct kvm_io_ext_data *ext_data)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..b2a5f86
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,301 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;
+}
+
+/* Caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;
+		if (addr >= start && addr + len <= end) {
+			r = i;
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	/*TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset, r;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((len != 4 && len != 8) || addr & (len - 1))
+		goto out;
+
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+	if (r)
+		goto out;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val,
+				struct kvm_io_ext_data *ext_data)
+{
+	/*TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	void __user *entry_base;
+	__le32 __user *ctrl_pos;
+	__le32 old_ctrl, new_ctrl;
+
+	mutex_lock(&mmio_dev->kvm->lock);
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((len != 4 && len != 8) || addr & (len - 1))
+		goto out;
+
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE);
+	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* Don't allow writing to other fields when entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
+	ext_data->msix_routing.dev_id = mmio->dev_id;
+	ext_data->msix_routing.type = mmio->type;
+	ext_data->msix_routing.entry_idx = entry;
+	ext_data->msix_routing.flags = 0;
+
+	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
+		ret = -ENOTSYNC;
+		goto out;
+	}
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
+	if (old_ctrl == new_ctrl) {
+		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
+			ret = -ENOTSYNC;
+		goto out;
+	}
+	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
+				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
+	if (r)
+		ret = -ENOTSYNC;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	mutex_unlock(&mmio_dev->kvm->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All reserved currently */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+#ifndef CONFIG_64BIT
+	if (mmio_user->base_va >= 0xffffffff ||
+	    mmio_user->base_addr >= 0xffffffff) {
+		r = -EINVAL;
+		goto out;
+	}
+#endif
+
+	/* Check alignment and accessibility */
+	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
+	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+		mmio_dev->mmio_nr++;
+	}
+
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = -EINVAL, i, j;
+
+	if (!mmio)
+		return 0;
+
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			r = 0;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			mmio_dev->mmio_nr--;
+			break;
+		}
+	}
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,30 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+/* Forward declarations: the prototypes below only need pointer types. */
+struct kvm;
+struct kvm_msix_mmio;
+struct kvm_msix_mmio_user;
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio);
+
+#endif
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 4/4] KVM: Add documents for MSI-X MMIO API
  2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
                   ` (2 preceding siblings ...)
  2011-02-28  7:20 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-02-28  7:20 ` Sheng Yang
  3 siblings, 0 replies; 10+ messages in thread
From: Sheng Yang @ 2011-02-28  7:20 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang


Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 Documentation/kvm/api.txt |   58 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 58 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..dd10c3b 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,53 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API registers the MSI-X MMIO address of a guest device, after which all
+MMIO operations on it are handled by the kernel. When necessary (e.g. when the
+MSI data/address has changed), KVM exits to userspace with
+KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO modification and to require
+userspace to update the IRQ routing table.
+
+NOTICE: Writing to the MSI-X MMIO page after it has been registered with this
+API may be dangerous for a userspace program. Writing while the VM is running
+may cause synchronization issues, so the assigned device cannot work properly.
+Writing is allowed while the VM is not running and can be used as a
+save/restore mechanism.
+
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;		/* Device type and MMIO address type */
+	__u16 max_entries_nr;	/* Maximum entries supported */
+	__u64 base_addr;	/* Guest physical address of MMIO */
+	__u64 base_va;		/* Host virtual address of MMIO mapping */
+	__u64 flags;		/* Reserved for now */
+	__u64 reserved[4];
+};
+
+Current device type can be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+Current MMIO type can be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API unregisters the specified MSI-X MMIO region, identified by the dev_id
+and type fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -1445,6 +1492,17 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+		/* KVM_EXIT_MSIX_ROUTING_UPDATE */
+		struct {
+			__u32 dev_id;
+			__u16 type;
+			__u16 entry_idx;
+			__u64 flags;
+		} msix_routing;
+
+KVM_EXIT_MSIX_ROUTING_UPDATE indicates that an MSI-X entry has been modified
+and that userspace needs to update the corresponding routing table entry.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-28  7:20 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-02-28 11:27   ` Michael S. Tsirkin
  2011-03-01  6:10     ` Sheng Yang
  0 siblings, 1 reply; 10+ messages in thread
From: Michael S. Tsirkin @ 2011-02-28 11:27 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote:
> Then we can support mask bit operation of assigned devices now.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>

A general question: we implement mmio read and write
operation here, but seem to do nothing
about ordering. In particular, pci mmio reads are normally
assumed to flush all writes out to the device
and all device writes in to the CPU.

How exactly does it work here?

> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/kvm/Makefile           |    2 +-
>  arch/x86/kvm/mmu.c              |    2 +
>  arch/x86/kvm/x86.c              |   40 ++++-
>  include/linux/kvm.h             |   28 ++++
>  include/linux/kvm_host.h        |   36 +++++
>  virt/kvm/assigned-dev.c         |   44 ++++++
>  virt/kvm/kvm_main.c             |   38 +++++-
>  virt/kvm/msix_mmio.c            |  301 +++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h            |   25 ++++
>  10 files changed, 504 insertions(+), 13 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index aa75f21..4a390a4 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -635,6 +635,7 @@ enum emulation_result {
>  	EMULATE_DONE,       /* no further processing */
>  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
>  	EMULATE_FAIL,         /* can't emulate this instruction */
> +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
>  };
>  
>  #define EMULTYPE_NO_DECODE	    (1 << 0)
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..3a0d851 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
>  
>  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
>  				coalesced_mmio.o irq_comm.o eventfd.o \
> -				assigned-dev.o)
> +				assigned-dev.o msix_mmio.o)
>  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
>  
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9cafbb4..912dca4 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
>  	case EMULATE_DO_MMIO:
>  		++vcpu->stat.mmio_exits;
>  		/* fall through */
> +	case EMULATE_USERSPACE_EXIT:
> +		/* fall through */
>  	case EMULATE_FAIL:
>  		return 0;
>  	default:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 21b84e2..87308eb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
>  	case KVM_CAP_XSAVE:
>  	case KVM_CAP_ASYNC_PF:
> +	case KVM_CAP_MSIX_MMIO:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  {
>  	gpa_t                 gpa;
>  	struct kvm_io_ext_data ext_data;
> +	int r;
>  
>  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
>  
> @@ -3824,18 +3826,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  
>  mmio:
>  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
>  	/*
>  	 * Is this MMIO handled locally?
>  	 */
> -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> +	if (!r)
>  		return X86EMUL_CONTINUE;
>  
> -	vcpu->mmio_needed = 1;
> -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> -	memcpy(vcpu->run->mmio.data, val, bytes);
> +	if (r == -ENOTSYNC) {

Can you replace -ENOTSYNC with KVM_EXIT_MSIX_ROUTING_UPDATE all over please?

> +		vcpu->userspace_exit_needed = 1;
> +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> +		vcpu->run->msix_routing.dev_id =
> +			ext_data.msix_routing.dev_id;
> +		vcpu->run->msix_routing.type =
> +			ext_data.msix_routing.type;
> +		vcpu->run->msix_routing.entry_idx =
> +			ext_data.msix_routing.entry_idx;
> +		vcpu->run->msix_routing.flags =
> +			ext_data.msix_routing.flags;
> +	} else  {
> +		vcpu->mmio_needed = 1;
> +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> +		memcpy(vcpu->run->mmio.data, val, bytes);
> +	}
>  
>  	return X86EMUL_CONTINUE;
>  }
> @@ -4469,6 +4485,8 @@ done:
>  		r = EMULATE_DO_MMIO;
>  	} else if (r == EMULATION_RESTART)
>  		goto restart;
> +	else if (vcpu->userspace_exit_needed)
> +		r = EMULATE_USERSPACE_EXIT;
>  	else
>  		r = EMULATE_DONE;
>  
> @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  		}
>  	}
>  
> -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> +			vcpu->userspace_exit_needed) {
>  		if (vcpu->mmio_needed) {
>  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
>  			vcpu->mmio_read_completed = 1;
>  			vcpu->mmio_needed = 0;
>  		}
> +		if (vcpu->userspace_exit_needed) {
> +			vcpu->userspace_exit_needed = 0;
> +			r = 0;
> +			goto out;
> +		}
>  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
>  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
>  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..4393e4e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
>  #define KVM_EXIT_NMI              16
>  #define KVM_EXIT_INTERNAL_ERROR   17
>  #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -264,6 +265,13 @@ struct kvm_run {
>  		struct {
>  			__u64 gprs[32];
>  		} osi;
> +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> +		struct {
> +			__u32 dev_id;
> +			__u16 type;
> +			__u16 entry_idx;
> +			__u64 flags;
> +		} msix_routing;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_GET_PVINFO 57
>  #define KVM_CAP_PPC_IRQ_LEVEL 58
>  #define KVM_CAP_ASYNC_PF 59
> +#define KVM_CAP_MSIX_MMIO 60
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -672,6 +681,9 @@ struct kvm_clock_data {
>  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
>  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
>  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> +/* Available with KVM_CAP_MSIX_MMIO */
> +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
>  /* Available with KVM_CAP_PIT_STATE2 */
>  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
>  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
>  	__u16 padding[3];
>  };
>  
> +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> +
> +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> +
> +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> +struct kvm_msix_mmio_user {
> +	__u32 dev_id;
> +	__u16 type;
> +	__u16 max_entries_nr;
> +	__u64 base_addr;
> +	__u64 base_va;
> +	__u64 flags;
> +	__u64 reserved[4];
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index a32c53e..d6c05e3 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -68,8 +68,17 @@ enum kvm_bus {
>  	KVM_NR_BUSES
>  };
>  
> +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
>  struct kvm_io_ext_data {
>  	int type;   /* Extended data type */

type seems unused for now. Do we need it?
I'd guess exit type should usually be sufficient.

> +	union {
> +		struct {
> +			u32 dev_id;	/* Device ID */
> +			u16 type;	/* Device type */
> +			u16 entry_idx;  /* Accessed MSI-X entry index */
> +			u64 flags;
> +		} msix_routing;

So this is only for when we exit on msix routing?

> +	};
>  };
>  
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> @@ -165,6 +174,8 @@ struct kvm_vcpu {
>  	} async_pf;
>  #endif
>  
> +	int userspace_exit_needed;
> +
>  	struct kvm_vcpu_arch arch;
>  };
>  
> @@ -238,6 +249,27 @@ struct kvm_memslots {
>  					KVM_PRIVATE_MEM_SLOTS];
>  };
>  
> +#define KVM_MSIX_MMIO_MAX    32
> +
> +struct kvm_msix_mmio {
> +	u32 dev_id;
> +	u16 type;
> +	u16 max_entries_nr;
> +	u64 flags;
> +	gpa_t table_base_addr;
> +	hva_t table_base_va;

void __user* would make for less casting in code.

Or even

	struct kvm_msix_entry {
		__le32 addr_lo;
		__le32 addr_hi;
		__le32 data;
		__le32 ctrl;
	};

	struct kvm_msix_entry __user *table;

and then we can do table[entry].ctrl
and so on.

> +	gpa_t pba_base_addr;
> +	hva_t pba_base_va;
> +};
> +
> +struct kvm_msix_mmio_dev {
> +	struct kvm *kvm;
> +	struct kvm_io_device table_dev;
> +	int mmio_nr;
> +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> +	struct mutex lock;
> +};
> +
>  struct kvm {
>  	spinlock_t mmu_lock;
>  	raw_spinlock_t requests_lock;
> @@ -286,6 +318,7 @@ struct kvm {
>  	long mmu_notifier_count;
>  #endif
>  	long tlbs_dirty;
> +	struct kvm_msix_mmio_dev msix_mmio_dev;
>  };
>  
>  /* The guest did something we don't support. */
> @@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
>  int kvm_request_irq_source_id(struct kvm *kvm);
>  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
>  
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +			int assigned_dev_id, int entry, bool mask);
> +
>  /* For vcpu->arch.iommu_flags */
>  #define KVM_IOMMU_CACHE_COHERENCY	0x1
>  
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ae72ae6..d1598a6 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -18,6 +18,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
>  #include "irq.h"
> +#include "msix_mmio.h"
>  
>  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
>  						      int assigned_dev_id)
> @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
>  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
>  }
>  
> +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_dev_kernel *adev)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = adev->assigned_dev_id;
> +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +	kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
>  static void kvm_free_assigned_device(struct kvm *kvm,
>  				     struct kvm_assigned_dev_kernel
>  				     *assigned_dev)
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> +
>  	__pci_reset_function(assigned_dev->dev);
>  	pci_restore_state(assigned_dev->dev);
>  
> @@ -785,3 +799,33 @@ out:
>  	return r;
>  }
>  
> +/* The caller should hold kvm->lock */
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;

This is wrong, EFAULT should only be returned on
bad/misaligned address.

Can we check this during setup instead?
And then this check can go away or become
BUG_ON, and the function be void.

> +
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	/* For non-MSIX enabled devices, entries_nr == 0 */
> +	for (i = 0; i < adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}
> +out:
> +	return r;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index a61f90e..f211e49 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -56,6 +56,7 @@
>  
>  #include "coalesced_mmio.h"
>  #include "async_pf.h"
> +#include "msix_mmio.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	struct mm_struct *mm = kvm->mm;
>  
>  	kvm_arch_sync_events(kvm);
> +	kvm_unregister_msix_mmio_dev(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->lock);
>  		break;
>  #endif
> +	case KVM_REGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
> +	case KVM_UNREGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>  		return r;
>  	}
>  #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r < 0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}
> +

Need to fix error handling below as well?
Better do it with chained gotos if yes.

>  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>  	if (r < 0)
>  		kvm_put_kvm(kvm);
> @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
>  {
> -	int i;
> +	int i, r = -EOPNOTSUPP;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> -	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> +	for (i = 0; i < bus->dev_count; i++) {
> +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> +		if (r == -ENOTSYNC)
> +			break;
> +		else if (!r)
>  			return 0;
> -	return -EOPNOTSUPP;
> +	}
> +	return r;
>  }
>  
>  /* kvm_io_bus_read - called under kvm->slots_lock */
> diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> new file mode 100644
> index 0000000..b2a5f86
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.c
> @@ -0,0 +1,301 @@
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +
> +#include "msix_mmio.h"
> +#include "iodev.h"
> +
> +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> +				int entry, u32 flag)
> +{
> +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)

Does something else even happen?
If not - BUG_ON.

> +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> +				mmio->dev_id, entry, flag);

> +	return -EFAULT;

EINVAL or something

> +}
> +
> +/* Caller must hold dev->lock */
> +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> +				gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +	int i, r = -EINVAL;
> +
> +	for (i = 0; i < dev->mmio_nr; i++) {
> +		start = dev->mmio[i].table_base_addr;
> +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> +			dev->mmio[i].max_entries_nr;
> +		if (addr >= start && addr + len <= end) {
> +			r = i;
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	/*TODO: Add big endian support */

likely can be removed.

> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((len != 4 && len != 8) || addr & (len - 1))
> +		goto out;
> +
> +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +

cast above is not needed.

> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val,
> +				struct kvm_io_ext_data *ext_data)
> +{
> +	/*TODO: Add big endian support */

comment above can go I think.

> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	void __user *entry_base;
> +	__le32 __user *ctrl_pos;
> +	__le32 old_ctrl, new_ctrl;
> +
> +	mutex_lock(&mmio_dev->kvm->lock);
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((len != 4 && len != 8) || addr & (len - 1))
> +		goto out;
> +
> +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE);
> +	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* Don't allow writing to other fields when entry is unmasked */
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&

This should be le32_to_cpu(old_ctrl)
I think sparse shall warn about this.

> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))

A cast above is no longer needed

> +		goto out;
> +
> +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> +	ext_data->msix_routing.dev_id = mmio->dev_id;
> +	ext_data->msix_routing.type = mmio->type;
> +	ext_data->msix_routing.entry_idx = entry;
> +	ext_data->msix_routing.flags = 0;

Is this strictly necessary?
Can we fill the data in vcpu->run->msix_routing
directly instead?
Also, on good path we don't need to fill this structure
as there's no exit. Let's only do it then?


> +
> +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> +		ret = -ENOTSYNC;
> +		goto out;
> +	}
> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;
> +
> +	if (old_ctrl == new_ctrl) {
> +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> +			ret = -ENOTSYNC;
> +		goto out;
> +	}
> +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));

So if update_msix_mask_bit returns EFAULT above this will
trigger msix exit? Why?

> +	if (r)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	mutex_unlock(&mmio_dev->kvm->lock);
> +	return ret;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> +	mutex_init(&kvm->msix_mmio_dev.lock);
> +	kvm->msix_mmio_dev.kvm = kvm;
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}

what does the loop above do? Pls add a comment.

> +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> +		r = -EINVAL;

-ENOSPC here as well?

> +		goto out;
> +	}
> +	/* All reserved currently */
> +	if (mmio_user->flags) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +#ifndef CONFIG_64BIT
> +	if (mmio_user->base_va >= 0xffffffff ||
> +	    mmio_user->base_addr >= 0xffffffff) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +#endif

A way to check this without ifdef (long is as pointer in length):
	if ((unsigned long)mmio_user->base_va !=
		mmio_user->base_va)

and for base_va it should be EFAULT I think as with any illegal
virtual address.

> +
> +	/* Check alignment and accessibility */
> +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {

One thing to watch for here is wrap around. AFAIK
access_ok does not check for it, you must do it yourself.

> +		r = -EINVAL;

EFAULT is usually better for an access error.

> +		goto out;
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> +		mmio_dev->mmio_nr++;
> +	}
> +
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)

Pls add a comment documenting what this does.

> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	int r = -EINVAL, i, j;
> +
> +	if (!mmio)
> +		return 0;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> +		    mmio_dev->mmio[i].type == mmio->type) {
> +			r = 0;
> +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> +			mmio_dev->mmio_nr--;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = mmio_user->dev_id;
> +	mmio.type = mmio_user->type;
> +

So you pass in kvm_msix_mmio but only 2 fields are
valid? This is very confusing. Please just pass the
two values as function params to kvm_free_msix_mmio.

> +	return kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
> diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> new file mode 100644
> index 0000000..01b6587
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.h
> @@ -0,0 +1,25 @@
> +#ifndef __KVM_MSIX_MMIO_H__
> +#define __KVM_MSIX_MMIO_H__
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/pci.h>

I don't see where you use this include.
OTOH you do need to declare struct kvm_msix_mmio_user and
struct kvm here.

> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user);
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user);
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> +
> +#endif
> -- 
> 1.7.0.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-28 11:27   ` Michael S. Tsirkin
@ 2011-03-01  6:10     ` Sheng Yang
  2011-03-01 12:20       ` Michael S. Tsirkin
  0 siblings, 1 reply; 10+ messages in thread
From: Sheng Yang @ 2011-03-01  6:10 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Monday 28 February 2011 19:27:29 Michael S. Tsirkin wrote:
> On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote:
> > Then we can support mask bit operation of assigned devices now.
> > 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> 
> A general question: we implement mmio read and write
> operation here, but seem to do nothing
> about ordering. In particular, pci mmio reads are normally
> assumed to flush all writes out to the device
> and all device writes in to the CPU.
> 
> How exactly does it work here?

I don't understand your concern... We emulate all operations here, so where is the
"ordering" issue?

And Michael, thanks for your detailed comments, but could you give all of your
comments at once? I've tried my best to address them, but it still feels like
endless comments keep coming. And I will be leaving Intel this Friday; I want to
get this done before I leave.
> 
> > ---
> > 
> >  arch/x86/include/asm/kvm_host.h |    1 +
> >  arch/x86/kvm/Makefile           |    2 +-
> >  arch/x86/kvm/mmu.c              |    2 +
> >  arch/x86/kvm/x86.c              |   40 ++++-
> >  include/linux/kvm.h             |   28 ++++
> >  include/linux/kvm_host.h        |   36 +++++
> >  virt/kvm/assigned-dev.c         |   44 ++++++
> >  virt/kvm/kvm_main.c             |   38 +++++-
> >  virt/kvm/msix_mmio.c            |  301
> >  +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h           
> >  |   25 ++++
> >  10 files changed, 504 insertions(+), 13 deletions(-)
> >  create mode 100644 virt/kvm/msix_mmio.c
> >  create mode 100644 virt/kvm/msix_mmio.h
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h
> > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -635,6 +635,7 @@ enum emulation_result {
> > 
> >  	EMULATE_DONE,       /* no further processing */
> >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > 
> > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > 
> >  };
> >  
> >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > 
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index f15501f..3a0d851 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > 
> >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> >  
> >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > 
> > -				assigned-dev.o)
> > +				assigned-dev.o msix_mmio.o)
> > 
> >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> >  async_pf.o)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 9cafbb4..912dca4 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t
> > cr2, u32 error_code,
> > 
> >  	case EMULATE_DO_MMIO:
> >  		++vcpu->stat.mmio_exits;
> >  		/* fall through */
> > 
> > +	case EMULATE_USERSPACE_EXIT:
> > +		/* fall through */
> > 
> >  	case EMULATE_FAIL:
> >  		return 0;
> >  	
> >  	default:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 21b84e2..87308eb 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 
> >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> >  	case KVM_CAP_XSAVE:
> > 
> >  	case KVM_CAP_ASYNC_PF:
> > +	case KVM_CAP_MSIX_MMIO:
> >  		r = 1;
> >  		break;
> >  	
> >  	case KVM_CAP_COALESCED_MMIO:
> > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned
> > long addr,
> > 
> >  {
> >  
> >  	gpa_t                 gpa;
> >  	struct kvm_io_ext_data ext_data;
> > 
> > +	int r;
> > 
> >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > 
> > @@ -3824,18 +3826,32 @@ static int
> > emulator_write_emulated_onepage(unsigned long addr,
> > 
> >  mmio:
> >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > 
> > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > 
> >  	/*
> >  	
> >  	 * Is this MMIO handled locally?
> >  	 */
> > 
> > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > +	if (!r)
> > 
> >  		return X86EMUL_CONTINUE;
> > 
> > -	vcpu->mmio_needed = 1;
> > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > +	if (r == -ENOTSYNC) {
> 
> Can you replace -ENOTSYNC with KVM_EXIT_MSIX_ROUTING_UPDATE all over
> please?

How about let Avi/Marcelo decide?
> 
> > +		vcpu->userspace_exit_needed = 1;
> > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > +		vcpu->run->msix_routing.dev_id =
> > +			ext_data.msix_routing.dev_id;
> > +		vcpu->run->msix_routing.type =
> > +			ext_data.msix_routing.type;
> > +		vcpu->run->msix_routing.entry_idx =
> > +			ext_data.msix_routing.entry_idx;
> > +		vcpu->run->msix_routing.flags =
> > +			ext_data.msix_routing.flags;
> > +	} else  {
> > +		vcpu->mmio_needed = 1;
> > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > +	}
> > 
> >  	return X86EMUL_CONTINUE;
> >  
> >  }
> > 
> > @@ -4469,6 +4485,8 @@ done:
> >  		r = EMULATE_DO_MMIO;
> >  	
> >  	} else if (r == EMULATION_RESTART)
> >  	
> >  		goto restart;
> > 
> > +	else if (vcpu->userspace_exit_needed)
> > +		r = EMULATE_USERSPACE_EXIT;
> > 
> >  	else
> >  	
> >  		r = EMULATE_DONE;
> > 
> > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > *vcpu, struct kvm_run *kvm_run)
> > 
> >  		}
> >  	
> >  	}
> > 
> > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > +			vcpu->userspace_exit_needed) {
> > 
> >  		if (vcpu->mmio_needed) {
> >  		
> >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> >  			vcpu->mmio_read_completed = 1;
> >  			vcpu->mmio_needed = 0;
> >  		
> >  		}
> > 
> > +		if (vcpu->userspace_exit_needed) {
> > +			vcpu->userspace_exit_needed = 0;
> > +			r = 0;
> > +			goto out;
> > +		}
> > 
> >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > 
> > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > index ea2dc1a..4393e4e 100644
> > --- a/include/linux/kvm.h
> > +++ b/include/linux/kvm.h
> > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > 
> >  #define KVM_EXIT_NMI              16
> >  #define KVM_EXIT_INTERNAL_ERROR   17
> >  #define KVM_EXIT_OSI              18
> > 
> > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > 
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > 
> > @@ -264,6 +265,13 @@ struct kvm_run {
> > 
> >  		struct {
> >  		
> >  			__u64 gprs[32];
> >  		
> >  		} osi;
> > 
> > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > +		struct {
> > +			__u32 dev_id;
> > +			__u16 type;
> > +			__u16 entry_idx;
> > +			__u64 flags;
> > +		} msix_routing;
> > 
> >  		/* Fix the size of the union. */
> >  		char padding[256];
> >  	
> >  	};
> > 
> > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > 
> >  #define KVM_CAP_PPC_GET_PVINFO 57
> >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> >  #define KVM_CAP_ASYNC_PF 59
> > 
> > +#define KVM_CAP_MSIX_MMIO 60
> > 
> >  #ifdef KVM_CAP_IRQ_ROUTING
> > 
> > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > 
> >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO, 
> >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK            
> >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > 
> > +/* Available with KVM_CAP_MSIX_MMIO */
> > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct
> > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 
> > 0x7e, struct kvm_msix_mmio_user)
> > 
> >  /* Available with KVM_CAP_PIT_STATE2 */
> >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> >  struct kvm_pit_state2)
> > 
> > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > 
> >  	__u16 padding[3];
> >  
> >  };
> > 
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;
> > +	__u16 max_entries_nr;
> > +	__u64 base_addr;
> > +	__u64 base_va;
> > +	__u64 flags;
> > +	__u64 reserved[4];
> > +};
> > +
> > 
> >  #endif /* __LINUX_KVM_H */
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index a32c53e..d6c05e3 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -68,8 +68,17 @@ enum kvm_bus {
> > 
> >  	KVM_NR_BUSES
> >  
> >  };
> > 
> > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > 
> >  struct kvm_io_ext_data {
> >  
> >  	int type;   /* Extended data type */
> 
> type seems unused for now. Do we need it?
> I'd guess exit type should usually be sufficient.

I want to get this structure self-explained. 
> 
> > +	union {
> > +		struct {
> > +			u32 dev_id;	/* Device ID */
> > +			u16 type;	/* Device type */
> > +			u16 entry_idx;  /* Accessed MSI-X entry index */
> > +			u64 flags;
> > +		} msix_routing;
> 
> So this is only for when we exit on msix routing?

Yes for now.
> 
> > +	};
> > 
> >  };
> >  
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > 
> > @@ -165,6 +174,8 @@ struct kvm_vcpu {
> > 
> >  	} async_pf;
> >  
> >  #endif
> > 
> > +	int userspace_exit_needed;
> > +
> > 
> >  	struct kvm_vcpu_arch arch;
> >  
> >  };
> > 
> > @@ -238,6 +249,27 @@ struct kvm_memslots {
> > 
> >  					KVM_PRIVATE_MEM_SLOTS];
> >  
> >  };
> > 
> > +#define KVM_MSIX_MMIO_MAX    32
> > +
> > +struct kvm_msix_mmio {
> > +	u32 dev_id;
> > +	u16 type;
> > +	u16 max_entries_nr;
> > +	u64 flags;
> > +	gpa_t table_base_addr;
> > +	hva_t table_base_va;
> 
> void __user* would make for less casting in code.

OK.
> 
> Or even
> 
> 	struct kvm_msix_entry {
> 		__le32 addr_lo;
> 		__le32 addr_hi;
> 		__le32 data;
> 		__le32 ctrl;
> 	};
> 
> 	struct kvm_msix_entry __user *table;
> 
> and then we can do table[entry].ctrl
> and so on.

It's a userspace address; I think using it in this way may be misleading. Personally
I would prefer to keep the current approach.
> 
> > +	gpa_t pba_base_addr;
> > +	hva_t pba_base_va;
> > +};
> > +
> > +struct kvm_msix_mmio_dev {
> > +	struct kvm *kvm;
> > +	struct kvm_io_device table_dev;
> > +	int mmio_nr;
> > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > +	struct mutex lock;
> > +};
> > +
> > 
> >  struct kvm {
> >  
> >  	spinlock_t mmu_lock;
> >  	raw_spinlock_t requests_lock;
> > 
> > @@ -286,6 +318,7 @@ struct kvm {
> > 
> >  	long mmu_notifier_count;
> >  
> >  #endif
> >  
> >  	long tlbs_dirty;
> > 
> > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > 
> >  };
> >  
> >  /* The guest did something we don't support. */
> > 
> > @@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > 
> >  int kvm_request_irq_source_id(struct kvm *kvm);
> >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > 
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +			int assigned_dev_id, int entry, bool mask);
> > +
> > 
> >  /* For vcpu->arch.iommu_flags */
> >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > 
> > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > index ae72ae6..d1598a6 100644
> > --- a/virt/kvm/assigned-dev.c
> > +++ b/virt/kvm/assigned-dev.c
> > @@ -18,6 +18,7 @@
> > 
> >  #include <linux/interrupt.h>
> >  #include <linux/slab.h>
> >  #include "irq.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> >  list_head *head,
> >  
> >  						      int assigned_dev_id)
> > 
> > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > 
> >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> >  
> >  }
> > 
> > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > +				struct kvm_assigned_dev_kernel *adev)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = adev->assigned_dev_id;
> > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +	kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > 
> >  static void kvm_free_assigned_device(struct kvm *kvm,
> >  
> >  				     struct kvm_assigned_dev_kernel
> >  				     *assigned_dev)
> >  
> >  {
> >  
> >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > 
> > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > +
> > 
> >  	__pci_reset_function(assigned_dev->dev);
> >  	pci_restore_state(assigned_dev->dev);
> > 
> > @@ -785,3 +799,33 @@ out:
> >  	return r;
> >  
> >  }
> > 
> > +/* The caller should hold kvm->lock */
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, bool mask)
> > +{
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> 
> This is wrong, EFAULT should only be returned on
> bad/misaligned address.
> 
> Can we check this during setup instead?
> And then this check can go away or become
> BUG_ON, and the function be void.

I think BUG_ON is OK.
> 
> > +
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > +	for (i = 0; i < adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (mask)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> > +out:
> > +	return r;
> > +}
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index a61f90e..f211e49 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> > 
> >  #include "coalesced_mmio.h"
> >  #include "async_pf.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/kvm.h>
> > 
> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > 
> >  	struct mm_struct *mm = kvm->mm;
> >  	
> >  	kvm_arch_sync_events(kvm);
> > 
> > +	kvm_unregister_msix_mmio_dev(kvm);
> > 
> >  	spin_lock(&kvm_lock);
> >  	list_del(&kvm->vm_list);
> >  	spin_unlock(&kvm_lock);
> > 
> > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > 
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  
> >  #endif
> > 
> > +	case KVM_REGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > 
> >  	default:
> >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >  		if (r == -ENOTTY)
> > 
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > 
> >  		return r;
> >  	
> >  	}
> >  
> >  #endif
> > 
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r < 0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> > +
> 
> Need to fix error handling below as well?
> Better do it with chained gotos if yes.

Let's make it another separate patch. 
> 
> >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> >  	if (r < 0)
> >  	
> >  		kvm_put_kvm(kvm);
> > 
> > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > *bus)
> > 
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  
> >  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> >  
> >  {
> > 
> > -	int i;
> > +	int i, r = -EOPNOTSUPP;
> > 
> >  	struct kvm_io_bus *bus;
> >  	
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > 
> > -	for (i = 0; i < bus->dev_count; i++)
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > +	for (i = 0; i < bus->dev_count; i++) {
> > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > +		if (r == -ENOTSYNC)
> > +			break;
> > +		else if (!r)
> > 
> >  			return 0;
> > 
> > -	return -EOPNOTSUPP;
> > +	}
> > +	return r;
> > 
> >  }
> >  
> >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > 
> > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > new file mode 100644
> > index 0000000..b2a5f86
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.c
> > @@ -0,0 +1,301 @@
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <linux/kvm.h>
> > +
> > +#include "msix_mmio.h"
> > +#include "iodev.h"
> > +
> > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio
> > *mmio, +				int entry, u32 flag)
> > +{
> > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> 
> Does something else even happen?
> If not - BUG_ON.

Not for now; the next user would be your VFIO work? I will use BUG_ON for now.
> 
> > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > +				mmio->dev_id, entry, flag);
> > 
> > +	return -EFAULT;
> 
> EINVAL or something

OK
> 
> > +}
> > +
> > +/* Caller must hold dev->lock */
> > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > +				gpa_t addr, int len)
> > +{
> > +	gpa_t start, end;
> > +	int i, r = -EINVAL;
> > +
> > +	for (i = 0; i < dev->mmio_nr; i++) {
> > +		start = dev->mmio[i].table_base_addr;
> > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > +			dev->mmio[i].max_entries_nr;
> > +		if (addr >= start && addr + len <= end) {
> > +			r = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > int len, +				void *val)
> > +{
> > +	/*TODO: Add big endian support */
> 
> likely can be removed.
> 
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > +		goto out;
> > +
> > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> 
> cast above is not needed.
> 
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val,
> > +				struct kvm_io_ext_data *ext_data)
> > +{
> > +	/*TODO: Add big endian support */
> 
> comment above can go I think.
> 
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	void __user *entry_base;
> > +	__le32 __user *ctrl_pos;
> > +	__le32 old_ctrl, new_ctrl;
> > +
> > +	mutex_lock(&mmio_dev->kvm->lock);
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > +		goto out;
> > +
> > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = (void __user *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE);
> > +	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > +
> > +	if (get_user(old_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	/* Don't allow writing to other fields when entry is unmasked */
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> 
> This should be le32_to_cpu(old_ctrl)
> I think sparse shall warn about this.

Sadly it didn't. 
> 
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> 
> A cast above is no longer needed
> 
> > +		goto out;
> > +
> > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > +	ext_data->msix_routing.type = mmio->type;
> > +	ext_data->msix_routing.entry_idx = entry;
> > +	ext_data->msix_routing.flags = 0;
> 
> Is this strictly necessary?
> Can we fill the data in vcpu->run->msix_routing
> directly instead?
> Also, on good path we don't need to fill this structure
> as there's no exit. Let's only do it then?

Where could you get the "entry_idx"?
> 
> > +
> > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > +		ret = -ENOTSYNC;
> > +		goto out;
> > +	}
> > +
> > +	if (get_user(new_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	if (old_ctrl == new_ctrl) {
> > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > +			ret = -ENOTSYNC;
> > +		goto out;
> > +	}
> > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> 
> So if update_msix_mask_bit returns EFAULT above this will
> trigger msix exit? Why?

This means the kernel didn't handle the mask bit, so we need to let userspace handle
it -- for example, to enable the vectors, or for other cases.

> 
> > +	if (r)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	mutex_unlock(&mmio_dev->kvm->lock);
> > +	return ret;
> > +}
> > +
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > +	kvm->msix_mmio_dev.kvm = kvm;
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> 
> what does the loop above do? Pls add a comment.

It finds the existing MMIO registration; I think the code is clear here.
> 
> > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > +		r = -EINVAL;
> 
> -ENOSPC here as well?

OK
> 
> > +		goto out;
> > +	}
> > +	/* All reserved currently */
> > +	if (mmio_user->flags) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +#ifndef CONFIG_64BIT
> > +	if (mmio_user->base_va >= 0xffffffff ||
> > +	    mmio_user->base_addr >= 0xffffffff) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +#endif
> 
> A way to check this without ifdef (long is as pointer in length):
> 	if ((unsigned long)mmio_user->base_va !=
> 		mmio_user->base_va)
> 
> and for base_va it should be EFAULT I think as with any illegal
> virtual address.

OK.
> 
> > +
> > +	/* Check alignment and accessibility */
> > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> 
> One thing to watch for here is wrap around. AFAIK
> access_ok does not check for it, you must do it yourself.

I am curious -- if that is the case, should every access_ok call be accompanied by a wrap-around check?

> 
> > +		r = -EINVAL;
> 
> EFAULT is usually better for an access error.

OK.
> 
> > +		goto out;
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > +		mmio_dev->mmio_nr++;
> > +	}
> > +
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> 
> Pls add a comment documenting what this does.

You mean the code is too confusing here? I don't think so.
> 
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	int r = -EINVAL, i, j;
> > +
> > +	if (!mmio)
> > +		return 0;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > +		    mmio_dev->mmio[i].type == mmio->type) {
> > +			r = 0;
> > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > +			mmio_dev->mmio_nr--;
> > +			break;
> > +		}
> > +	}
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = mmio_user->dev_id;
> > +	mmio.type = mmio_user->type;
> > +
> 
> So you pass in kvm_msix_mmio but only 2 fields are
> valid? This is very confusing. Please just pass the
> two values as function params to kvm_free_msix_mmio.

OK.
> 
> > +	return kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > new file mode 100644
> > index 0000000..01b6587
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.h
> > @@ -0,0 +1,25 @@
> > +#ifndef __KVM_MSIX_MMIO_H__
> > +#define __KVM_MSIX_MMIO_H__
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/pci.h>
> 
> I don't see where you use this include.
> OTOH you do need to declare struct kvm_msix_mmio_user and
> struct kvm here.

Yes.

--
regards
Yang, Sheng

> 
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > *mmio_user); +
> > +#endif

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-03-01  6:10     ` Sheng Yang
@ 2011-03-01 12:20       ` Michael S. Tsirkin
  2011-03-01 12:37         ` Sheng Yang
  2011-03-01 13:01         ` Sheng Yang
  0 siblings, 2 replies; 10+ messages in thread
From: Michael S. Tsirkin @ 2011-03-01 12:20 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Tue, Mar 01, 2011 at 02:10:37PM +0800, Sheng Yang wrote:
> On Monday 28 February 2011 19:27:29 Michael S. Tsirkin wrote:
> > On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote:
> > > Then we can support mask bit operation of assigned devices now.
> > > 
> > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > 
> > A general question: we implement mmio read and write
> > operation here, but seem to do nothing
> > about ordering. In particular, pci mmio reads are normally
> > assumed to flush all writes out to the device
> > and all device writes in to the CPU.
> > 
> > How exactly does it work here?
> 
> I don't understand your problem... We emulate all operation here, where is 
> "ordering" issue?
> 
> And Michael, thanks for your detailed comments, but could you give comments all at 
> once? Now I've tried my best to address comments, but still feeling endless 
> comments are coming.

Hmm, sorry about that. At least some of the comments are in
the new code, so I could not have commented on it earlier ...
E.g. the ext_data thing only appeared in the latest version
or the one before that. In some cases such as the non-standard
error codes used, I don't always udnerstand the logic, as
the error handling is switched to standard conventions
so comments appear as the logic becomes apparent. This
applies to EFAULT handling below.

> And I would leave Intel this Friday, I want to get it done 
> before I leave.

Permanently?
I think it would be helpful, in that case, if you publish some
testing data detailing how best to test the patch,
which hardware etc.  We will then do our best to carry on, fix up
the remaining nits if any, test and apply the patch.

> > 
> > > ---
> > > 
> > >  arch/x86/include/asm/kvm_host.h |    1 +
> > >  arch/x86/kvm/Makefile           |    2 +-
> > >  arch/x86/kvm/mmu.c              |    2 +
> > >  arch/x86/kvm/x86.c              |   40 ++++-
> > >  include/linux/kvm.h             |   28 ++++
> > >  include/linux/kvm_host.h        |   36 +++++
> > >  virt/kvm/assigned-dev.c         |   44 ++++++
> > >  virt/kvm/kvm_main.c             |   38 +++++-
> > >  virt/kvm/msix_mmio.c            |  301
> > >  +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h           
> > >  |   25 ++++
> > >  10 files changed, 504 insertions(+), 13 deletions(-)
> > >  create mode 100644 virt/kvm/msix_mmio.c
> > >  create mode 100644 virt/kvm/msix_mmio.h
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h
> > > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -635,6 +635,7 @@ enum emulation_result {
> > > 
> > >  	EMULATE_DONE,       /* no further processing */
> > >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> > >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > > 
> > > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > > 
> > >  };
> > >  
> > >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > > 
> > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > index f15501f..3a0d851 100644
> > > --- a/arch/x86/kvm/Makefile
> > > +++ b/arch/x86/kvm/Makefile
> > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > 
> > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > >  
> > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > 
> > > -				assigned-dev.o)
> > > +				assigned-dev.o msix_mmio.o)
> > > 
> > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > >  async_pf.o)
> > > 
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index 9cafbb4..912dca4 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t
> > > cr2, u32 error_code,
> > > 
> > >  	case EMULATE_DO_MMIO:
> > >  		++vcpu->stat.mmio_exits;
> > >  		/* fall through */
> > > 
> > > +	case EMULATE_USERSPACE_EXIT:
> > > +		/* fall through */
> > > 
> > >  	case EMULATE_FAIL:
> > >  		return 0;
> > >  	
> > >  	default:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 21b84e2..87308eb 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > 
> > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > >  	case KVM_CAP_XSAVE:
> > > 
> > >  	case KVM_CAP_ASYNC_PF:
> > > +	case KVM_CAP_MSIX_MMIO:
> > >  		r = 1;
> > >  		break;
> > >  	
> > >  	case KVM_CAP_COALESCED_MMIO:
> > > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned
> > > long addr,
> > > 
> > >  {
> > >  
> > >  	gpa_t                 gpa;
> > >  	struct kvm_io_ext_data ext_data;
> > > 
> > > +	int r;
> > > 
> > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > 
> > > @@ -3824,18 +3826,32 @@ static int
> > > emulator_write_emulated_onepage(unsigned long addr,
> > > 
> > >  mmio:
> > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > 
> > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > > 
> > >  	/*
> > >  	
> > >  	 * Is this MMIO handled locally?
> > >  	 */
> > > 
> > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > > +	if (!r)
> > > 
> > >  		return X86EMUL_CONTINUE;
> > > 
> > > -	vcpu->mmio_needed = 1;
> > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > > +	if (r == -ENOTSYNC) {
> > 
> > Can you replace -ENOTSYNC with KVM_EXIT_MSIX_ROUTING_UPDATE all over
> > please?
> 
> How about let Avi/Marcelo decide?

They are the ones that decide anyway :)
It's prettier though, if you have the time it'd be nice to
fix it up directly.

> > 
> > > +		vcpu->userspace_exit_needed = 1;
> > > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > > +		vcpu->run->msix_routing.dev_id =
> > > +			ext_data.msix_routing.dev_id;
> > > +		vcpu->run->msix_routing.type =
> > > +			ext_data.msix_routing.type;
> > > +		vcpu->run->msix_routing.entry_idx =
> > > +			ext_data.msix_routing.entry_idx;
> > > +		vcpu->run->msix_routing.flags =
> > > +			ext_data.msix_routing.flags;
> > > +	} else  {
> > > +		vcpu->mmio_needed = 1;
> > > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > > +	}
> > > 
> > >  	return X86EMUL_CONTINUE;
> > >  
> > >  }
> > > 
> > > @@ -4469,6 +4485,8 @@ done:
> > >  		r = EMULATE_DO_MMIO;
> > >  	
> > >  	} else if (r == EMULATION_RESTART)
> > >  	
> > >  		goto restart;
> > > 
> > > +	else if (vcpu->userspace_exit_needed)
> > > +		r = EMULATE_USERSPACE_EXIT;
> > > 
> > >  	else
> > >  	
> > >  		r = EMULATE_DONE;
> > > 
> > > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > > *vcpu, struct kvm_run *kvm_run)
> > > 
> > >  		}
> > >  	
> > >  	}
> > > 
> > > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > > +			vcpu->userspace_exit_needed) {
> > > 
> > >  		if (vcpu->mmio_needed) {
> > >  		
> > >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> > >  			vcpu->mmio_read_completed = 1;
> > >  			vcpu->mmio_needed = 0;
> > >  		
> > >  		}
> > > 
> > > +		if (vcpu->userspace_exit_needed) {
> > > +			vcpu->userspace_exit_needed = 0;
> > > +			r = 0;
> > > +			goto out;
> > > +		}
> > > 
> > >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> > >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > > 
> > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > index ea2dc1a..4393e4e 100644
> > > --- a/include/linux/kvm.h
> > > +++ b/include/linux/kvm.h
> > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > 
> > >  #define KVM_EXIT_NMI              16
> > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > >  #define KVM_EXIT_OSI              18
> > > 
> > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > 
> > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > 
> > > @@ -264,6 +265,13 @@ struct kvm_run {
> > > 
> > >  		struct {
> > >  		
> > >  			__u64 gprs[32];
> > >  		
> > >  		} osi;
> > > 
> > > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > > +		struct {
> > > +			__u32 dev_id;
> > > +			__u16 type;
> > > +			__u16 entry_idx;
> > > +			__u64 flags;
> > > +		} msix_routing;
> > > 
> > >  		/* Fix the size of the union. */
> > >  		char padding[256];
> > >  	
> > >  	};
> > > 
> > > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > > 
> > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > >  #define KVM_CAP_ASYNC_PF 59
> > > 
> > > +#define KVM_CAP_MSIX_MMIO 60
> > > 
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > 
> > > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > > 
> > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> > >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO, 
> > >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK            
> > >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > 
> > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct
> > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 
> > > 0x7e, struct kvm_msix_mmio_user)
> > > 
> > >  /* Available with KVM_CAP_PIT_STATE2 */
> > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> > >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> > >  struct kvm_pit_state2)
> > > 
> > > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > > 
> > >  	__u16 padding[3];
> > >  
> > >  };
> > > 
> > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > +struct kvm_msix_mmio_user {
> > > +	__u32 dev_id;
> > > +	__u16 type;
> > > +	__u16 max_entries_nr;
> > > +	__u64 base_addr;
> > > +	__u64 base_va;
> > > +	__u64 flags;
> > > +	__u64 reserved[4];
> > > +};
> > > +
> > > 
> > >  #endif /* __LINUX_KVM_H */
> > > 
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index a32c53e..d6c05e3 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -68,8 +68,17 @@ enum kvm_bus {
> > > 
> > >  	KVM_NR_BUSES
> > >  
> > >  };
> > > 
> > > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > > 
> > >  struct kvm_io_ext_data {
> > >  
> > >  	int type;   /* Extended data type */
> > 
> > type seems unused for now. Do we need it?
> > I'd guess exit type should usually be sufficient.
> 
> I want to get this structure self-explained. 
> > 
> > > +	union {
> > > +		struct {
> > > +			u32 dev_id;	/* Device ID */
> > > +			u16 type;	/* Device type */
> > > +			u16 entry_idx;  /* Accessed MSI-X entry index */
> > > +			u64 flags;
> > > +		} msix_routing;
> > 
> > So this is only for when we exit on msix routing?
> 
> Yes for now.
> > 
> > > +	};
> > > 
> > >  };
> > >  
> > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > > 
> > > @@ -165,6 +174,8 @@ struct kvm_vcpu {
> > > 
> > >  	} async_pf;
> > >  
> > >  #endif
> > > 
> > > +	int userspace_exit_needed;
> > > +
> > > 
> > >  	struct kvm_vcpu_arch arch;
> > >  
> > >  };
> > > 
> > > @@ -238,6 +249,27 @@ struct kvm_memslots {
> > > 
> > >  					KVM_PRIVATE_MEM_SLOTS];
> > >  
> > >  };
> > > 
> > > +#define KVM_MSIX_MMIO_MAX    32
> > > +
> > > +struct kvm_msix_mmio {
> > > +	u32 dev_id;
> > > +	u16 type;
> > > +	u16 max_entries_nr;
> > > +	u64 flags;
> > > +	gpa_t table_base_addr;
> > > +	hva_t table_base_va;
> > 
> > void __user* would make for less casting in code.
> 
> OK.
> > 
> > Or even
> > 
> > 	struct kvm_msix_entry {
> > 		__le32 addr_lo;
> > 		__le32 addr_hi;
> > 		__le32 data;
> > 		__le32 ctrl;
> > 	};
> > 
> > 	struct kvm_msix_entry __user *table;
> > 
> > and then we can do table[entry].ctrl
> > and so on.
> 
> It's a userspace address, I think use it in this way maybe misleading. Personally 
> I want to keep the current way.
> > 
> > > +	gpa_t pba_base_addr;
> > > +	hva_t pba_base_va;
> > > +};
> > > +
> > > +struct kvm_msix_mmio_dev {
> > > +	struct kvm *kvm;
> > > +	struct kvm_io_device table_dev;
> > > +	int mmio_nr;
> > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > +	struct mutex lock;
> > > +};
> > > +
> > > 
> > >  struct kvm {
> > >  
> > >  	spinlock_t mmu_lock;
> > >  	raw_spinlock_t requests_lock;
> > > 
> > > @@ -286,6 +318,7 @@ struct kvm {
> > > 
> > >  	long mmu_notifier_count;
> > >  
> > >  #endif
> > >  
> > >  	long tlbs_dirty;
> > > 
> > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > 
> > >  };
> > >  
> > >  /* The guest did something we don't support. */
> > > 
> > > @@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > > 
> > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > 
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +			int assigned_dev_id, int entry, bool mask);
> > > +
> > > 
> > >  /* For vcpu->arch.iommu_flags */
> > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > 
> > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > index ae72ae6..d1598a6 100644
> > > --- a/virt/kvm/assigned-dev.c
> > > +++ b/virt/kvm/assigned-dev.c
> > > @@ -18,6 +18,7 @@
> > > 
> > >  #include <linux/interrupt.h>
> > >  #include <linux/slab.h>
> > >  #include "irq.h"
> > > 
> > > +#include "msix_mmio.h"
> > > 
> > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > >  list_head *head,
> > >  
> > >  						      int assigned_dev_id)
> > > 
> > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > > 
> > >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> > >  
> > >  }
> > > 
> > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > +				struct kvm_assigned_dev_kernel *adev)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = adev->assigned_dev_id;
> > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > +}
> > > +
> > > 
> > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > >  
> > >  				     struct kvm_assigned_dev_kernel
> > >  				     *assigned_dev)
> > >  
> > >  {
> > >  
> > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > 
> > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > +
> > > 
> > >  	__pci_reset_function(assigned_dev->dev);
> > >  	pci_restore_state(assigned_dev->dev);
> > > 
> > > @@ -785,3 +799,33 @@ out:
> > >  	return r;
> > >  
> > >  }
> > > 
> > > +/* The caller should hold kvm->lock */
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +				int assigned_dev_id, int entry, bool mask)
> > > +{
> > > +	int r = -EFAULT;
> > > +	struct kvm_assigned_dev_kernel *adev;
> > > +	int i;
> > > +
> > > +	if (!irqchip_in_kernel(kvm))
> > > +		return r;
> > 
> > This is wrong, EFAULT should only be returned on
> > bad/misaligned address.
> > 
> > Can we check this during setup instead?
> > And then this check can go away or become
> > BUG_ON, and the function be void.
> 
> I think BUG_ON is OK.
> > 
> > > +
> > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > +				      assigned_dev_id);
> > > +	if (!adev)
> > > +		goto out;
> > > +
> > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > +	for (i = 0; i < adev->entries_nr; i++)
> > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > +			if (mask)
> > > +				disable_irq_nosync(
> > > +					adev->host_msix_entries[i].vector);
> > > +			else
> > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > +			r = 0;
> > > +			break;
> > > +		}
> > > +out:
> > > +	return r;
> > > +}
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index a61f90e..f211e49 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -56,6 +56,7 @@
> > > 
> > >  #include "coalesced_mmio.h"
> > >  #include "async_pf.h"
> > > 
> > > +#include "msix_mmio.h"
> > > 
> > >  #define CREATE_TRACE_POINTS
> > >  #include <trace/events/kvm.h>
> > > 
> > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > 
> > >  	struct mm_struct *mm = kvm->mm;
> > >  	
> > >  	kvm_arch_sync_events(kvm);
> > > 
> > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > 
> > >  	spin_lock(&kvm_lock);
> > >  	list_del(&kvm->vm_list);
> > >  	spin_unlock(&kvm_lock);
> > > 
> > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > 
> > >  		mutex_unlock(&kvm->lock);
> > >  		break;
> > >  
> > >  #endif
> > > 
> > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > +		struct kvm_msix_mmio_user mmio_user;
> > > +
> > > +		r = -EFAULT;
> > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > +			goto out;
> > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > +		break;
> > > +	}
> > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > +		struct kvm_msix_mmio_user mmio_user;
> > > +
> > > +		r = -EFAULT;
> > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > +			goto out;
> > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > +		break;
> > > +	}
> > > 
> > >  	default:
> > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > >  		if (r == -ENOTTY)
> > > 
> > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > 
> > >  		return r;
> > >  	
> > >  	}
> > >  
> > >  #endif
> > > 
> > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > +	if (r < 0) {
> > > +		kvm_put_kvm(kvm);
> > > +		return r;
> > > +	}
> > > +
> > 
> > Need to fix error handling below as well?
> > Better do it with chained gotos if yes.
> 
> Let's make it another separate patch. 

Well, you add a new failure mode, so you need to clean up
properly ...

> > 
> > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > >  	if (r < 0)
> > >  	
> > >  		kvm_put_kvm(kvm);
> > > 
> > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > > *bus)
> > > 
> > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > >  
> > >  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> > >  
> > >  {
> > > 
> > > -	int i;
> > > +	int i, r = -EOPNOTSUPP;
> > > 
> > >  	struct kvm_io_bus *bus;
> > >  	
> > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > 
> > > -	for (i = 0; i < bus->dev_count; i++)
> > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > > +	for (i = 0; i < bus->dev_count; i++) {
> > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > > +		if (r == -ENOTSYNC)
> > > +			break;
> > > +		else if (!r)
> > > 
> > >  			return 0;
> > > 
> > > -	return -EOPNOTSUPP;
> > > +	}
> > > +	return r;
> > > 
> > >  }
> > >  
> > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > 
> > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > new file mode 100644
> > > index 0000000..b2a5f86
> > > --- /dev/null
> > > +++ b/virt/kvm/msix_mmio.c
> > > @@ -0,0 +1,301 @@
> > > +/*
> > > + * MSI-X MMIO emulation
> > > + *
> > > + * Copyright (c) 2010 Intel Corporation
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + *
> > > + * Author:
> > > + *   Sheng Yang <sheng.yang@intel.com>
> > > + */
> > > +
> > > +#include <linux/kvm_host.h>
> > > +#include <linux/kvm.h>
> > > +
> > > +#include "msix_mmio.h"
> > > +#include "iodev.h"
> > > +
> > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio
> > > *mmio, +				int entry, u32 flag)
> > > +{
> > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > 
> > Does something else even happen?
> > If not - BUG_ON.
> 
> Not for now, the next would be your vfio? Would use BUG_ON for now.
> > 
> > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > +				mmio->dev_id, entry, flag);
> > > 
> > > +	return -EFAULT;
> > 
> > EINVAL or something
> 
> OK
> > 
> > > +}
> > > +
> > > +/* Caller must hold dev->lock */
> > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > +				gpa_t addr, int len)
> > > +{
> > > +	gpa_t start, end;
> > > +	int i, r = -EINVAL;
> > > +
> > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > +		start = dev->mmio[i].table_base_addr;
> > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > +			dev->mmio[i].max_entries_nr;
> > > +		if (addr >= start && addr + len <= end) {
> > > +			r = i;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	return r;
> > > +}
> > > +
> > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > > int len, +				void *val)
> > > +{
> > > +	/*TODO: Add big endian support */
> > 
> > likely can be removed.
> > 
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, ret = 0, entry, offset, r;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > > +		goto out;
> > > +
> > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > 
> > cast above is not needed.
> > 
> > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > +	if (r)
> > > +		goto out;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > +				int len, const void *val,
> > > +				struct kvm_io_ext_data *ext_data)
> > > +{
> > > +	/*TODO: Add big endian support */
> > 
> > comment above can go I think.
> > 
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, entry, offset, ret = 0, r = 0;
> > > +	void __user *entry_base;
> > > +	__le32 __user *ctrl_pos;
> > > +	__le32 old_ctrl, new_ctrl;
> > > +
> > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > > +		goto out;
> > > +
> > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = (void __user *)(mmio->table_base_va +
> > > +			entry * PCI_MSIX_ENTRY_SIZE);
> > > +	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > > +
> > > +	if (get_user(old_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	/* Don't allow writing to other fields when entry is unmasked */
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > 
> > This should be le32_to_cpu(old_ctrl)
> > I think sparse shall warn about this.
> 
> Sadly it didn't. 
> > 
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > 
> > A cast above is no longer needed
> > 
> > > +		goto out;
> > > +
> > > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > > +	ext_data->msix_routing.type = mmio->type;
> > > +	ext_data->msix_routing.entry_idx = entry;
> > > +	ext_data->msix_routing.flags = 0;
> > 
> > Is this strictly necessary?
> > Can we fill the data in vcpu->run->msix_routing
> > directly instead?
> > Also, on good path we don't need to fill this structure
> > as there's no exit. Let's only do it then?
> 
> Where could you get the "entry_idx"?

It's only used to pass the exit info to userspace, isn't that right?

> > 
> > > +
> > > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > > +		ret = -ENOTSYNC;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if (get_user(new_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	if (old_ctrl == new_ctrl) {
> > > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > > +			ret = -ENOTSYNC;
> > > +		goto out;
> > > +	}
> > > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > 
> > So if update_msix_mask_bit returns EFAULT above this will
> > trigger msix exit? Why?
> 
> This means kernel didn't handle the mask bit, so we need let userspace handle it - 
> for enable the vectors, or others.

I think the problem is the misuse of EFAULT here.
EFAULT should be for bad userspace addresses only.
Do you use EFAULT instead of ENOTSUP, or something?

> > 
> > > +	if (r)
> > > +		ret = -ENOTSYNC;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > +	.read     = msix_table_mmio_read,
> > > +	.write    = msix_table_mmio_write,
> > > +};
> > > +
> > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > +{
> > > +	int ret;
> > > +
> > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > +	mutex_lock(&kvm->slots_lock);
> > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > +				      &kvm->msix_mmio_dev.table_dev);
> > > +	mutex_unlock(&kvm->slots_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > +{
> > > +	int ret;
> > > +
> > > +	mutex_lock(&kvm->slots_lock);
> > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > +				      &kvm->msix_mmio_dev.table_dev);
> > > +	mutex_unlock(&kvm->slots_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > +	struct kvm_msix_mmio *mmio = NULL;
> > > +	int r = 0, i;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > +			mmio = &mmio_dev->mmio[i];
> > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > +				r = -EINVAL;
> > > +				goto out;
> > > +			}
> > > +			break;
> > > +		}
> > > +	}
> > 
> > what does the loop above do? Pls add a comment.
> 
> Find the existed MMIO, I think code is clear here.
> > 
> > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > +		r = -EINVAL;
> > 
> > -ENOSPC here as well?
> 
> OK
> > 
> > > +		goto out;
> > > +	}
> > > +	/* All reserved currently */
> > > +	if (mmio_user->flags) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +#ifndef CONFIG_64BIT
> > > +	if (mmio_user->base_va >= 0xffffffff ||
> > > +	    mmio_user->base_addr >= 0xffffffff) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +#endif
> > 
> > A way to check this without ifdef (long is as pointer in length):
> > 	if ((unsigned long)mmio_user->base_va !=
> > 		mmio_user->base_va)
> > 
> > and for base_va it should be EFAULT I think as with any illegal
> > virtual address.
> 
> OK.
> > 
> > > +
> > > +	/* Check alignment and accessibility */
> > > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > 
> > One thing to watch for here is wrap around. AFAIK
> > access_ok does not check for it, you must do it yourself.
> 
> I am curious that if so, every access_ok should go with a wrap around checking?
> 
> > 
> > > +		r = -EINVAL;
> > 
> > EFAULT is usually better for an access error.
> 
> OK.
> > 
> > > +		goto out;
> > > +	}
> > > +	if (!mmio) {
> > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > +			r = -ENOSPC;
> > > +			goto out;
> > > +		}
> > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > +		mmio_dev->mmio_nr++;
> > > +	}
> > > +
> > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > +	mmio->dev_id = mmio_user->dev_id;
> > > +	mmio->flags = mmio_user->flags;
> > > +
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > +		mmio->table_base_va = mmio_user->base_va;
> > > +	}
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > 
> > Pls add a comment documenting what this does.
> 
> You mean code is too confusing here? I don't think so.
> > 
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > +	int r = -EINVAL, i, j;
> > > +
> > > +	if (!mmio)
> > > +		return 0;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > +			r = 0;
> > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > +			mmio_dev->mmio_nr--;
> > > +			break;
> > > +		}
> > > +	}
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = mmio_user->dev_id;
> > > +	mmio.type = mmio_user->type;
> > > +
> > 
> > So you pass in kvm_msix_mmio but only 2 fields are
> > valid? This is very confusing. Please just pass the
> > two values as function params to kvm_free_msix_mmio.
> 
> OK.
> > 
> > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > +}
> > > +
> > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > new file mode 100644
> > > index 0000000..01b6587
> > > --- /dev/null
> > > +++ b/virt/kvm/msix_mmio.h
> > > @@ -0,0 +1,25 @@
> > > +#ifndef __KVM_MSIX_MMIO_H__
> > > +#define __KVM_MSIX_MMIO_H__
> > > +/*
> > > + * MSI-X MMIO emulation
> > > + *
> > > + * Copyright (c) 2010 Intel Corporation
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + *
> > > + * Author:
> > > + *   Sheng Yang <sheng.yang@intel.com>
> > > + */
> > > +
> > > +#include <linux/pci.h>
> > 
> > I don't see where you use this include.
> > OTOH you do need to declare struct kvm_msix_mmio_user and
> > struct kvm here.
> 
> Yes.
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > +
> > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > > *mmio_user); +
> > > +#endif

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-03-01 12:20       ` Michael S. Tsirkin
@ 2011-03-01 12:37         ` Sheng Yang
  2011-03-01 13:01         ` Sheng Yang
  1 sibling, 0 replies; 10+ messages in thread
From: Sheng Yang @ 2011-03-01 12:37 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Tue, Mar 01, 2011 at 02:20:02PM +0200, Michael S. Tsirkin wrote:
> On Tue, Mar 01, 2011 at 02:10:37PM +0800, Sheng Yang wrote:
> > On Monday 28 February 2011 19:27:29 Michael S. Tsirkin wrote:
> > > On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote:
> > > > Then we can support mask bit operation of assigned devices now.
> > > > 
> > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > 
> > > A general question: we implement mmio read and write
> > > operation here, but seem to do nothing
> > > about ordering. In particular, pci mmio reads are normally
> > > assumed to flush all writes out to the device
> > > and all device writes in to the CPU.
> > > 
> > > How exactly does it work here?
> > 
> > I don't understand your problem... We emulate all operation here, where is 
> > "ordering" issue?
> > 
> > And Michael, thanks for you detail comments, but could you give comments all at 
> > once? Now I've tried my best to address comments, but still feeling endless 
> > comments are coming.
> 
> Hmm, sorry about that. At least some of the comments are in
> the new code, so I could not have commented on it earlier ...
> E.g. the ext_data thing only appeared in the latest version
> or the one before that. In some cases such as the non-standard
> error codes used, I don't always understand the logic, as
> the error handling is switched to standard conventions
> so comments appear as the logic becomes apparent. This
> applies to EFAULT handling below.
> 
> > And I would leave Intel this Friday, I want to get it done 
> > before I leave.
> 
> Permanently?
> I think it would be helpful, in that case, if you publish some
> testing data detailing how best to test the patch,
> which hardware etc.  We will then do my best to carry on, fix up
> the remaining nits if any, test and apply the patch.

Yes. I would relocate to California, and work for another company starting next
month.

Since the patch is already v11 now, if there is no big logic issue, I really
want to get it done before I leave. So I would still try my best to address
all comments and get it checked in ASAP.
> 
> > > 
> > > > ---
> > > > 
> > > >  arch/x86/include/asm/kvm_host.h |    1 +
> > > >  arch/x86/kvm/Makefile           |    2 +-
> > > >  arch/x86/kvm/mmu.c              |    2 +
> > > >  arch/x86/kvm/x86.c              |   40 ++++-
> > > >  include/linux/kvm.h             |   28 ++++
> > > >  include/linux/kvm_host.h        |   36 +++++
> > > >  virt/kvm/assigned-dev.c         |   44 ++++++
> > > >  virt/kvm/kvm_main.c             |   38 +++++-
> > > >  virt/kvm/msix_mmio.c            |  301
> > > >  +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h           
> > > >  |   25 ++++
> > > >  10 files changed, 504 insertions(+), 13 deletions(-)
> > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > 
> > > > diff --git a/arch/x86/include/asm/kvm_host.h
> > > > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -635,6 +635,7 @@ enum emulation_result {
> > > > 
> > > >  	EMULATE_DONE,       /* no further processing */
> > > >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> > > >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > > > 
> > > > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > > > 
> > > >  };
> > > >  
> > > >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > > > 
> > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > index f15501f..3a0d851 100644
> > > > --- a/arch/x86/kvm/Makefile
> > > > +++ b/arch/x86/kvm/Makefile
> > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > 
> > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > > >  
> > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > 
> > > > -				assigned-dev.o)
> > > > +				assigned-dev.o msix_mmio.o)
> > > > 
> > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > > >  async_pf.o)
> > > > 
> > > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > > index 9cafbb4..912dca4 100644
> > > > --- a/arch/x86/kvm/mmu.c
> > > > +++ b/arch/x86/kvm/mmu.c
> > > > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t
> > > > cr2, u32 error_code,
> > > > 
> > > >  	case EMULATE_DO_MMIO:
> > > >  		++vcpu->stat.mmio_exits;
> > > >  		/* fall through */
> > > > 
> > > > +	case EMULATE_USERSPACE_EXIT:
> > > > +		/* fall through */
> > > > 
> > > >  	case EMULATE_FAIL:
> > > >  		return 0;
> > > >  	
> > > >  	default:
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index 21b84e2..87308eb 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > 
> > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > >  	case KVM_CAP_XSAVE:
> > > > 
> > > >  	case KVM_CAP_ASYNC_PF:
> > > > +	case KVM_CAP_MSIX_MMIO:
> > > >  		r = 1;
> > > >  		break;
> > > >  	
> > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned
> > > > long addr,
> > > > 
> > > >  {
> > > >  
> > > >  	gpa_t                 gpa;
> > > >  	struct kvm_io_ext_data ext_data;
> > > > 
> > > > +	int r;
> > > > 
> > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > 
> > > > @@ -3824,18 +3826,32 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  mmio:
> > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > 
> > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > > > 
> > > >  	/*
> > > >  	
> > > >  	 * Is this MMIO handled locally?
> > > >  	 */
> > > > 
> > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > > > +	if (!r)
> > > > 
> > > >  		return X86EMUL_CONTINUE;
> > > > 
> > > > -	vcpu->mmio_needed = 1;
> > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > > > +	if (r == -ENOTSYNC) {
> > > 
> > > Can you replace -ENOTSYNC with KVM_EXIT_MSIX_ROUTING_UPDATE all over
> > > please?
> > 
> > How about let Avi/Marcelo decide?
> 
> They are the ones that decide anyway :)
> It's prettier though, if you have the time it'd be nice to
> fix it up directly.

OK, I would work out another version.

> 
> > > 
> > > > +		vcpu->userspace_exit_needed = 1;
> > > > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > > > +		vcpu->run->msix_routing.dev_id =
> > > > +			ext_data.msix_routing.dev_id;
> > > > +		vcpu->run->msix_routing.type =
> > > > +			ext_data.msix_routing.type;
> > > > +		vcpu->run->msix_routing.entry_idx =
> > > > +			ext_data.msix_routing.entry_idx;
> > > > +		vcpu->run->msix_routing.flags =
> > > > +			ext_data.msix_routing.flags;
> > > > +	} else  {
> > > > +		vcpu->mmio_needed = 1;
> > > > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > > > +	}
> > > > 
> > > >  	return X86EMUL_CONTINUE;
> > > >  
> > > >  }
> > > > 
> > > > @@ -4469,6 +4485,8 @@ done:
> > > >  		r = EMULATE_DO_MMIO;
> > > >  	
> > > >  	} else if (r == EMULATION_RESTART)
> > > >  	
> > > >  		goto restart;
> > > > 
> > > > +	else if (vcpu->userspace_exit_needed)
> > > > +		r = EMULATE_USERSPACE_EXIT;
> > > > 
> > > >  	else
> > > >  	
> > > >  		r = EMULATE_DONE;
> > > > 
> > > > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > > > *vcpu, struct kvm_run *kvm_run)
> > > > 
> > > >  		}
> > > >  	
> > > >  	}
> > > > 
> > > > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > > > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > > > +			vcpu->userspace_exit_needed) {
> > > > 
> > > >  		if (vcpu->mmio_needed) {
> > > >  		
> > > >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> > > >  			vcpu->mmio_read_completed = 1;
> > > >  			vcpu->mmio_needed = 0;
> > > >  		
> > > >  		}
> > > > 
> > > > +		if (vcpu->userspace_exit_needed) {
> > > > +			vcpu->userspace_exit_needed = 0;
> > > > +			r = 0;
> > > > +			goto out;
> > > > +		}
> > > > 
> > > >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> > > >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > > >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > > > 
> > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > index ea2dc1a..4393e4e 100644
> > > > --- a/include/linux/kvm.h
> > > > +++ b/include/linux/kvm.h
> > > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > > 
> > > >  #define KVM_EXIT_NMI              16
> > > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > > >  #define KVM_EXIT_OSI              18
> > > > 
> > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > > 
> > > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > > 
> > > > @@ -264,6 +265,13 @@ struct kvm_run {
> > > > 
> > > >  		struct {
> > > >  		
> > > >  			__u64 gprs[32];
> > > >  		
> > > >  		} osi;
> > > > 
> > > > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > > > +		struct {
> > > > +			__u32 dev_id;
> > > > +			__u16 type;
> > > > +			__u16 entry_idx;
> > > > +			__u64 flags;
> > > > +		} msix_routing;
> > > > 
> > > >  		/* Fix the size of the union. */
> > > >  		char padding[256];
> > > >  	
> > > >  	};
> > > > 
> > > > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > > > 
> > > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > > >  #define KVM_CAP_ASYNC_PF 59
> > > > 
> > > > +#define KVM_CAP_MSIX_MMIO 60
> > > > 
> > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > 
> > > > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > > > 
> > > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> > > >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO, 
> > > >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK            
> > > >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > > 
> > > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct
> > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 
> > > > 0x7e, struct kvm_msix_mmio_user)
> > > > 
> > > >  /* Available with KVM_CAP_PIT_STATE2 */
> > > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> > > >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> > > >  struct kvm_pit_state2)
> > > > 
> > > > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > > > 
> > > >  	__u16 padding[3];
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > +struct kvm_msix_mmio_user {
> > > > +	__u32 dev_id;
> > > > +	__u16 type;
> > > > +	__u16 max_entries_nr;
> > > > +	__u64 base_addr;
> > > > +	__u64 base_va;
> > > > +	__u64 flags;
> > > > +	__u64 reserved[4];
> > > > +};
> > > > +
> > > > 
> > > >  #endif /* __LINUX_KVM_H */
> > > > 
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index a32c53e..d6c05e3 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -68,8 +68,17 @@ enum kvm_bus {
> > > > 
> > > >  	KVM_NR_BUSES
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > > > 
> > > >  struct kvm_io_ext_data {
> > > >  
> > > >  	int type;   /* Extended data type */
> > > 
> > > type seems unused for now. Do we need it?
> > > I'd guess exit type should usually be sufficient.
> > 
> > I want to get this structure self-explained. 
> > > 
> > > > +	union {
> > > > +		struct {
> > > > +			u32 dev_id;	/* Device ID */
> > > > +			u16 type;	/* Device type */
> > > > +			u16 entry_idx;  /* Accessed MSI-X entry index */
> > > > +			u64 flags;
> > > > +		} msix_routing;
> > > 
> > > So this is only for when we exit on msix routing?
> > 
> > Yes for now.
> > > 
> > > > +	};
> > > > 
> > > >  };
> > > >  
> > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > > > 
> > > > @@ -165,6 +174,8 @@ struct kvm_vcpu {
> > > > 
> > > >  	} async_pf;
> > > >  
> > > >  #endif
> > > > 
> > > > +	int userspace_exit_needed;
> > > > +
> > > > 
> > > >  	struct kvm_vcpu_arch arch;
> > > >  
> > > >  };
> > > > 
> > > > @@ -238,6 +249,27 @@ struct kvm_memslots {
> > > > 
> > > >  					KVM_PRIVATE_MEM_SLOTS];
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_MSIX_MMIO_MAX    32
> > > > +
> > > > +struct kvm_msix_mmio {
> > > > +	u32 dev_id;
> > > > +	u16 type;
> > > > +	u16 max_entries_nr;
> > > > +	u64 flags;
> > > > +	gpa_t table_base_addr;
> > > > +	hva_t table_base_va;
> > > 
> > > void __user* would make for less casting in code.
> > 
> > OK.
> > > 
> > > Or even
> > > 
> > > 	struct kvm_msix_entry {
> > > 		__le32 addr_lo;
> > > 		__le32 addr_hi;
> > > 		__le32 data;
> > > 		__le32 ctrl;
> > > 	};
> > > 
> > > 	struct kvm_msix_entry __user *table;
> > > 
> > > and then we can do table[entry].ctrl
> > > and so on.
> > 
> > It's a userspace address, I think use it in this way maybe misleading. Personally 
> > I want to keep the current way.
> > > 
> > > > +	gpa_t pba_base_addr;
> > > > +	hva_t pba_base_va;
> > > > +};
> > > > +
> > > > +struct kvm_msix_mmio_dev {
> > > > +	struct kvm *kvm;
> > > > +	struct kvm_io_device table_dev;
> > > > +	int mmio_nr;
> > > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > > +	struct mutex lock;
> > > > +};
> > > > +
> > > > 
> > > >  struct kvm {
> > > >  
> > > >  	spinlock_t mmu_lock;
> > > >  	raw_spinlock_t requests_lock;
> > > > 
> > > > @@ -286,6 +318,7 @@ struct kvm {
> > > > 
> > > >  	long mmu_notifier_count;
> > > >  
> > > >  #endif
> > > >  
> > > >  	long tlbs_dirty;
> > > > 
> > > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > > 
> > > >  };
> > > >  
> > > >  /* The guest did something we don't support. */
> > > > 
> > > > @@ -558,6 +591,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > > > 
> > > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > > 
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +			int assigned_dev_id, int entry, bool mask);
> > > > +
> > > > 
> > > >  /* For vcpu->arch.iommu_flags */
> > > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > > 
> > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > > index ae72ae6..d1598a6 100644
> > > > --- a/virt/kvm/assigned-dev.c
> > > > +++ b/virt/kvm/assigned-dev.c
> > > > @@ -18,6 +18,7 @@
> > > > 
> > > >  #include <linux/interrupt.h>
> > > >  #include <linux/slab.h>
> > > >  #include "irq.h"
> > > > 
> > > > +#include "msix_mmio.h"
> > > > 
> > > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > > >  list_head *head,
> > > >  
> > > >  						      int assigned_dev_id)
> > > > 
> > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > > > 
> > > >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> > > >  
> > > >  }
> > > > 
> > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > > +				struct kvm_assigned_dev_kernel *adev)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = adev->assigned_dev_id;
> > > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > > +}
> > > > +
> > > > 
> > > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > > >  
> > > >  				     struct kvm_assigned_dev_kernel
> > > >  				     *assigned_dev)
> > > >  
> > > >  {
> > > >  
> > > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > > 
> > > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > > +
> > > > 
> > > >  	__pci_reset_function(assigned_dev->dev);
> > > >  	pci_restore_state(assigned_dev->dev);
> > > > 
> > > > @@ -785,3 +799,33 @@ out:
> > > >  	return r;
> > > >  
> > > >  }
> > > > 
> > > > +/* The caller should hold kvm->lock */
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +				int assigned_dev_id, int entry, bool mask)
> > > > +{
> > > > +	int r = -EFAULT;
> > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > +	int i;
> > > > +
> > > > +	if (!irqchip_in_kernel(kvm))
> > > > +		return r;
> > > 
> > > This is wrong, EFAULT should only be returned on
> > > bad/misaligned address.
> > > 
> > > Can we check this during setup instead?
> > > And then this check can go away or become
> > > BUG_ON, and the function be void.
> > 
> > I think BUG_ON is OK.
> > > 
> > > > +
> > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > +				      assigned_dev_id);
> > > > +	if (!adev)
> > > > +		goto out;
> > > > +
> > > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > > +	for (i = 0; i < adev->entries_nr; i++)
> > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > +			if (mask)
> > > > +				disable_irq_nosync(
> > > > +					adev->host_msix_entries[i].vector);
> > > > +			else
> > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > +			r = 0;
> > > > +			break;
> > > > +		}
> > > > +out:
> > > > +	return r;
> > > > +}
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index a61f90e..f211e49 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -56,6 +56,7 @@
> > > > 
> > > >  #include "coalesced_mmio.h"
> > > >  #include "async_pf.h"
> > > > 
> > > > +#include "msix_mmio.h"
> > > > 
> > > >  #define CREATE_TRACE_POINTS
> > > >  #include <trace/events/kvm.h>
> > > > 
> > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > > 
> > > >  	struct mm_struct *mm = kvm->mm;
> > > >  	
> > > >  	kvm_arch_sync_events(kvm);
> > > > 
> > > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > > 
> > > >  	spin_lock(&kvm_lock);
> > > >  	list_del(&kvm->vm_list);
> > > >  	spin_unlock(&kvm_lock);
> > > > 
> > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > > 
> > > >  		mutex_unlock(&kvm->lock);
> > > >  		break;
> > > >  
> > > >  #endif
> > > > 
> > > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > 
> > > >  	default:
> > > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > > >  		if (r == -ENOTTY)
> > > > 
> > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > 
> > > >  		return r;
> > > >  	
> > > >  	}
> > > >  
> > > >  #endif
> > > > 
> > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > +	if (r < 0) {
> > > > +		kvm_put_kvm(kvm);
> > > > +		return r;
> > > > +	}
> > > > +
> > > 
> > > Need to fix error handling below as well?
> > > Better do it with chained gotos if yes.
> > 
> > Let's make it another separate patch. 
> 
> Well you add a new failure mode, you need to cleanup
> properly ...
> 
> > > 
> > > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > > >  	if (r < 0)
> > > >  	
> > > >  		kvm_put_kvm(kvm);
> > > > 
> > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > > > *bus)
> > > > 
> > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > > >  
> > > >  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> > > >  
> > > >  {
> > > > 
> > > > -	int i;
> > > > +	int i, r = -EOPNOTSUPP;
> > > > 
> > > >  	struct kvm_io_bus *bus;
> > > >  	
> > > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > > 
> > > > -	for (i = 0; i < bus->dev_count; i++)
> > > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > > > +	for (i = 0; i < bus->dev_count; i++) {
> > > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > > > +		if (r == -ENOTSYNC)
> > > > +			break;
> > > > +		else if (!r)
> > > > 
> > > >  			return 0;
> > > > 
> > > > -	return -EOPNOTSUPP;
> > > > +	}
> > > > +	return r;
> > > > 
> > > >  }
> > > >  
> > > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > > 
> > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > > new file mode 100644
> > > > index 0000000..b2a5f86
> > > > --- /dev/null
> > > > +++ b/virt/kvm/msix_mmio.c
> > > > @@ -0,0 +1,301 @@
> > > > +/*
> > > > + * MSI-X MMIO emulation
> > > > + *
> > > > + * Copyright (c) 2010 Intel Corporation
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > > + * the COPYING file in the top-level directory.
> > > > + *
> > > > + * Author:
> > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/kvm_host.h>
> > > > +#include <linux/kvm.h>
> > > > +
> > > > +#include "msix_mmio.h"
> > > > +#include "iodev.h"
> > > > +
> > > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio
> > > > *mmio, +				int entry, u32 flag)
> > > > +{
> > > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > 
> > > Does something else even happen?
> > > If not - BUG_ON.
> > 
> > Not for now, the next would be your vfio? Would use BUG_ON for now.
> > > 
> > > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > > +				mmio->dev_id, entry, flag);
> > > > 
> > > > +	return -EFAULT;
> > > 
> > > EINVAL or something
> > 
> > OK
> > > 
> > > > +}
> > > > +
> > > > +/* Caller must hold dev->lock */
> > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > > +				gpa_t addr, int len)
> > > > +{
> > > > +	gpa_t start, end;
> > > > +	int i, r = -EINVAL;
> > > > +
> > > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > > +		start = dev->mmio[i].table_base_addr;
> > > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > > +			dev->mmio[i].max_entries_nr;
> > > > +		if (addr >= start && addr + len <= end) {
> > > > +			r = i;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	return r;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > > > int len, +				void *val)
> > > > +{
> > > > +	/*TODO: Add big endian support */
> > > 
> > > likely can be removed.
> > > 
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, ret = 0, entry, offset, r;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > > > +		goto out;
> > > > +
> > > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > 
> > > cast above is not needed.
> > > 
> > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > +	if (r)
> > > > +		goto out;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > > +				int len, const void *val,
> > > > +				struct kvm_io_ext_data *ext_data)
> > > > +{
> > > > +	/*TODO: Add big endian support */
> > > 
> > > comment above can go I think.
> > > 
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > +	void __user *entry_base;
> > > > +	__le32 __user *ctrl_pos;
> > > > +	__le32 old_ctrl, new_ctrl;
> > > > +
> > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((len != 4 && len != 8) || addr & (len - 1))
> > > > +		goto out;
> > > > +
> > > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = (void __user *)(mmio->table_base_va +
> > > > +			entry * PCI_MSIX_ENTRY_SIZE);
> > > > +	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
> > > > +
> > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	/* Don't allow writing to other fields when entry is unmasked */
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > 
> > > This should be le32_to_cpu(old_ctrl)
> > > I think sparse shall warn about this.
> > 
> > Sadly it didn't. 
> > > 
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > 
> > > A cast above is no longer needed
> > > 
> > > > +		goto out;
> > > > +
> > > > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > > > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > > > +	ext_data->msix_routing.type = mmio->type;
> > > > +	ext_data->msix_routing.entry_idx = entry;
> > > > +	ext_data->msix_routing.flags = 0;
> > > 
> > > Is this strictly necessary?
> > > Can we fill the data in vcpu->run->msix_routing
> > > directly instead?
> > > Also, on good path we don't need to fill this structure
> > > as there's no exit. Let's only do it then?
> > 
> > Where could you get the "entry_idx"?
> 
> It's only used to pass the exit info to userspace, isn't that right?

Yes. That's the reason we need this, suggested by Avi.
> 
> > > 
> > > > +
> > > > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > > > +		ret = -ENOTSYNC;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	if (old_ctrl == new_ctrl) {
> > > > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > > > +			ret = -ENOTSYNC;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > > > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > > 
> > > So if update_msix_mask_bit returns EFAULT above this will
> > > trigger msix exit? Why?
> > 
> > This means kernel didn't handle the mask bit, so we need let userspace handle it - 
> > for enable the vectors, or others.
> 
> I think the problem is the misuse of EFAULT here.
> EFAULT should be for bad userspace addresses only.
> Do you use EFAULT instead ENOTSUP, or something?

ENOTSUPP is OK.

-- 
regards
Yang, Sheng

> 
> > > 
> > > > +	if (r)
> > > > +		ret = -ENOTSYNC;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > +	.read     = msix_table_mmio_read,
> > > > +	.write    = msix_table_mmio_write,
> > > > +};
> > > > +
> > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > > > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > +	int r = 0, i;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > +			mmio = &mmio_dev->mmio[i];
> > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > +				r = -EINVAL;
> > > > +				goto out;
> > > > +			}
> > > > +			break;
> > > > +		}
> > > > +	}
> > > 
> > > what does the loop above do? Pls add a comment.
> > 
> > Find the existed MMIO, I think code is clear here.
> > > 
> > > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > > +		r = -EINVAL;
> > > 
> > > -ENOSPC here as well?
> > 
> > OK
> > > 
> > > > +		goto out;
> > > > +	}
> > > > +	/* All reserved currently */
> > > > +	if (mmio_user->flags) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +#ifndef CONFIG_64BIT
> > > > +	if (mmio_user->base_va >= 0xffffffff ||
> > > > +	    mmio_user->base_addr >= 0xffffffff) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +#endif
> > > 
> > > A way to check this without ifdef (long is as pointer in length):
> > > 	if ((unsigned long)mmio_user->base_va !=
> > > 		mmio_user->base_va)
> > > 
> > > and for base_va it should be EFAULT I think as with any illegal
> > > virtual address.
> > 
> > OK.
> > > 
> > > > +
> > > > +	/* Check alignment and accessibility */
> > > > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > > > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> > > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > 
> > > One thing to watch for here is wrap around. AFAIK
> > > access_ok does not check for it, you must do it yourself.
> > 
> > I am curious that if so, every access_ok should go with a wrap around checking?
> > 
> > > 
> > > > +		r = -EINVAL;
> > > 
> > > EFAULT is usually better for an access error.
> > 
> > OK.
> > > 
> > > > +		goto out;
> > > > +	}
> > > > +	if (!mmio) {
> > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > +			r = -ENOSPC;
> > > > +			goto out;
> > > > +		}
> > > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > +		mmio_dev->mmio_nr++;
> > > > +	}
> > > > +
> > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > +	mmio->flags = mmio_user->flags;
> > > > +
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > +	}
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > 
> > > Pls add a comment documenting what this does.
> > 
> > You mean the code is too confusing here? I don't think so.
> > > 
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > +	int r = -EINVAL, i, j;
> > > > +
> > > > +	if (!mmio)
> > > > +		return 0;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > > +			r = 0;
> > > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > > +			mmio_dev->mmio_nr--;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > +	mmio.type = mmio_user->type;
> > > > +
> > > 
> > > So you pass in kvm_msix_mmio but only 2 fields are
> > > valid? This is very confusing. Please just pass the
> > > two values as function params to kvm_free_msix_mmio.
> > 
> > OK.
> > > 
> > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > +}
> > > > +
> > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > > new file mode 100644
> > > > index 0000000..01b6587
> > > > --- /dev/null
> > > > +++ b/virt/kvm/msix_mmio.h
> > > > @@ -0,0 +1,25 @@
> > > > +#ifndef __KVM_MSIX_MMIO_H__
> > > > +#define __KVM_MSIX_MMIO_H__
> > > > +/*
> > > > + * MSI-X MMIO emulation
> > > > + *
> > > > + * Copyright (c) 2010 Intel Corporation
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > > + * the COPYING file in the top-level directory.
> > > > + *
> > > > + * Author:
> > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/pci.h>
> > > 
> > > I don't see where you use this include.
> > > OTOH you do need to declare struct kvm_msix_mmio_user and
> > > struct kvm here.
> > 
> > Yes.
> > 
> > --
> > regards
> > Yang, Sheng
> > 
> > > 
> > > > +
> > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > > > *mmio_user); +
> > > > +#endif


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-03-01 12:20       ` Michael S. Tsirkin
  2011-03-01 12:37         ` Sheng Yang
@ 2011-03-01 13:01         ` Sheng Yang
  1 sibling, 0 replies; 10+ messages in thread
From: Sheng Yang @ 2011-03-01 13:01 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Tue, Mar 01, 2011 at 02:20:02PM +0200, Michael S. Tsirkin wrote:
> On Tue, Mar 01, 2011 at 02:10:37PM +0800, Sheng Yang wrote:
> > On Monday 28 February 2011 19:27:29 Michael S. Tsirkin wrote:
> > > On Mon, Feb 28, 2011 at 03:20:04PM +0800, Sheng Yang wrote:
> > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > > 
> > > >  		mutex_unlock(&kvm->lock);
> > > >  		break;
> > > >  
> > > >  #endif
> > > > 
> > > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > 
> > > >  	default:
> > > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > > >  		if (r == -ENOTTY)
> > > > 
> > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > 
> > > >  		return r;
> > > >  	
> > > >  	}
> > > >  
> > > >  #endif
> > > > 
> > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > +	if (r < 0) {
> > > > +		kvm_put_kvm(kvm);
> > > > +		return r;
> > > > +	}
> > > > +
> > > 
> > > Need to fix error handling below as well?
> > > Better do it with chained gotos if yes.
> > 
> > Let's make it another separate patch. 
> 
> Well you add a new failure mode, you need to cleanup
> properly ...

Oh, I think kvm_put_kvm() should take care of this?

-- 
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-03-01 12:58 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
2011-02-28  7:20 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
2011-02-28  7:20 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
2011-02-28  7:20 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
2011-02-28 11:27   ` Michael S. Tsirkin
2011-03-01  6:10     ` Sheng Yang
2011-03-01 12:20       ` Michael S. Tsirkin
2011-03-01 12:37         ` Sheng Yang
2011-03-01 13:01         ` Sheng Yang
2011-02-28  7:20 ` [PATCH 4/4] KVM: Add documents for MSI-X MMIO API Sheng Yang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.